<a href="https://colab.research.google.com/github/yeagernolan19/Final_project/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Column -> dtype mapping handed to pd.read_csv. Columns are listed group by
# group; the resulting key order matches the order they are written in.
dtypes = {
    # numeric columns containing corrupted values (kept as 'object')
    **dict.fromkeys(
        [
            'Age',
            'Num_of_Loan',
            'Num_of_Delayed_Payment',
            'Annual_Income',
            'Outstanding_Debt',
            'Amount_invested_monthly',
            'Monthly_Balance',
        ],
        'object',
    ),
    # numeric columns with discrete values (nullable 16-bit integers)
    **dict.fromkeys(
        ['Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate'],
        pd.Int16Dtype(),
    ),
    # numeric columns with continuous values (nullable 32-bit floats)
    **dict.fromkeys(
        [
            'Delay_from_due_date',
            'Monthly_Inhand_Salary',
            'Changed_Credit_Limit',
            'Num_Credit_Inquiries',
            'Credit_Utilization_Ratio',
            'Total_EMI_per_month',
        ],
        pd.Float32Dtype(),
    ),
    # categorical columns
    **dict.fromkeys(
        ['ID', 'Customer_ID', 'Month', 'Name', 'Occupation', 'Payment_of_Min_Amount'],
        'category',
    ),
    # columns that need further processing
    **dict.fromkeys(
        ['Payment_Behaviour', 'Credit_History_Age', 'SSN', 'Type_of_Loan'],
        'string',
    ),
}

# Tokens that pd.read_csv should read as missing values.
missing_values = [
    '_______',
    '_',
    '!@9#%8',
    '#F%$D@*&8',
]



In [None]:
# Read the training CSV with the declared column dtypes, treating the listed
# marker tokens as missing values, then display the frame.
df_train = pd.read_csv(
    'Resources/train.csv',
    dtype=dtypes,
    na_values=missing_values,
)
df_train

In [None]:
# Capture df_train's current column labels and display them.
col_names=df_train.columns
col_names

In [None]:
# Columns removed from the training frame before any further processing.
columns_to_drop = ["Name", "Age", "Month", "Occupation", 'ID', 'SSN', 'Customer_ID']
# errors='ignore' keeps the cell re-runnable: on a second execution the labels
# are already gone and a plain drop would raise KeyError.
df_train.drop(columns=columns_to_drop, inplace=True, errors='ignore')
df_train

In [None]:
def odd_value(col):
    """Clean one text column of ``df_train`` in place.

    Strips leading and trailing underscores from every value, then replaces
    the tokens ``'!@9#%8'``, ``''`` and ``'nan'`` with a missing value.

    Parameters
    ----------
    col : str
        Label of the column in the global ``df_train`` to clean.
    """
    df_train[col] = df_train[col].str.strip('_')
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0 and
    # raises AttributeError there.
    df_train[col] = df_train[col].replace(['!@9#%8', '', 'nan'], np.nan)


# Clean every column reported by select_dtypes(include='O').
cols = list(df_train.select_dtypes(include='O').columns)
# A plain loop avoids echoing a list of None values as the cell output.
for col in cols:
    odd_value(col)

In [None]:
def change_dtype(col,dtype):
    """Cast column ``col`` of the global ``df_train`` to ``dtype`` and store it back."""
    converted = df_train[col].astype(dtype)
    df_train[col] = converted


# Columns that are cast to float64 below.
required_features = [
    'Annual_Income',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Amount_invested_monthly',
    'Monthly_Balance',
]
for col in required_features:
    change_dtype(col, dtype='float64')

# Summarise the resulting dtypes and non-null counts.
df_train.info()

In [None]:
# Show the column labels df_train holds at this point.
print(df_train.columns)

In [None]:
# Columns whose values are replaced by their relative frequency.
encoding_variables = ['Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']


def frequency_encoding(df_train, column):
    """Add a ``<column>_freq_enc`` column to ``df_train``.

    Each value of ``column`` is mapped to the share of rows holding that value,
    as computed by ``value_counts(normalize=True)`` on the same frame.
    """
    shares = df_train[column].value_counts(normalize=True)
    df_train[column + '_freq_enc'] = df_train[column].map(shares.to_dict())


# First pass: create the helper columns for every encoded variable.
for column in encoding_variables:
    frequency_encoding(df_train, column)

# Second pass: move each helper column over its source column, which removes
# the helper from the frame in the same step.
for column in encoding_variables:
    df_train[column] = df_train.pop(column + '_freq_enc')
df_train

In [None]:
# Remove Type_of_Loan when it is present; otherwise report that it is absent.
if 'Type_of_Loan' not in df_train.columns:
    print("Column 'Type_of_Loan' not found.")
else:
    df_train.drop(['Type_of_Loan'], axis='columns', inplace=True)

In [None]:
# Remove the unused Credit_History_Age column *before* discarding incomplete
# rows, so a row is not thrown away merely because this column is missing.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df_train.drop(columns=['Credit_History_Age'], inplace=True, errors='ignore')
df_train.dropna(inplace=True)

In [None]:
# Target is Credit_Score; every other remaining column is a feature.
y = df_train['Credit_Score']
X = df_train.drop(columns=['Credit_Score'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling statistics come from the training portion only and are reused for
# the hold-out portion.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier (fit returns the estimator itself) and score the hold-out rows.
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Read the test CSV with the same dtype mapping and missing-value tokens used
# for the training file, then display the frame.
df_test = pd.read_csv(
    'Resources/test.csv',
    dtype=dtypes,
    na_values=missing_values,
)
df_test

In [None]:
# Columns removed from the test frame before any further processing.
columns_to_drop = ["Name", "Age", "Month", "Occupation", 'ID', 'SSN', 'Customer_ID']
# errors='ignore' keeps the cell re-runnable: on a second execution the labels
# are already gone and a plain drop would raise KeyError.
df_test.drop(columns=columns_to_drop, inplace=True, errors='ignore')
df_test

In [None]:
def odd_value(col):
    """Clean one text column of ``df_test`` in place.

    Strips leading and trailing underscores from every value, then replaces
    the tokens ``'!@9#%8'``, ``''`` and ``'nan'`` with a missing value.

    Parameters
    ----------
    col : str
        Label of the column in the global ``df_test`` to clean.
    """
    df_test[col] = df_test[col].str.strip('_')
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0 and
    # raises AttributeError there.
    df_test[col] = df_test[col].replace(['!@9#%8', '', 'nan'], np.nan)


# Clean every column reported by select_dtypes(include='O').
cols = list(df_test.select_dtypes(include='O').columns)
# A plain loop avoids echoing a list of None values as the cell output.
for col in cols:
    odd_value(col)

In [None]:
def change_dtype(col,dtype):
    """Cast column ``col`` of the global ``df_test`` to ``dtype`` and store it back.

    The source must be ``df_test`` itself: reading the column from ``df_train``
    would overwrite the test features with index-aligned training values.
    """
    df_test[col] = df_test[col].astype(dtype)


# Columns that are cast to float64 below.
required_features = [
    'Annual_Income',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Amount_invested_monthly',
    'Monthly_Balance',
]
for col in required_features:
    change_dtype(col, 'float64')

# Summarise the resulting dtypes and non-null counts.
df_test.info()

In [None]:
# Remove the columns that are never used as features *before* discarding
# incomplete rows, so a row is not thrown away merely because one of them is
# missing. Type_of_Loan is included because the training frame also drops it
# before its dropna. errors='ignore' keeps the cell safe to re-run.
df_test.drop(
    columns=['Credit_History_Age', 'Type_of_Loan'],
    inplace=True,
    errors='ignore',
)
df_test.dropna(inplace=True)
df_test

In [None]:
# Columns whose values are replaced by their relative frequency.
encoding_variables = ['Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']


def frequency_encoding(df_test, column):
    """Add a ``<column>_freq_enc`` column to ``df_test``.

    Each value of ``column`` is mapped to the share of rows holding that value,
    as computed by ``value_counts(normalize=True)`` on the same frame.
    """
    shares = df_test[column].value_counts(normalize=True)
    df_test[column + '_freq_enc'] = df_test[column].map(shares.to_dict())


# NOTE(review): the frequencies are computed from df_test itself, so the encoded
# values can differ from those the model saw during training — confirm whether
# the training-set frequencies should be reused here instead.
for column in encoding_variables:
    frequency_encoding(df_test, column)

# Move each helper column over its source column, which removes the helper
# from the frame in the same step.
for column in encoding_variables:
    df_test[column] = df_test.pop(column + '_freq_enc')
df_test

In [None]:
# Show the column labels df_test holds at this point.
print(df_test.columns)

In [None]:
# Display the current state of df_test.
df_test

In [None]:
# Remove Type_of_Loan when present; with errors='ignore' a missing label is a no-op.
df_test.drop(columns=['Type_of_Loan'], inplace=True, errors='ignore')

In [None]:
# Add a Credit_Score column filled with 0 to df_test, then display the frame.
# NOTE(review): presumably a placeholder to be overwritten with predictions — confirm.
df_test['Credit_Score'] = 0
df_test

In [None]:
# Score the rows of df_test. The features must come from df_test itself — not
# from X_test, which is the hold-out slice of the training data and has a
# different number of rows. Selecting X.columns gives the scaler the same
# feature columns, in the same order, that it was fitted on (and leaves out the
# Credit_Score column present on df_test).
test_features = df_test[X.columns]
X_test_scaled = scaler.transform(test_features)
predicted_scores = model.predict(X_test_scaled)
df_test_copy = df_test.copy().reset_index(drop=True)
if len(predicted_scores) == len(df_test_copy):
    numeric_scores = pd.to_numeric(pd.Series(predicted_scores), errors='coerce')
    if numeric_scores.isna().all():
        # NOTE(review): the label type of Credit_Score is not visible in this
        # cell. When the classifier predicts non-numeric class names, coercing
        # them to numbers would turn every prediction into NaN, so the predicted
        # label is used directly as the category.
        df_test_copy['Credit_Score'] = predicted_scores
        df_test_copy['Credit_Category'] = df_test_copy['Credit_Score']
    else:
        # Numeric predictions: bucket them by the 25th / 75th percentiles.
        df_test_copy['Credit_Score'] = numeric_scores
        df_test_copy = df_test_copy.dropna(subset=['Credit_Score'])
        threshold_poor = df_test_copy['Credit_Score'].quantile(0.25)
        threshold_good = df_test_copy['Credit_Score'].quantile(0.75)

        def label_category(score):
            """Map a numeric score to 'Poor', 'Standard' or 'Good' by the two thresholds."""
            if score < threshold_poor:
                return 'Poor'
            elif threshold_poor <= score < threshold_good:
                return 'Standard'
            else:
                return 'Good'
        df_test_copy['Credit_Category'] = df_test_copy['Credit_Score'].apply(label_category)
else:
    print("Length of 'predicted_scores' does not match the length of 'df_test_copy'.")

In [None]:
# Print the shapes of the scaled feature matrix and of the prediction array.
print("X_test_scaled shape:",X_test_scaled.shape)
print("predicted_scores shape:", predicted_scores.shape)

In [None]:
# Let the notebook render the first rows as a table instead of printing the
# plain-text repr.
df_test_copy.head()

In [None]:
# Score the rows of df_test. The features must come from df_test itself — not
# from X_test, which is the hold-out slice of the training data and has a
# different number of rows. Selecting X.columns gives the scaler the same
# feature columns, in the same order, that it was fitted on.
test_features = df_test[X.columns]
X_test_scaled = scaler.transform(test_features)
predicted_scores = model.predict(X_test_scaled)
df_test_copy = df_test.copy().reset_index(drop=True)
if len(predicted_scores) == len(df_test_copy):
    numeric_scores = pd.to_numeric(pd.Series(predicted_scores), errors='coerce')
    if numeric_scores.notna().all():
        # Numeric predictions: store them as integers and bucket them by the
        # 25th / 75th percentiles.
        predicted_scores = numeric_scores.to_numpy().astype(int)
        df_test_copy['Credit_Score'] = predicted_scores
        threshold_poor = df_test_copy['Credit_Score'].quantile(0.25)
        threshold_good = df_test_copy['Credit_Score'].quantile(0.75)

        def label_category(score):
            """Map a numeric score to 'Poor', 'Standard' or 'Good' by the two thresholds."""
            if score < threshold_poor:
                return 'Poor'
            elif threshold_poor <= score < threshold_good:
                return 'Standard'
            else:
                return 'Good'
        df_test_copy['Credit_Category'] = df_test_copy['Credit_Score'].apply(label_category)
    else:
        # NOTE(review): the label type of Credit_Score is not visible in this
        # cell. Non-numeric class names cannot be cast with astype(int), so the
        # predicted label is used directly as the category.
        df_test_copy['Credit_Score'] = predicted_scores
        df_test_copy['Credit_Category'] = df_test_copy['Credit_Score']
else:
    # Report the mismatch instead of skipping the labelling silently.
    print("Length of 'predicted_scores' does not match the length of 'df_test_copy'.")
df_test_copy
