In [3]:
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [4]:
# Load the raw dataset. The path is relative, so it resolves against the
# notebook's current working directory.
# Later cells treat the literal string 'No Data' in the 'Type_of_Loan' column
# as the marker for rows whose loan type has to be imputed.
df = pd.read_csv('credit_score.csv')

In [5]:
# Score candidate features against 'Type_of_Loan' with a chi-squared test.
# Only numeric columns are considered for now: sklearn's chi2 needs numeric,
# non-negative feature values (the class labels in y may stay as strings).
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Exclude columns that are identifiers rather than features:
# 'ID' and 'Customer_ID' are merely record/customer identifiers and 'Month' is a
# time-based identifier. Names that are absent from numerical_cols are simply
# ignored, so listing them is safe even when they are stored as strings.
excluded_cols = {'ID', 'Customer_ID', 'Month'}
relevant_cols = [col for col in numerical_cols if col not in excluded_cols]

# Feature selection only uses rows with a known target, i.e. rows whose
# 'Type_of_Loan' is not the 'No Data' placeholder.
df_for_feature_selection = df[df['Type_of_Loan'] != 'No Data']

X = df_for_feature_selection[relevant_cols]  # Feature matrix
y = df_for_feature_selection['Type_of_Loan']  # Target variable

# k='all' keeps every feature: SelectKBest is used here only to obtain the
# per-feature chi-squared scores, not to drop columns.
bestfeatures = SelectKBest(score_func=chi2, k='all')

fit = bestfeatures.fit(X, y)

# Pair each feature name with its score, highest score first.
feature_scores = (
    pd.DataFrame({'Feature': relevant_cols, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
)

# Display the feature scores (last expression of the cell is rendered)
feature_scores


In [6]:
# Keep only the features whose chi-squared score is above this cut-off.
# NOTE(review): 1e5 is an empirical threshold for this dataset -- revisit it if
# the scored feature set changes.
SCORE_THRESHOLD = 1e5
selected_features = feature_scores.loc[
    feature_scores['Score'] > SCORE_THRESHOLD, 'Feature'
].tolist()

# Fail early with a clear message instead of letting model fitting choke on a
# feature matrix with zero columns.
if not selected_features:
    raise ValueError(
        f"No feature has a chi-squared score above {SCORE_THRESHOLD}; "
        "lower the threshold or review the feature scores."
    )

# Prepare the dataset
# Rows with a known 'Type_of_Loan' form the training set ...
train_data = df[df['Type_of_Loan'] != 'No Data']
# ... and rows holding the 'No Data' placeholder form the prediction set.
predict_data = df[df['Type_of_Loan'] == 'No Data']

# Select only the chosen features and the target for training
X_train_full = train_data[selected_features]
y_train_full = train_data['Type_of_Loan']

# Label encode the target variable (string categories -> integer codes);
# the fitted encoder is kept so predictions can be decoded later.
label_encoder = LabelEncoder()
y_train_full_encoded = label_encoder.fit_transform(y_train_full)

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full_encoded, test_size=0.2, random_state=42
)

# Prepare the prediction set with the same feature columns as the training set
X_predict = predict_data[selected_features]

# Show the shape of the data to confirm
X_train.shape, X_val.shape, X_predict.shape


In [7]:


# Build a 50-tree random forest with a fixed seed and train it on the training
# split in one step; fit() hands back the estimator itself, so the chained call
# leaves the trained model in rf_classifier.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=50).fit(X_train, y_train)

# Predict the held-out validation split and measure the share of exact matches.
y_val_pred = rf_classifier.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# Final expression of the cell: the notebook renders the validation accuracy.
val_accuracy

: 

In [None]:
# Predict the 'Type_of_Loan' for the prediction set.
# scikit-learn estimators reject an input with zero rows, so when there is
# nothing left to impute (no 'No Data' rows) use an empty array of encoded
# labels instead of calling predict().
if len(X_predict) > 0:
    y_predict_encoded = rf_classifier.predict(X_predict)
else:
    y_predict_encoded = np.array([], dtype=int)

# Decode the predicted integer codes back to the original category labels
# (an empty input decodes to an empty array).
y_predict = label_encoder.inverse_transform(y_predict_encoded)

In [None]:
# Impute the predicted 'Type_of_Loan' values into the original dataset.
# The target rows are addressed through predict_data's index labels (the rows
# X_predict was built from) instead of re-testing for 'No Data'. Once this cell
# has run, df no longer contains 'No Data', so a recomputed mask would select
# zero rows and the assignment would fail on re-execution.
# NOTE(review): relies on df having unique index labels, which holds for the
# default index produced by pd.read_csv -- confirm if df is re-indexed elsewhere.
if len(y_predict) != len(predict_data):
    raise ValueError(
        f"Got {len(y_predict)} predictions for {len(predict_data)} rows to impute; "
        "re-run the prediction cell before imputing."
    )
df.loc[predict_data.index, 'Type_of_Loan'] = y_predict

# Persist the imputed dataset without the index column.
df.to_csv('../csv/no_data.csv', index=False)