In [1]:
# import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier # for random forest classifier
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.metrics import accuracy_score # to measure how good we are


In [2]:
# encodes Credit_Score variables into numerical values using a mapping dictionary.
def custom_encoding(y):
    """Translate textual Credit_Score labels into numeric class codes.

    Parameters
    ----------
    y : pandas.Series
        Series of credit-score labels.

    Returns
    -------
    pandas.Series
        Series aligned with ``y`` where "Poor" and "Bad" become 0,
        "Standard" becomes 1 and "Good" becomes 2. Any label that is not
        in the lookup table comes back as NaN (behaviour of ``Series.map``).
    """
    # "Poor" and "Bad" are deliberately collapsed into the same class.
    label_to_code = {"Poor": 0, "Bad": 0, "Standard": 1, "Good": 2}
    return y.map(label_to_code)
    

In [3]:
# Load the cleaned credit-score dataset from disk.
df = pd.read_csv('credit_score_clean_extraction.csv')

# Every column except the target is a candidate predictor.
x = df.drop(columns=['Credit_Score'])
column = x.columns  # names of all candidate feature columns

# Convert the textual target labels into numeric class codes.
y = custom_encoding(df["Credit_Score"])

In [13]:
# Baseline feature subset that the model starts from.
features = [
    "Outstanding_Debt",
    "Credit_Mix",
    "Credit_History_Age",
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X = x[features]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows and predict the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Baseline accuracy, kept in `score` so later cells can compare against it.
score = accuracy_score(y_test, y_pred)
print("Accuracy:", score)


Accuracy: 0.6869


The code performs greedy forward feature selection using a nested loop. Starting from the current feature list, each pass of the outer loop trains a random forest classifier on the selected features plus one remaining candidate feature at a time (it does not try every possible combination of features). It selects the candidate that improves the accuracy of the classifier on the held-out test split the most and adds it to the list of selected features. The loop continues until all features have been added or until no remaining feature improves the accuracy of the classifier. The code could be improved by using more efficient feature selection methods and classifiers.

In [None]:
# Greedy forward feature selection.
#
# Each pass of the outer loop tries every column that is not yet in
# `features`, retrains the random forest with that single column added, and
# remembers the column that pushes the held-out accuracy above the best
# `score` seen so far. The winner is appended to `features`; the search stops
# as soon as a full pass finds no column that improves `score`.
#
# NOTE(review): candidates are ranked on the same held-out split that is used
# to report accuracy, so the final score is optimistically biased. Consider a
# separate validation split or cross-validation for the selection step.
columns = x.columns
for _ in columns:  # at most one feature is added per pass
    # Reset per pass so a pass without improvement is detected explicitly
    # (previously this raised NameError when the very first pass found no
    # improvement, and otherwise relied on a stale value from the prior pass).
    new_feature = None
    for candidate in columns:
        if candidate in features:
            continue  # already selected
        X = x[features + [candidate]]  # current subset plus one candidate
        # Same seed and test size on every call, so every candidate is
        # scored on the same train/test rows.
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)
        new_score = accuracy_score(y_test, y_pred)
        if new_score > score:  # strict improvement over the best so far
            new_feature = candidate
            score = new_score

    if new_feature is None:  # no candidate helped: selection has converged
        break
    features.append(new_feature)
    
    
            

In [16]:
print("Top features:",features) # show the current contents of the `features` list (initial subset plus any features appended by the selection loop)

Top features: ['Credit_Mix', 'Total_Financial_Obligations', 'Outstanding_Debt', 'Credit_History_Age', 'Interest_Loan_Interaction']
