In [1]:
# Import necessary libraries
import pandas as pd  # For data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier  # For the random forest classifier
from sklearn.model_selection import train_test_split  # To split the data into training and testing sets
from sklearn.metrics import accuracy_score  # To measure the accuracy of the model


In [2]:
# Translate Credit_Score labels into integer class codes through a lookup table.
def custom_encoding(y):
    """
    Convert Credit_Score labels into their numeric class codes.

    Parameters:
    y (Series): pandas Series holding the Credit_Score labels.

    Returns:
    Series: A new Series in which "Poor" and "Bad" become 0, "Standard"
    becomes 1 and "Good" becomes 2. A value that is missing from the lookup
    table comes back as NaN, because Series.map leaves unmapped entries empty.
    The input Series itself is not modified.
    """
    # "Poor" and "Bad" intentionally share the lowest code.
    score_codes = {
        "Poor": 0,
        "Bad": 0,
        "Standard": 1,
        "Good": 2,
    }
    return y.map(score_codes)


In [3]:
# Load the cleaned credit-score extract into a DataFrame
df = pd.read_csv('credit_score_clean_extraction.csv')

# Feature frame 'x': every column of 'df' except the Credit_Score target
x = df.drop(columns=['Credit_Score'])

# Keep the names of all feature columns in 'columns'
columns = x.columns

# Target 'y': the Credit_Score labels turned into numeric codes by 'custom_encoding'
y = custom_encoding(df["Credit_Score"])


In [4]:
# Seed feature set used for the baseline model
features = ["Outstanding_Debt", "Credit_Mix", "Credit_History_Age"]

# Feature matrix 'X' restricted to the seed features
X = x[features]

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with 100 trees and a fixed seed
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the training split
rf.fit(x_train, y_train)

# Predict the classes of the held-out rows
y_pred = rf.predict(x_test)

# Compute the baseline accuracy once, keep it in 'score', and report it
score = accuracy_score(y_test, y_pred)
print("Accuracy:", score)


Accuracy: 0.6869


The next cell performs greedy forward feature selection using a nested loop. Starting from the features already selected, each round trains a random forest classifier on the current feature set plus one candidate feature, for every candidate that has not been selected yet. The candidate that gives the highest accuracy, provided it beats the best accuracy seen so far, is added to the list of selected features. The loop continues until all features have been added or until no remaining feature improves the accuracy of the classifier. Note that candidates are compared on the test split, so the final accuracy is an optimistic estimate. The code could be improved by using more efficient feature selection methods and classifiers.

In [5]:
# Greedy forward feature selection.
#
# Each round tries every column that is not yet in 'features', trains a random
# forest on 'features + [candidate]' and measures accuracy on the held-out split.
# The candidate with the highest accuracy is appended to 'features', but only if
# it beats 'score' (the best accuracy so far). The search stops as soon as a
# round produces no improvement, or when there are no candidates left.
#
# Notes:
# - 'features' is extended in place and 'score' is updated to the best accuracy.
# - Candidates are compared on the test split, so 'score' is an optimistic
#   estimate of the accuracy on unseen data.
# - After the loop, X, x_train, x_test, y_train, y_test, rf, y_pred and new_score
#   belong to the LAST candidate evaluated, not to the selected feature set.
columns = x.columns

while True:
    # Reset the winner every round. Without this, a first round with no
    # improvement would leave 'new_feature' undefined (NameError), and later
    # rounds would silently rely on a stale value from the previous round.
    new_feature = None

    for i in columns:
        if i in features:  # Already selected, so it is not a candidate
            continue

        # Feature matrix made of the current selection plus the candidate
        X = x[features + [i]]

        # Same split as the baseline: 20% test size, random state 42
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Random forest with 100 estimators and a random state of 42
        rf = RandomForestClassifier(n_estimators=100, random_state=42)

        # Fit on the training split and predict the test split
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)

        # Accuracy obtained with this candidate added
        new_score = accuracy_score(y_test, y_pred)

        # Strict '>' means ties keep the earlier candidate (or no candidate)
        if new_score > score:
            new_feature = i  # Best candidate of this round so far
            score = new_score  # Raise the bar for the remaining candidates

    if new_feature is None:  # No candidate improved the accuracy (or none left)
        break

    features.append(new_feature)  # Keep the winner and start the next round


In [7]:
# Show the final feature list: the features chosen up front plus every feature
# that the forward-selection loop appended because it raised the accuracy.
print("Top features:", features)


Top features: ['Outstanding_Debt', 'Credit_Mix', 'Credit_History_Age', 'Amount_invested_monthly']
