In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib


In [3]:
# Read the credit-score CSV (path relative to the notebook's working
# directory) into a DataFrame
file_path = 'credit_scores.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [4]:
# Drop the columns that are not used as model features.
# `df` is rebound to the result instead of being mutated with inplace=True,
# and errors="ignore" skips columns that are already absent, so re-running
# this cell after it has executed once no longer raises KeyError.
columns_to_delete = ["Name", "SSN", "ID", "Customer_ID"]
df = df.drop(columns=columns_to_delete, errors="ignore")

In [5]:
# Separate the frame into the feature matrix X (every column except
# 'Credit_Score') and the target vector y (the 'Credit_Score' column)
X = df.drop('Credit_Score', axis=1)
y = df['Credit_Score']

In [6]:
# Partition the feature columns by dtype: int64/float64 columns are treated
# as numerical, object/category columns as categorical.
# NOTE(review): columns of any other dtype (e.g. bool, int32) land in neither
# group and pass through the preprocessing cells untouched.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [7]:
# Replace missing numerical values with the per-column mean.
# NOTE(review): the imputer is fitted on all of X, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the learned means.
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(X[numerical_cols])
X[numerical_cols] = imputer_num.transform(X[numerical_cols])

In [8]:
# Standardise the numerical columns with StandardScaler.
# NOTE(review): the scaler is fitted on all of X, before the train/test
# split, so its statistics also include the rows that later become X_test.
scaler = StandardScaler()
scaler.fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])

In [9]:
# Replace missing categorical values with each column's most frequent value.
# NOTE(review): fitted on all of X, before the train/test split.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(X[categorical_cols])
X[categorical_cols] = imputer_cat.transform(X[categorical_cols])

In [10]:
# Convert each categorical column to integer codes with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (keyed by column name);
# previously every encoder was overwritten by the next loop iteration, so the
# category -> code mapping could not be reapplied to new data or inverted.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str so each column is encoded from uniform string input
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

In [11]:
# Show the resulting dtypes, then actually verify that no object/category
# column is left: the assertion fails loudly instead of relying on a visual
# scan of the printed listing.
print(X.dtypes)
remaining_categorical = X.select_dtypes(include=['object', 'category']).columns.tolist()
assert not remaining_categorical, f"Non-numeric columns remain: {remaining_categorical}"

Month                              int32
Age                              float64
Occupation                         int32
Annual_Income                    float64
Monthly_Inhand_Salary            float64
Num_Bank_Accounts                float64
Num_Credit_Card                  float64
Interest_Rate                    float64
Delay_from_due_date              float64
Num_of_Delayed_Payment           float64
Changed_Credit_Limit             float64
Num_Credit_Inquiries             float64
Credit_Mix                         int32
Outstanding_Debt                 float64
Credit_Utilization_Ratio         float64
Credit_History_Age               float64
Payment_of_Min_Amount              int32
Total_EMI_per_month              float64
Amount_invested_monthly          float64
Payment_Behaviour                  int32
Monthly_Balance                  float64
Count_Auto Loan                  float64
Count_Credit-Builder Loan        float64
Count_Personal Loan              float64
Count_Home Equit

In [12]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle
# so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [15]:
# Tune an RBF-kernel SVM: 5-fold cross-validated grid search over the
# regularisation strength C, ranking candidates by accuracy
param_grid = dict(kernel=['rbf'], C=[0.01, 10, 20])

svm = SVC()
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [16]:
# Report the best model found by the grid search.
# `best_score_` is the mean cross-validated accuracy measured on folds of
# X_train only, so it is complemented with the accuracy on the held-out test
# split, which was previously created but never used.
best_model = grid_search.best_estimator_
accuracy = grid_search.best_score_
print("Best Model Parameters:", best_model)
print("Best Accuracy:", accuracy)

# Evaluate on data the search never saw
test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print("Test Accuracy:", test_accuracy)

Best Model Parameters: SVC(C=20)
Best Accuracy: 0.6762095850394627


In [17]:
# Retraining the best model using the whole dataset.
# GridSearchCV has already refitted `best_estimator_` on X_train, so fitting
# on X_train again changed nothing; fit on all of X / y as intended.
# After this cell the model has seen the X_test rows, so any held-out
# evaluation must be run before it.
best_model.fit(X, y)


In [18]:
# Saving the best model as a file
# NOTE(review): only the estimator is persisted here; the fitted imputers,
# scaler and label encoders are not part of the saved file.
joblib.dump(value=best_model, filename='credit scores best model')

['credit scores best model']