In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

In [2]:
# Read the credit-score CSV and discard the Name, SSN, ID and Customer_ID
# columns in one step, so `df` is always built fresh from the file.
file_path = '/kaggle/input/credit-scores/credit_scores.csv'
dropped_cols = ['Name', 'SSN', 'ID', 'Customer_ID']
df = pd.read_csv(file_path).drop(columns=dropped_cols)

In [10]:
# Display the first few rows of the dataset
print(df.head())

# Display the column names and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

      Month   Age Occupation  Annual_Income  Monthly_Inhand_Salary  \
0      July  23.0  Scientist       19114.12            1824.843333   
1  February  28.0    Teacher       34847.84            3037.986667   
2       May  28.0    Teacher       34847.84            3037.986667   
3      June  28.0    Teacher       34847.84            3037.986667   
4    August  28.0    Teacher       34847.84            3037.986667   

   Num_Bank_Accounts  Num_Credit_Card  Interest_Rate  Delay_from_due_date  \
0                3.0              4.0            3.0                  3.0   
1                2.0              4.0            6.0                  7.0   
2                2.0              4.0            6.0                  3.0   
3                2.0              4.0            6.0                  3.0   
4                2.0              4.0            6.0                  3.0   

   Num_of_Delayed_Payment  ...  Credit_Score  Count_Auto Loan  \
0                       8  ...          Good       

In [3]:
# Separate the label column from the predictor columns
target = 'Credit_Score'
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Partition the feature columns by dtype: numeric vs. text/categorical
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Numeric branch: fill missing values with the column mean, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch
column_branches = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [5]:
# Learn the imputation / scaling / encoding parameters from the training split
# only, then apply the already-fitted transformer to the held-out split.
# NOTE: X_train and X_test are rebound to the transformed outputs here, so the
# train/test split cell must be re-run before this cell can be run again.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Persist the preprocessor as fitted on the training split
joblib.dump(value=preprocessor, filename='preprocessor.pkl')

['preprocessor.pkl']

In [6]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: RBF kernel with three values of C
param_grid = {'kernel': ['rbf'], 'C': [0.02, 5, 30]}

# Base estimator; every SVC setting not listed in the grid stays at its default
svm = SVC()

# Exhaustive search over the grid using 5-fold cross-validation, ranked by accuracy
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the estimator built from the best-scoring parameter combination
best_model = grid_search.best_estimator_


In [7]:
from sklearn.metrics import accuracy_score

# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy
print(f'Accuracy of the best model: {accuracy}')


Accuracy of the best model: 0.7007932944170034


In [8]:
# Retrain the best model on the entire dataset.
# The preprocessor is refitted on all rows first, so its learned imputation
# values, scaling statistics and one-hot categories now come from the full
# data rather than from the training split alone.
X_processed = preprocessor.fit_transform(X)
best_model.fit(X_processed, y)

# Re-save the preprocessor: the copy written to 'preprocessor.pkl' earlier was
# fitted on the training split only and no longer matches the features this
# final model was trained on. Overwriting it keeps the two saved artifacts
# consistent, so they can be loaded and used together at inference time.
joblib.dump(preprocessor, 'preprocessor.pkl')

# Save the model
joblib.dump(best_model, 'best_svm_model.pkl')


['best_svm_model.pkl']