In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load training and test data.
train_data = pd.read_csv(r'./train.csv', low_memory=False)
test_data = pd.read_csv(r'./test.csv', low_memory=False)
# Keep the test IDs for the submission files. They are copied from the frame
# that is already in memory instead of parsing test.csv a second time; the
# copy stays valid after 'ID' is dropped from test_data further down.
test_ids = test_data['ID'].copy()

In [3]:
# Drop columns that are not needed for modelling.
# A single shared list keeps both frames pruned identically; errors='ignore'
# skips any name a given frame does not contain (e.g. 'Unnamed: 0').
columns_to_drop = ['Unnamed: 0', 'ID', 'Customer_ID', 'Month', 'Name', 'Number']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# Strip any underscores from these columns and parse what is left as numbers;
# values that still cannot be parsed become NaN (errors='coerce').
for column_name in ('Current_Debt_Outstanding', 'Income_Annual', 'Credit_Limit', 'Age'):
    for frame in (train_data, test_data):
        as_text = frame[column_name].astype(str)
        without_underscores = as_text.str.replace('_', '', regex=False)
        frame[column_name] = pd.to_numeric(without_underscores, errors='coerce')

In [5]:
# Impute missing numeric values in the training frame with per-column medians.
train_medians = train_data.median(numeric_only=True)
train_data = train_data.fillna(train_medians)

# Derive the three ratio features for the training frame.
debt = train_data['Current_Debt_Outstanding']
income = train_data['Income_Annual']
credit_limit = train_data['Credit_Limit']
train_data = train_data.assign(
    Debt_Income_Ratio=debt / income,
    Income_Credit_Limit_Ratio=income / credit_limit,
    Debt_Credit_Limit_Ratio=debt / credit_limit,
)

In [6]:
# Turn +/-inf entries in the training frame into NaN, then fill the remaining
# NaNs with the per-column medians.
infinite_values = [np.inf, -np.inf]
train_data = train_data.replace(infinite_values, np.nan)
train_data = train_data.fillna(train_data.median(numeric_only=True))

# Encode the target labels as integers. The fitted encoder is kept in
# `label_encoder` so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(train_data['Credit_Score'])
train_data['Credit_Score'] = encoded_target

In [7]:
# Split the training frame into the target vector and the feature matrix.
target_column = 'Credit_Score'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

In [8]:
# Group the training columns by dtype so each group can be routed to its own
# preprocessing pipeline. 'number' matches every numeric width rather than
# only float64/int64, so e.g. int32 or float32 columns are not left out.
numerical_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Report columns that landed in neither group so they do not go missing
# unnoticed further down.
unrouted_features = X_train.columns.difference(
    numerical_features.union(categorical_features)
)
if len(unrouted_features) > 0:
    print(f"Warning: columns in neither feature group: {list(unrouted_features)}")

In [9]:
# Numeric preprocessing: median imputation followed by standard scaling.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical preprocessing: most-frequent imputation followed by one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [10]:
# Route each column group to its matching preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

In [11]:
# Prepare test data: fill missing values with the medians computed on the
# training frame (not on the test frame itself).
train_column_medians = train_data.median(numeric_only=True)
test_data = test_data.fillna(train_column_medians)

In [12]:
# Derive the same three ratio features for the test frame.
test_debt = test_data['Current_Debt_Outstanding']
test_income = test_data['Income_Annual']
test_credit_limit = test_data['Credit_Limit']
test_data = test_data.assign(
    Debt_Income_Ratio=test_debt / test_income,
    Income_Credit_Limit_Ratio=test_income / test_credit_limit,
    Debt_Credit_Limit_Ratio=test_debt / test_credit_limit,
)

In [13]:
# Turn +/-inf entries in the test frame into NaN, then fill the remaining NaNs
# with the medians of the training frame.
test_data = test_data.replace([np.inf, -np.inf], np.nan)
medians_from_train = train_data.median(numeric_only=True)
test_data = test_data.fillna(medians_from_train)

In [14]:
# Show the feature column names followed by the encoded target series.
print(X_train.columns, y_train, sep='\n')

Index(['Age', 'Profession', 'Income_Annual', 'Base_Salary_PerMonth',
       'Total_Bank_Accounts', 'Total_Credit_Cards', 'Rate_Of_Interest',
       'Total_Current_Loans', 'Loan_Type', 'Delay_from_due_date',
       'Total_Delayed_Payments', 'Credit_Limit', 'Total_Credit_Enquiries',
       'Credit_Mix', 'Current_Debt_Outstanding', 'Ratio_Credit_Utilization',
       'Credit_History_Age', 'Payment_of_Min_Amount', 'Per_Month_EMI',
       'Monthly_Investment', 'Payment_Behaviour', 'Monthly_Balance',
       'Debt_Income_Ratio', 'Income_Credit_Limit_Ratio',
       'Debt_Credit_Limit_Ratio'],
      dtype='object')
0        2
1        2
2        2
3        2
4        1
        ..
79995    1
79996    1
79997    1
79998    2
79999    2
Name: Credit_Score, Length: 80000, dtype: int64


In [None]:
# Candidate classifiers keyed by a display name; the loop below also uses the
# name to build each submission file name.
base_estimator = DecisionTreeClassifier(max_depth=6)
models = {
    'Random forest': RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42),
    'XGBoost': XGBClassifier(
        learning_rate=0.05,
        max_depth=6,
        n_estimators=300,
        random_state=42,
        eval_metric='mlogloss',
    ),
    'K nearest neighbours': KNeighborsClassifier(n_neighbors=5),
    'Logistic regression': LogisticRegression(random_state=42, max_iter=500),
    'Decision tree classifier': DecisionTreeClassifier(max_depth=6, random_state=42),
    'Gaussian': GaussianNB(),
    'Adaboost': AdaBoostClassifier(estimator=base_estimator, n_estimators=300, random_state=42),
}

In [None]:
# Fit every candidate model on the training data, predict on the test data and
# write one submission CSV per model.
# NOTE(review): if the preprocessor yields a sparse matrix, estimators that
# require dense input (GaussianNB is the likely candidate) will end up in the
# except branch below - confirm against the printed errors.
for model_name, classifier in models.items():
    try:
        # Chain the shared preprocessing with the current classifier.
        pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', classifier)])

        # Fit on the training data, then predict on the test data.
        pipeline.fit(X_train, y_train)
        test_predictions = pipeline.predict(test_data)

        # Map the integer predictions back to the original labels.
        test_predictions_labels = label_encoder.inverse_transform(test_predictions)

        # Write the submission file for this model.
        submission_path = f'submission_{model_name}.csv'
        submission = pd.DataFrame({'ID': test_ids, 'Credit_Score': test_predictions_labels})
        submission.to_csv(submission_path, index=False)

        print(f"Submission file '{submission_path}' created successfully!")
    except Exception as e:
        # Best-effort on purpose: one failing model must not stop the others.
        # The message names the model and the exception type so a failure can
        # be traced back to its source.
        print(f"Error while processing '{model_name}': {type(e).__name__}: {e}")