## Exercise 9: Choosing the best performing model on a dataset

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
- Use all classification models

Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e10/overview



In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Dataset File

In [None]:
# Load the labelled training data (includes the loan_status target column).
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/train.csv?raw=true'
df = pd.read_csv(dataset_url)

## Test File

In [None]:
# Load the unlabelled test data used to generate the submission.
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/test.csv?raw=true'
dt=pd.read_csv(test_url)

In [None]:
# Show column dtypes and non-null counts of the test data.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


## Sample Submission File

In [None]:
# Load the sample submission, which defines the expected output columns (id, loan_status).
sample_submission_url ='https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/sample_submission.csv?raw=true'

sf=pd.read_csv(sample_submission_url)

In [None]:
# Show column dtypes and non-null counts of the sample submission.
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           39098 non-null  int64  
 1   loan_status  39098 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 611.0 KB


In [None]:
# Despite the name this is a dict: it maps a model's short name (e.g. 'knn')
# to its mean cross-validation accuracy.
score_list = {}

## 1. Train a KNN Classifier

In [None]:
# Preview the first rows of the training data.
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [None]:
# Count missing values per column in the training data.
df.isna().sum()

Unnamed: 0,0
id,0
person_age,0
person_income,0
person_home_ownership,0
person_emp_length,0
loan_intent,0
loan_grade,0
loan_amnt,0
loan_int_rate,0
loan_percent_income,0


In [None]:
# Show column dtypes and non-null counts of the training data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [None]:
# Feature matrix: every column except the target. The `id` column is still
# included, and the text columns have not been encoded yet at this point.
X = df.drop(columns=['loan_status']).values

In [None]:
# Target vector: the loan_status column.
y = df['loan_status'].values

In [None]:
# Hold out 30% of the rows for evaluation, then report the shape of each piece
# (train features, train target, test features, test target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(41051, 12)
(41051,)
(17594, 12)
(17594,)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace each text column of the training frame with integer codes, in place.
# One encoder object is re-fitted per column, so after the loop it only
# remembers the classes of the last text column it processed.
encoder = LabelEncoder()

for column in df.columns:
    if df[column].dtype != 'object':
        continue
    df[column] = encoder.fit_transform(df[column])

# Rebuild the target and feature arrays from the now fully numeric frame.
# NOTE(review): `id` is still part of X here; it is a row identifier rather
# than a loan attribute, so consider excluding it from the features.
y = df['loan_status'].values
X = df.drop(columns=['loan_status']).values

# Same 70/30 hold-out split as before, now on the encoded features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a 13-nearest-neighbours classifier on the training portion.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
knn

In [None]:
# Predict loan_status for the held-out split with the fitted KNN model.
y_pred = knn.predict(X_test)

In [None]:
import sklearn.metrics as metrics
# Fraction of held-out rows whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_pred)

0.8855859952256451

- Perform cross validation

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the KNN estimator on the full feature matrix and target.
scores = cross_val_score(knn, X, y, cv=5)
scores


array([0.86827522, 0.86929832, 0.87697161, 0.87432859, 0.87484014])

In [None]:
# Store the mean cross-validation accuracy for KNN, then report mean and spread.
score_list['knn'] = scores.mean()
print("%0.2f Accuracy" % score_list['knn'])
print("%0.2f Standard Deviation " % scores.std())

0.87 Accuracy
0.00 Standard Deviation 


## 2. Train a Logistic Regression Classifier

In [None]:
# Standardize the features before fitting. The raw columns are on very
# different scales (e.g. person_income vs. loan_percent_income), and without
# scaling the solver stopped at its iteration limit before converging.
# Using a pipeline also means the scaler is re-fitted inside each
# cross-validation fold when `lr` is passed to cross_val_score.
lr = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- Perform cross validation

In [None]:
# 5-fold cross-validation of the logistic regression model on the full data.
scores = cross_val_score(lr, X, y, cv=5)
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.88123455, 0.88362179, 0.88541223, 0.88328076, 0.88404809])

In [None]:
# Store the mean cross-validation accuracy for logistic regression, then report it.
score_list['lr'] = scores.mean()
print("%0.2f Accuracy" % score_list['lr'])
print("%0.2f Standard Deviation " % scores.std())

0.88 Accuracy
0.00 Standard Deviation 


## 3. Train a Naive Bayes Classifier

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training portion.
nb = GaussianNB().fit(X_train, y_train)
nb

- Perform cross validation

In [None]:
# 5-fold cross-validation of the Naive Bayes model on the full data.
scores = cross_val_score(nb, X, y, cv=5)
scores

array([0.87620428, 0.88089351, 0.88293972, 0.88336602, 0.88549748])

In [None]:
# Store the mean cross-validation accuracy for Naive Bayes, then report it.
score_list['nb'] = scores.mean()
print("%0.2f Accuracy" % score_list['nb'])
print("%0.2f Standard Deviation " % scores.std())

0.88 Accuracy
0.00 Standard Deviation 


## 4. Train an SVM Classifier

In [None]:
# Standardize the features before fitting the SVM. Its default RBF kernel is
# distance-based, so large-valued columns such as person_income dominate when
# the data is left unscaled. The recorded cross-validation scores were
# identical in every fold, which suggests the unscaled model was predicting a
# single class.
svm = make_pipeline(StandardScaler(), SVC(random_state=42))
svm.fit(X_train, y_train)

- Perform cross validation

In [None]:
# 5-fold cross-validation of the SVM model on the full data.
scores = cross_val_score(svm, X, y, cv=5)
scores

array([0.85761787, 0.85761787, 0.85761787, 0.85761787, 0.85761787])

In [None]:
# Report the mean and spread of the SVM cross-validation accuracy.
print("%0.2f Accuracy" % (scores.mean()))
print("%0.2f Standard Deviation " % scores.std())
# score_list is a dict (model name -> mean CV accuracy), so store by key;
# a dict has no .append() method.
score_list['svm'] = scores.mean()

0.86 Accuracy
0.00 Standard Deviation 


## 5. Train a Decision Tree Classifier

In [None]:
# Fit a decision tree on the training portion (seeded for reproducibility).
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dtc

- Perform cross validation

In [None]:
# 5-fold cross-validation of the decision tree on the full data.
# NOTE(review): in the recorded output the first fold scores far below the
# others. X still contains the `id` column, which may be the cause — confirm.
scores = cross_val_score(dtc, X, y, cv=5)
scores

array([0.16804502, 0.89274448, 0.89112456, 0.92045358, 0.83272231])

In [None]:
# Report the mean and spread of the decision-tree cross-validation accuracy.
print("%0.2f Accuracy" % (scores.mean()))
print("%0.2f Standard Deviation " % scores.std())
# score_list is a dict (model name -> mean CV accuracy), so store by key;
# a dict has no .append() method.
score_list['dtc'] = scores.mean()

0.74 Accuracy
0.29 Standard Deviation 


## 6. Train a Random Forest Classifier

In [None]:
# Fit a random forest on the training portion (seeded for reproducibility).
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rfc

In [None]:
# 5-fold cross-validation of the random forest on the full data.
# NOTE(review): in the recorded output the first fold scores far below the
# others. X still contains the `id` column, which may be the cause — confirm.
scores = cross_val_score(rfc, X, y, cv=5)
scores

array([0.27061131, 0.94875949, 0.94918578, 0.95106147, 0.94927104])

In [None]:
# Report the mean and spread of the random-forest cross-validation accuracy.
print("%0.2f Accuracy" % (scores.mean()))
print("%0.2f Standard Deviation " % scores.std())
# score_list is a dict (model name -> mean CV accuracy), so store by key;
# a dict has no .append() method.
score_list['rfc'] = scores.mean()

0.81 Accuracy
0.27 Standard Deviation 


## 7. Compare the performance of all classification models

In [None]:
# Normalize the collected scores into a plain dict of model name -> mean
# cross-validation accuracy. dict() accepts either a mapping or (name, score) pairs.
score_dict = dict(score_list)

# Iterate over (name, score) pairs. Looping over the dict itself would yield
# only the keys, and unpacking a key string into two names fails or mis-splits.
for model_name, score in score_dict.items():
    print(f"{model_name}: {score}")

knn: 0.8727427743200613
lr: 0.8817802029158497
nb: 0.8817802029158497
svm: 0.8576178702361668
dtc: 0.7410179895984312
rfc: 0.8137778156705602


## 8. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [None]:
# Preview the first rows of the test data.
dt.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [None]:
# Show the test data's column dtypes before the text columns are encoded.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


In [None]:
# Count missing values per column in the test data.
dt.isna().sum()

Unnamed: 0,0
id,0
person_age,0
person_income,0
person_home_ownership,0
person_emp_length,0
loan_intent,0
loan_grade,0
loan_amnt,0
loan_int_rate,0
loan_percent_income,0


In [None]:
# Encode the test set's text columns with the SAME category -> code mapping
# that the training data received. Fitting the encoder on the test column
# would assign different codes whenever the two files do not contain exactly
# the same set of categories. The training frame has already been encoded in
# place, so the raw training categories are re-read from the source file.
train_raw = pd.read_csv(dataset_url)

for column in dt.columns:
    if dt[column].dtype == 'object':
        encoder.fit(train_raw[column])
        # transform() raises ValueError for a category never seen in training,
        # which is preferable to silently mis-coding it.
        dt[column] = encoder.transform(dt[column])

In [None]:
# Confirm that the test data's former text columns are now numeric.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  int64  
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  int64  
 6   loan_grade                  39098 non-null  int64  
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  int64  
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 3.6 MB


In [None]:
# Show the training data's dtypes for comparison with the encoded test data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  int64  
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  int64  
 6   loan_grade                  58645 non-null  int64  
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  int64  
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(10)
memory

In [None]:
# Predict with the random forest on the encoded test features. The model was
# fitted on a plain NumPy array, so a NumPy array is passed here as well; the
# test frame has the same columns, in the same order, as the training features.
y_pred = rfc.predict(dt.values)

# Create a submission DataFrame keyed by the test-set ids. The ids come from
# the test frame itself: a bare `id` here would refer to Python's builtin
# id() function and write its repr into every row.
submission_df = pd.DataFrame({
    'id': dt['id'],
    'loan_status': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")



Submission file created: submission_file.csv
