**Credit Score Classification**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [None]:
import pandas as pd
# Load the credit-score CSV into `df`.
# NOTE(review): the path is absolute and looks like a Colab Google Drive mount;
# the notebook will not run elsewhere without changing it.
df=pd.read_csv("/content/drive/MyDrive/Datasets/credit_score.csv")
df

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Count the rows for each distinct value of the "Month" column.
df["Month"].value_counts()

In [None]:
# Number of missing entries per column.
df.isna().sum()

In [None]:
# Remove the Month, ID, Customer_ID, Name and SSN columns before modelling.
# `df` is rebound (no inplace=True) and missing labels are ignored, so
# re-running this cell no longer raises KeyError on already-removed columns.
df = df.drop(columns=["Month", "ID", "Customer_ID", "Name", "SSN"], errors="ignore")

In [None]:
# Data type of each column of df.
df.dtypes

In [None]:
# Replace each listed column (five features plus the Credit_Score target) with
# integer codes.
# One LabelEncoder is kept per column in `encoders`: a single shared encoder is
# refitted on every iteration and only remembers the last column, so the
# feature mappings could never be reused to encode new inputs or to decode
# predictions.
features = ["Occupation", "Type_of_Loan", "Credit_Mix", "Payment_of_Min_Amount", "Payment_Behaviour", "Credit_Score"]
encoders = {}
for column in features:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])
# `le` ends up fitted on the last listed column, exactly as the shared encoder
# did after the original loop.
le = encoders[features[-1]]
df

In [None]:
# Feature matrix: every column of df except the last one.
X=df.iloc[:,:-1]
X

In [None]:
# Target vector: the last column of df.
# NOTE(review): presumably this is Credit_Score; that depends on the CSV's
# column order, which is not visible here -- confirm.
y=df.iloc[:,-1]
y

In [None]:

# Min-max scale every column of X (MinMaxScaler's default output range is [0, 1]).
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split has been made.
scaler=MinMaxScaler()
X_scaled=scaler.fit_transform(X)

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only and reuse that fit for the test rows. Splitting an array that was scaled
# on the full data lets the test rows influence the min/max used for training
# (data leakage). The over- and under-sampling sections further down already
# scale in this order.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Baseline classifiers, all with library-default hyper-parameters.
# The decision tree is seeded like the random forest and AdaBoost models so
# that repeated runs of the notebook build the same tree.
knn = KNeighborsClassifier()
sv = SVC()
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
ab = AdaBoostClassifier(random_state=1)

In [None]:
# Fit every baseline model on the training split, print its classification
# report for the test split and collect the test accuracy (in percent) in the
# same order as `models`.
accuracy_scores = []
models = [knn, sv, nb, dt, rf, ab]
for clf in models:
    print("*****************",clf,"**********************")
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = 100 * accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print(classification_report(y_test, y_pred))

In [None]:
# Bar chart of the baseline test accuracies, one bar per model, each annotated
# with its value rounded to two decimals.
x = ["knn", "sv", "nb", "dt", "rf", "ab"]
plt.bar(x, accuracy_scores)
for label, score in zip(x, accuracy_scores):
    plt.text(label, score, f'{score:.2f}', ha='center', va='bottom')
plt.title("Accuracy Scores for Different Algorithms")
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Scores")
plt.plot()
plt.show()

**Feature selection using correlation**

In [None]:
# Correlation of every column with the last column of the correlation matrix.
df.corr().iloc[:,-1]

In [None]:
# Full pairwise correlation matrix of df.
df.corr()

In [None]:
# Build df1 from df by dropping every column whose absolute correlation with
# Credit_Score is below the threshold. Columns whose correlation is NaN are
# kept, because NaN never compares as smaller than the threshold.
threshold = 0.05
correlation_values = df.corr().abs()
target_corr = correlation_values['Credit_Score']
columns_to_drop = [name for name, value in target_corr.items() if value < threshold]
df1 = df.drop(columns=columns_to_drop)

In [None]:
# Display the frame that remains after the low-correlation columns were dropped.
df1

In [None]:
# Rebuild the feature matrix from df1 (all columns but the last).
# Note: this rebinds the global `X` that the baseline section used.
X=df1.iloc[:,:-1]
X

In [None]:
# Target vector taken from the last column of df1; rebinds the global `y`.
y=df1.iloc[:,-1]
y

In [None]:
# Re-fit the existing `scaler` instance on the reduced feature set and keep the
# scaled array.
# NOTE(review): fitted on all rows of X, i.e. before any train/test split.
X1_scaled=scaler.fit_transform(X)
X1_scaled

In [None]:
# Split the unscaled, correlation-selected features first, then fit the scaler
# on the training rows only and apply that fit to the test rows. Splitting an
# array that was scaled on the full data leaks test-row min/max into training.
X1_train_raw, X1_test_raw, y1_train, y1_test = train_test_split(X, y, test_size=0.3, random_state=1)
X1_train = scaler.fit_transform(X1_train_raw)
X1_test = scaler.transform(X1_test_raw)

In [None]:
# Fresh default-parameter classifiers for the correlation-selected features.
# The decision tree is seeded like the random forest and AdaBoost models so
# that repeated runs build the same tree.
knn_co = KNeighborsClassifier()
sv_co = SVC()
nb_co = GaussianNB()
dt_co = DecisionTreeClassifier(random_state=1)
rf_co = RandomForestClassifier(random_state=1)
ab_co = AdaBoostClassifier(random_state=1)

In [None]:
# Fit each model on the correlation-selected training split and collect its
# test accuracy (in percent) in the same order as `models_co`.
accuracy_scores_co = []
models_co = [knn_co, sv_co, nb_co, dt_co, rf_co, ab_co]
for clf in models_co:
    y1_pred = clf.fit(X1_train, y1_train).predict(X1_test)
    accuracy_co = 100 * accuracy_score(y1_test, y1_pred)
    accuracy_scores_co.append(accuracy_co)

In [None]:
# Bar chart of the test accuracies obtained with correlation-based feature
# selection, each bar annotated with its value rounded to two decimals.
x_co = ["knn_co", "sv_co", "nb_co", "dt_co", "rf_co", "ab_co"]
plt.bar(x_co, accuracy_scores_co)
for label, score in zip(x_co, accuracy_scores_co):
    plt.text(label, score, f'{score:.2f}', ha='center', va='bottom')
plt.title("Accuracy Scores for Different Algorithms with feature selection using correlation")
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Scores")
plt.plot()
plt.show()

**Outlier Handling**

In [None]:
# Draw one box plot per column of the current feature matrix X.
# `numerical_features` is reused by the filtering cell that follows.
# NOTE(review): X holds every feature column, including the label-encoded
# ones, so "numerical" is not restricted to continuous columns.
numerical_features = X.columns
for column_name in numerical_features:
    sns.boxplot(y=df[column_name])
    plt.title(f'Box Plot for {column_name}')
    plt.show()

In [None]:
# Column by column, keep only the rows of df1 whose value lies within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (both ends inclusive).
# NOTE(review): df1 is narrowed inside the loop, so each column's quartiles are
# computed on rows that already survived the previous columns -- the result
# depends on column order, and re-running the cell filters df1 again.
for column in numerical_features:
    q1 = df1[column].quantile(0.25)
    q3 = df1[column].quantile(0.75)
    spread = q3 - q1
    within_fences = df1[column].between(q1 - 1.5 * spread, q3 + 1.5 * spread)
    df1 = df1.loc[within_fences]

In [None]:
# Display df1 after the IQR filtering.
df1

In [None]:
# (rows, columns) of the original frame `df`, not of the filtered `df1` --
# presumably shown for comparison with the df1 display above.
df.shape

In [None]:
# Rebuild the feature matrix from the outlier-filtered df1; rebinds the global `X`.
X=df1.iloc[:,:-1]
X

In [None]:
# Target vector from the last column of the outlier-filtered df1; rebinds the global `y`.
y=df1.iloc[:,-1]
y

In [None]:
# Separate MinMaxScaler for the outlier-filtered features.
# NOTE(review): fitted on all rows of X, i.e. before any train/test split.
scaler_no_outliers=MinMaxScaler()
X_scaled_no_outliers=scaler_no_outliers.fit_transform(X)

In [None]:
# Split the unscaled, outlier-filtered features first, then fit the scaler on
# the training rows only and apply that fit to the test rows. Splitting an
# array that was scaled on the full data leaks test-row min/max into training.
X_train_raw_no_outliers, X_test_raw_no_outliers, y_train_no_outliers, y_test_no_outliers = train_test_split(
    X, y, test_size=0.3, random_state=1
)
X_train_no_outliers = scaler_no_outliers.fit_transform(X_train_raw_no_outliers)
X_test_no_outliers = scaler_no_outliers.transform(X_test_raw_no_outliers)

In [None]:
# Fresh default-parameter classifiers for the outlier-filtered data.
# The decision tree is seeded like the random forest and AdaBoost models so
# that repeated runs build the same tree.
knn_no = KNeighborsClassifier()
sv_no = SVC()
nb_no = GaussianNB()
dt_no = DecisionTreeClassifier(random_state=1)
rf_no = RandomForestClassifier(random_state=1)
ab_no = AdaBoostClassifier(random_state=1)

In [None]:
# Fit each model on the outlier-filtered training split and collect its test
# accuracy (in percent) in the same order as `models_no_outliers`.
accuracy_scores_no_outliers = []
models_no_outliers = [knn_no, sv_no, nb_no, dt_no, rf_no, ab_no]
for clf in models_no_outliers:
    fitted = clf.fit(X_train_no_outliers, y_train_no_outliers)
    y_pred_no_outliers = fitted.predict(X_test_no_outliers)
    accuracy_no_outliers = 100 * accuracy_score(y_test_no_outliers, y_pred_no_outliers)
    accuracy_scores_no_outliers.append(accuracy_no_outliers)

In [None]:
# Bar chart of the test accuracies after outlier removal, each bar annotated
# with its value rounded to two decimals.
# The first tick label was misspelled "knns"; it now reads "knn" like the
# other charts in this notebook.
x_no_outliers = ["knn", "sv", "nb", "dt", "rf", "ab"]
plt.bar(x_no_outliers, accuracy_scores_no_outliers)
for label, score in zip(x_no_outliers, accuracy_scores_no_outliers):
    plt.text(label, score, f'{score:.2f}', ha='center', va='bottom')
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Scores")
plt.title("Accuracy Scores for Different Algorithms after outlier removal")
plt.plot()
plt.show()

**Checking imbalanced dataset**

In [None]:
# Row count per Credit_Score class in df.
# NOTE(review): this inspects `df`, whereas the resampling cells below operate
# on X / y taken from the outlier-filtered `df1`.
df["Credit_Score"].value_counts()

In [None]:
# Count plot of the Credit_Score classes in df.
sns.countplot(x="Credit_Score",data=df)

**Over Sampling**

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE over-sampler with a fixed seed.
oversampler=SMOTE(random_state=1)
# Resamples the full X, y as currently bound (last assigned from df1 in the
# outlier section); no train/test split has been made at this point.
X_os,y_os=oversampler.fit_resample(X,y)

In [None]:
# Split the original (not resampled) X, y first and apply SMOTE to the training
# rows only. Splitting data that was over-sampled as a whole puts synthetic
# points interpolated from test rows into the training set and evaluates on
# synthetic rows, which inflates the reported scores. The test split keeps the
# real class distribution. (X_os / y_os from the previous cell are no longer
# used by this section.)
X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X, y, test_size=0.3, random_state=1)
X_train_os, y_train_os = oversampler.fit_resample(X_train_os, y_train_os)

In [None]:
# Fit the shared `scaler` on the over-sampling training rows only, then apply
# that same fit to the test rows. The *_os models below are trained on this scaling.
X_train_os=scaler.fit_transform(X_train_os)
X_test_os=scaler.transform(X_test_os)

In [None]:
# Fresh default-parameter classifiers for the over-sampled data.
# The decision tree is seeded like the random forest and AdaBoost models so
# that repeated runs build the same tree.
knn_os = KNeighborsClassifier()
sv_os = SVC()
nb_os = GaussianNB()
dt_os = DecisionTreeClassifier(random_state=1)
rf_os = RandomForestClassifier(random_state=1)
ab_os = AdaBoostClassifier(random_state=1)

In [None]:
# Fit each model on the over-sampling training split, print its classification
# report for the test split and collect the test accuracy (in percent).
# Note: this rebinds the global `models` list used by the baseline section.
accuracy_scores_os = []
models = [knn_os, sv_os, nb_os, dt_os, rf_os, ab_os]
for clf in models:
    print("*****************",clf,"**********************")
    y_pred_os = clf.fit(X_train_os, y_train_os).predict(X_test_os)
    accuracy_os = 100 * accuracy_score(y_test_os, y_pred_os)
    accuracy_scores_os.append(accuracy_os)
    print(classification_report(y_test_os, y_pred_os))

In [None]:
# Bar chart of the test accuracies obtained with over-sampling, each bar
# annotated with its value rounded to two decimals.
x_os = ["knn_os", "sv_os", "nb_os", "dt_os", "rf_os", "ab_os"]
plt.bar(x_os, accuracy_scores_os)
for label, score in zip(x_os, accuracy_scores_os):
    plt.text(label, score, f'{score:.2f}', ha='center', va='bottom')
plt.title("Accuracy Scores for Different Algorithms with Over Sampling")
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Scores")
plt.plot()
plt.show()

**Under Sampling**

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Random under-sampler with a fixed seed.
undersampler=RandomUnderSampler(random_state=1)
# Resamples the full X, y as currently bound (last assigned from df1 in the
# outlier section); no train/test split has been made at this point.
X_us,y_us=undersampler.fit_resample(X,y)


In [None]:
# Split the original (not resampled) X, y first and under-sample the training
# rows only, so the test split keeps the real class distribution and the scores
# stay comparable with the other sections. (X_us / y_us from the previous cell
# are no longer used by this section.)
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(X, y, test_size=0.3, random_state=1)
X_train_us, y_train_us = undersampler.fit_resample(X_train_us, y_train_us)

In [None]:
# Scale the under-sampling split with its own scaler, fitted on the training
# rows only. Re-fitting the shared `scaler` here would overwrite the fit made
# for the over-sampling section, and that `scaler` object is what gets pickled
# next to `rf_os` at the end of the notebook -- the saved scaler would then not
# match the saved model.
scaler_us = MinMaxScaler()
X_train_us = scaler_us.fit_transform(X_train_us)
X_test_us = scaler_us.transform(X_test_us)

In [None]:
# Fresh default-parameter classifiers for the under-sampled data.
# The decision tree is seeded like the random forest and AdaBoost models so
# that repeated runs build the same tree.
knn_us = KNeighborsClassifier()
sv_us = SVC()
nb_us = GaussianNB()
dt_us = DecisionTreeClassifier(random_state=1)
rf_us = RandomForestClassifier(random_state=1)
ab_us = AdaBoostClassifier(random_state=1)

In [None]:
# Fit each model on the under-sampling training split and collect its test
# accuracy (in percent).
# Note: this rebinds the global `models` list once more.
accuracy_scores_us = []
models = [knn_us, sv_us, nb_us, dt_us, rf_us, ab_us]
for clf in models:
    y_pred_us = clf.fit(X_train_us, y_train_us).predict(X_test_us)
    accuracy_us = 100 * accuracy_score(y_test_us, y_pred_us)
    accuracy_scores_us.append(accuracy_us)

In [None]:
# Bar chart of the test accuracies obtained with under-sampling, each bar
# annotated with its value rounded to two decimals.
x_us = ["knn_us", "sv_us", "nb_us", "dt_us", "rf_us", "ab_us"]
plt.bar(x_us, accuracy_scores_us)
for label, score in zip(x_us, accuracy_scores_us):
    plt.text(label, score, f'{score:.2f}', ha='center', va='bottom')
plt.title("Accuracy Scores for Different Algorithms with Under Sampling")
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Scores")
plt.plot()
plt.show()

**The Random Forest model, coupled with oversampling, exhibits high accuracy (85%), precision, recall, and F1 score, making it a suitable choice for prediction.**

In [None]:
import joblib

In [None]:
# File name the selected model is written to by the dump cell below.
filename="model.pkl"

In [None]:
# Persist the over-sampling random forest and the scaler.
# The files are opened in `with` blocks so the handles are flushed and closed
# as soon as each dump finishes (the bare open(...) calls were never closed).
# NOTE(review): `scaler` must still hold the fit made on the over-sampling
# training rows that `rf_os` was trained on; confirm no later cell has
# re-fitted it before this cell runs.
with open(filename, "wb") as model_file:
    joblib.dump(rf_os, model_file)
with open("mscaler.pkl", "wb") as scaler_file:
    joblib.dump(scaler, scaler_file)

In [None]:
# Install Streamlit into the environment of the running kernel. `%pip` targets
# the kernel's own interpreter, whereas `!pip` runs whichever pip is first on
# the shell PATH.
%pip install streamlit

In [None]:
# !wget -q -O - ipv4.icanhazip.com

In [None]:
# ! streamlit run app.py & npx localtunnel --port 8501

/bin/bash: line 1: streamlit: command not found
[K[?25hnpx: installed 22 in 4.151s
your url is: https://ninety-pens-appear.loca.lt
