In [33]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
from plotly.offline import init_notebook_mode
# Initialise Plotly's offline notebook mode before any figure is shown.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [34]:
# Load the training data from "train.csv" in the current working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# Print pandas' summary of the loaded frame as a quick sanity check of the load.
df.info()

In [None]:
# Count missing values per column (displayed as the cell output).
df.isna().sum()

In [37]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis="index", how="any")

In [39]:
# Number of rows per credit-score label (displayed as the cell output).
df.loc[:, "Credit_Score"].value_counts()

Standard    53174
Poor        28998
Good        17828
Name: Credit_Score, dtype: int64

In [40]:
# Colour used for each credit-score label in the plots.
color_discrete_map = dict(Poor="red", Standard="yellow", Good="green")

In [41]:
# Box plot of the Occupation column, split and coloured by credit-score label.
# A title is set so the figure stands on its own, matching the per-feature plots
# that are titled "Credit Scores based on <column>".
# NOTE(review): only x is given (no y) — confirm a box plot is the intended chart
# for Occupation.
fig = px.box(
    df,
    x="Occupation",
    color="Credit_Score",
    title="Credit Scores based on Occupation",
    color_discrete_map=color_discrete_map,
)
fig.show()

In [42]:
# Columns plotted against the credit-score label, one box plot each.
y_arr = [
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
]

In [None]:
# Draw one box plot per column listed in y_arr, grouped and coloured by credit-score label.
for feature in y_arr:
    title = f"Credit Scores based on {feature}"
    fig = px.box(
        df,
        x="Credit_Score",
        y=feature,
        color="Credit_Score",
        title=title,
        color_discrete_map=color_discrete_map,
    )
    # Ask Plotly to compute the box quartiles with its "exclusive" method.
    fig.update_traces(quartilemethod="exclusive")
    fig.show()

In [44]:
# Encode the Credit_Mix labels as integers: Bad -> 0, Standard -> 1, Good -> 2.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# mapping it again would look up 0/1/2 in a dict keyed by strings and silently
# turn every value into NaN. Labels outside the mapping still become NaN, as before.
credit_mix_codes = {"Bad": 0, "Standard": 1, "Good": 2}
if not pd.api.types.is_numeric_dtype(df["Credit_Mix"]):
    df["Credit_Mix"] = df["Credit_Mix"].map(credit_mix_codes)

In [45]:
# Columns removed from the frame before the feature matrix is built.
unused_columns = [
    "ID",
    "Customer_ID",
    "Month",
    "Name",
    "Age",
    "SSN",
    "Occupation",
    "Type_of_Loan",
    "Num_Credit_Inquiries",
    "Changed_Credit_Limit",
    "Credit_Utilization_Ratio",
    "Payment_of_Min_Amount",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Payment_Behaviour",
]
df = df.drop(columns=unused_columns)

In [47]:
# Feature matrix: every remaining column except the target label.
X = np.array(df.drop(columns=["Credit_Score"]))

# Target vector: the credit-score label of each row.
y = np.array(df["Credit_Score"])

In [48]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4242,
)

In [49]:
# Fit a random forest on the training split.
# random_state is pinned (same seed as the train/test split) so the trained forest,
# and therefore the reported accuracy, is reproducible on Restart & Run All.
model = RandomForestClassifier(random_state=4242)
model.fit(X_train, y_train)

In [50]:
# Predict the credit-score label for every held-out row.
y_pred = model.predict(X=X_test)

In [51]:
# Share of held-out rows whose predicted label matches the true label (cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8038333333333333

In [52]:
# Persist the fitted model to "model.pkl".
# The context manager closes the file handle (flushing the write) even if pickling
# raises; the previous bare open() left the handle to be closed by garbage collection.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)