In [22]:
import pandas as pd
import altair as alt
import pandasql as psql

data = pd.read_csv('credit_risk_dataset.csv').dropna()
chartable = data.head(5000)
not_paid = chartable[chartable['loan_status'] == 0]
data.head(5)
largest_unpaid = not_paid["loan_amnt"].idxmax()
print(f'The largest loan amount that in default is {not_paid.loc[largest_unpaid, "loan_amnt"]} and that person makes {chartable.loc[largest_unpaid, "person_income"]} annually')


The largest loan amount that in default is 35000 and that person makes 115000 annually


In [23]:
# Enhanced area chart for loan amount vs person income
area_loan_amnt = alt.Chart(chartable).mark_area().encode(
    alt.X('loan_amnt', title='Loan Amount', scale=alt.Scale(domain=(0, 35000))),  # Adjusted domain to match typical loan amounts
    alt.Y('person_income', title='Annual Income', scale=alt.Scale(domain=(0, 350000))),  # Added scale for y-axis
    color=alt.Color('loan_status', title='Loan Status'),
    tooltip=['loan_amnt', 'person_income', 'loan_status']
).properties(
    width=800,
    title='Loan Amount vs Person Income'
)


In [6]:
import altair as alt
import pandas as pd

alt.data_transformers.disable_max_rows()
chartable = pd.read_csv('credit_risk_dataset.csv').dropna()
chartable['loan_status_mapped'] = chartable['loan_status'].map({1: '1: Default', 0: '0: Fully Paid'})
chartable['person_age'] = chartable[chartable['person_age'] <= 100]['person_age']


# Existing histograms
hist_age = alt.Chart(chartable).mark_bar().encode(
    alt.X('person_age:Q', bin=alt.Bin(maxbins=30), title='Person Age'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'person_age','loan_status_mapped:O',]
).properties(
    width=300,
    height=300,
    title='Distribution of Person Age'
)

hist_income = alt.Chart(chartable).mark_bar().encode(
    alt.X('loan_percent_income:Q', bin=alt.Bin(maxbins=30), title='Loan % Income Income'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'loan_percent_income','loan_status_mapped']
).properties(
    width=300,
    height=300,
    title='Distribution of Loan & Income Ratio'
)

hist_loan_amnt = alt.Chart(chartable).mark_bar().encode(
    alt.X('loan_amnt:Q', bin=alt.Bin(maxbins=30), title='Loan Amount'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'loan_amnt','loan_status_mapped']
).properties(
    width=300,
    height=300,
    title='Distribution of Loan Amount'
)

hist_loan_int_rate = alt.Chart(chartable).mark_bar().encode(
    alt.X('loan_int_rate:Q', bin=alt.Bin(maxbins=30), title='Loan Interest Rate'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'loan_int_rate','loan_status_mapped']
).properties(
    width=300,
    height=300,
    title='Distribution of Loan Interest Rate'
)

hist_loan_grade = alt.Chart(chartable).mark_bar().encode(
    alt.X('loan_grade', title='Loan Grade'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'loan_grade','loan_status_mapped']
).properties(
    width=300,
    height=300,
    title='Distribution of Loan Grade'
)

hist_loan_intent = alt.Chart(chartable).mark_bar().encode(
    alt.X('loan_intent', title='Loan Intent'),
    alt.Y('count()', title='Count'),
    color='loan_status_mapped:O',
    tooltip=['count()', 'loan_intent','loan_status_mapped']
).properties(
    width=300,
    height=300,
    title='Distribution of Loan Intent'
)

# Concatenate the histograms and the area chart in a grid layout
combined_charts = alt.vconcat(
    alt.hconcat(hist_age, hist_income, hist_loan_intent),
    alt.hconcat(hist_loan_amnt, hist_loan_int_rate, hist_loan_grade),
)

combined_charts.save('Property_Distribution.png')

In [25]:
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder

# def preprocess(data = data):
loan_grade_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
home_ownership_order = ['OWN', 'MORTGAGE', 'RENT', 'OTHER']

loan_grade_encoder = OrdinalEncoder(categories=[loan_grade_order], dtype=int)
home_ownership_encoder = OrdinalEncoder(categories=[home_ownership_order], dtype=int)

data['loan_grade'] = loan_grade_encoder.fit_transform(data.loan_grade.values.reshape(-1,1))
data['person_home_ownership'] = home_ownership_encoder.fit_transform(data.person_home_ownership.values.reshape(-1,1))
data = pd.get_dummies(data, columns=['loan_intent'], drop_first=True)
data['cb_person_default_on_file'] = data['cb_person_default_on_file'].map({'Y': 1, 'N': 0})
data['DTI'] = data['loan_amnt'] / data['person_income']
data = data.astype(float)

numeric_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_percent_income', 'cb_person_cred_hist_length', 'DTI']

standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

data[numeric_features] = standard_scaler.fit_transform(data[numeric_features])
data['loan_int_rate'] = robust_scaler.fit_transform(data['loan_int_rate'].values.reshape(-1,1))

# return data
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,DTI
0,-0.907594,-0.122674,2.0,28.453433,3.0,4.003984,0.901434,1.0,3.952527,1.0,-0.691791,0.0,0.0,0.0,1.0,0.0,3.969132
1,-1.066064,-0.914907,0.0,0.050866,1.0,-1.367627,0.026882,0.0,-0.653141,0.0,-0.939413,1.0,0.0,0.0,0.0,0.0,-0.615762
2,-0.432183,-0.914907,1.0,-0.911932,2.0,-0.656678,0.336918,1.0,3.76454,0.0,-0.691791,0.0,0.0,1.0,0.0,0.0,3.778785
3,-0.749123,-0.018433,2.0,-0.189833,2.0,4.003984,0.759857,1.0,3.388567,0.0,-0.939413,0.0,0.0,1.0,0.0,0.0,3.417232
4,-0.590653,-0.196445,2.0,0.772966,2.0,4.003984,0.587814,1.0,3.576554,1.0,-0.444169,0.0,0.0,1.0,0.0,0.0,4.439403


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def evaluate_model(y_test, y_pred):
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    return [conf_matrix, class_report]



In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

X = data.drop('loan_status', axis=1)
y = data['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)



Confusion Matrix:
[[4248  195]
 [ 616  669]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91      4443
         1.0       0.77      0.52      0.62      1285

    accuracy                           0.86      5728
   macro avg       0.82      0.74      0.77      5728
weighted avg       0.85      0.86      0.85      5728



[array([[4248,  195],
        [ 616,  669]]),
 '              precision    recall  f1-score   support\n\n         0.0       0.87      0.96      0.91      4443\n         1.0       0.77      0.52      0.62      1285\n\n    accuracy                           0.86      5728\n   macro avg       0.82      0.74      0.77      5728\nweighted avg       0.85      0.86      0.85      5728\n']

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# List of models to test
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print("\n")

Model: Logistic Regression
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91      4443
         1.0       0.77      0.52      0.62      1285

    accuracy                           0.86      5728
   macro avg       0.82      0.74      0.77      5728
weighted avg       0.85      0.86      0.85      5728



Model: Decision Tree
              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93      4443
         1.0       0.77      0.78      0.77      1285

    accuracy                           0.90      5728
   macro avg       0.85      0.85      0.85      5728
weighted avg       0.90      0.90      0.90      5728



Model: Random Forest
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      4443
         1.0       0.97      0.73      0.83      1285

    accuracy                           0.93      5728
   macro avg       0.95      0.86      0.90      5728
w



In [None]:
conf_matrix_df = pd.DataFrame(matrix, 
                            columns=['Predicted Negative', 'Predicted Positive'], 
                            index=['Actual Negative', 'Actual Positive'])

# Melt the DataFrame to long format
conf_matrix_melted = conf_matrix_df.reset_index().melt(id_vars='index')
conf_matrix_melted.columns = ['Actual', 'Predicted', 'Count']

# Create the heatmap using Altair
heatmap = alt.Chart(conf_matrix_melted).mark_rect().encode(
    x='Predicted:O',
    y='Actual:O',
    color='Count:Q',
    tooltip=['Actual', 'Predicted', 'Count']
).properties(
    width=300,
    height=300,
    title='Confusion Matrix'
)

# Add text annotations
text = heatmap.mark_text(baseline='middle').encode(
    text='Count:Q',
    color=alt.condition(
        alt.datum.Count > conf_matrix_melted['Count'].mean(), 
        alt.value('black'), 
        alt.value('white')
    )
)

# Combine heatmap and text
conf_matrix_chart = heatmap + text

# Display the chart


NameError: name 'matrix' is not defined

In [None]:
feature_importance = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)

# Create the bar chart
bar_chart = alt.Chart(importance_df.head(10)).mark_bar().encode(
    x=alt.X('Importance:Q', title='Importance'),
    y=alt.Y('Feature:O', sort='-x', title='Feature'),
    tooltip=['Feature', 'Importance'],
    color=alt.Color('Importance:Q', scale=alt.Scale(scheme='viridis'))
).properties(
    width=800,
    title='Top 10 Feature Importance'
)




AttributeError: 'MLPClassifier' object has no attribute 'feature_importances_'

In [None]:
X = data.drop('loan_status', axis=1)
y = data['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
eval = evaluate_model(y_test, y_pred)
eval
plot_feature_importance(model, X).display()
plot_confusion_matrix(eval[0])


Confusion Matrix:
[[4366   77]
 [ 379  906]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95      4443
         1.0       0.92      0.71      0.80      1285

    accuracy                           0.92      5728
   macro avg       0.92      0.84      0.87      5728
weighted avg       0.92      0.92      0.92      5728





NameError: name 'plot_feature_importance' is not defined

In [None]:
import lightgbm as lgb

lgb.plot_importance(model, figsize=(12, 25), max_num_features=100)
