In [1]:
# Importing necessary libraries
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings(action='ignore')

In [2]:
# Load the raw loan records from a CSV file in the working directory.
dataset_path = 'loan.csv'
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df = pd.read_csv(filepath_or_buffer=dataset_path, low_memory=False)
print("Dataset Shape:", df.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'loan.csv'

In [3]:
# Keep only the columns used for modelling: seven predictors plus 'grade',
# which becomes the target later in the notebook.
lucky_features = [
    'loan_amnt',
    'term',
    'int_rate',
    'grade',
    'emp_length',
    'annual_inc',
    'purpose',
    'dti',
]
df = df.loc[:, lucky_features]
print("Reduced Dataset Shape:", df.shape)

Reduced Dataset Shape: (2260668, 8)


In [4]:
# Impute missing values.
# Missing employment length becomes the string '0' (the column is still text here).
df['emp_length'] = df['emp_length'].fillna('0')
# Income and debt-to-income ratio are filled with their own column median.
for numeric_col in ('annual_inc', 'dti'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

In [5]:
# Convert 'term' from text to an integer by capturing its first run of digits.
term_digits = df['term'].str.extract('(\\d+)', expand=False)
df['term'] = term_digits.astype(int)

In [6]:
# Encode the letter grades as ordinal integers: A -> 0, B -> 1, ..., G -> 6.
grade_mapping = {letter: code for code, letter in enumerate('ABCDEFG')}
# Series.map turns any value that is not a key of grade_mapping into NaN.
df['grade'] = df['grade'].map(grade_mapping)

In [7]:
# Convert 'emp_length' to a float by capturing its first run of digits;
# entries without any digit become NaN and are then replaced with 0.
emp_years = df['emp_length'].str.extract('(\\d+)', expand=False)
df['emp_length'] = emp_years.astype(float).fillna(0)

In [8]:
# Replace 'purpose' with indicator columns named 'purpose_<value>'.
# drop_first=True omits the indicator of the first category, which then acts
# as the implicit baseline (all purpose_* columns False).
df = pd.get_dummies(data=df, columns=['purpose'], drop_first=True)

In [9]:
# Display the transformed dataset
print("Transformed Dataset Info:")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

Transformed Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260668 entries, 0 to 2260667
Data columns (total 20 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   loan_amnt                   int64  
 1   term                        int32  
 2   int_rate                    float64
 3   grade                       int64  
 4   emp_length                  float64
 5   annual_inc                  float64
 6   dti                         float64
 7   purpose_credit_card         bool   
 8   purpose_debt_consolidation  bool   
 9   purpose_educational         bool   
 10  purpose_home_improvement    bool   
 11  purpose_house               bool   
 12  purpose_major_purchase      bool   
 13  purpose_medical             bool   
 14  purpose_moving              bool   
 15  purpose_other               bool   
 16  purpose_renewable_energy    bool   
 17  purpose_small_business      bool   
 18  purpose_vacation            bool   


In [10]:
# Separate the target ('grade') from the predictor columns.
y = df['grade']
X = df.drop(columns=['grade'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [11]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics reach the fit.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Save the fitted scaler so the same transform can be reapplied later.
# The context manager guarantees the file handle is flushed and closed even
# if pickling fails, instead of leaving it to the garbage collector.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [13]:
# Training a Random Forest Classifier.
# RandomForestClassifier is imported in the notebook's top import cell so all
# dependencies are declared in one place.
rf_model = RandomForestClassifier(
    n_estimators=50,        # Reduce number of trees
    max_depth=10,           # Limit tree depth
    max_features="sqrt",    # Use fewer features for splits
    min_samples_split=10,   # Require at least 10 samples to split a node
    n_jobs=-1,              # Build trees on all available cores
    random_state=100        # Fixed seed for reproducible trees
)
rf_model.fit(X_train, y_train)

In [14]:
# Persist the fitted forest with joblib at compression level 3.
# Left as the cell's last expression so the list of written files is displayed.
joblib.dump(
    value=rf_model,
    filename="random_forest_model_compressed.pkl",
    compress=3,
)

['random_forest_model_compressed.pkl']

In [15]:
# Score the held-out split and report confusion matrix, accuracy and per-class metrics.
y_pred = rf_model.predict(X_test)

grade_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, target_names=grade_names)

print("Confusion Matrix:\n", test_confusion)
print("\nAccuracy Score:", test_accuracy)
print("\nClassification Report:\n", test_report)


Confusion Matrix:
 [[ 98982   9218      0      0      0      0      0]
 [   603 152212  13344      0      0      0      0]
 [    54   5044 151549   6063      0      0      0]
 [    36     10   8564  71534    641      0      0]
 [    14      1    291  20660  12986      4      0]
 [     7      0     73   1765   6743   1754      0]
 [     1      0     17    636   1657    688     16]]

Accuracy Score: 0.8652893746450164

Classification Report:
               precision    recall  f1-score   support

           A       0.99      0.91      0.95    108200
           B       0.91      0.92      0.92    166159
           C       0.87      0.93      0.90    162710
           D       0.71      0.89      0.79     80785
           E       0.59      0.38      0.46     33956
           F       0.72      0.17      0.27     10342
           G       1.00      0.01      0.01      3015

    accuracy                           0.87    565167
   macro avg       0.83      0.60      0.62    565167
weighted avg 

In [16]:
# A single hand-written loan application used to smoke-test the model.
# 'emp_length' and 'purpose' are given in raw text form, before any encoding.
sample_data = dict(
    loan_amnt=15000,
    term=36,
    int_rate=12.5,
    emp_length='5 years',
    annual_inc=70000,
    purpose='credit_card',
    dti=18.3,
)

# Wrap the record in a list so it becomes a one-row DataFrame.
input_data = pd.DataFrame(data=[sample_data])



In [17]:
# Preprocessing sample data with the same steps applied to the training frame.
input_data['term'] = input_data['term'].astype(int)
input_data['emp_length'] = input_data['emp_length'].str.extract('(\\d+)').astype(float).fillna(0)
# drop_first must be False here: a one-row frame has a single 'purpose'
# category, so drop_first=True would discard its only indicator column and the
# sample would be scored as the baseline purpose instead of its real one.
# Any indicator that is not a training column is removed when the columns are
# aligned to X.columns afterwards.
input_data = pd.get_dummies(input_data, columns=['purpose'], drop_first=False)



In [18]:
# Align the sample with the training feature matrix: every training column
# the sample lacks is added with value 0.
missing_cols = set(X.columns).difference(input_data.columns)
input_data = input_data.assign(**{col: 0 for col in missing_cols})
# Select the training columns in training order; extra columns are dropped.
input_data = input_data.loc[:, X.columns]

# Apply the scaler fitted on the training split.
input_scaled = scaler.transform(input_data)


In [20]:

# Reload the persisted forest from disk (replacing the in-memory model) and
# predict the encoded grade for the scaled sample.
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
rf_model = joblib.load(filename='random_forest_model_compressed.pkl')
prediction = rf_model.predict(X=input_scaled)


In [21]:
# Invert grade_mapping (code -> letter) to decode the predicted class.
reverse_grade_mapping = dict(zip(grade_mapping.values(), grade_mapping.keys()))
predicted_code = prediction[0]
predicted_grade = reverse_grade_mapping[predicted_code]
# Accuracy is recomputed from the test-set predictions made earlier in the notebook.
accuracy = accuracy_score(y_test, y_pred)
print(f"Sample Prediction: Credit Grade is {predicted_grade} (Model Accuracy: {accuracy:.2%})")

Sample Prediction: Credit Grade is C (Model Accuracy: 86.53%)
