In [260]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the cleaned loan data set; the relative path is resolved against the
# kernel's current working directory.
loan = pd.read_csv(filepath_or_buffer='../cleaned_loan.csv')

# Shown as the cell result: (number of rows, number of columns).
loan.shape

In [262]:
# Target vector: the grouped loan-status label.
y = loan['loan_status_grouped']

# Feature matrix: every remaining column except both status columns (so the
# target is not among the features), the emp_title / purpose / title columns,
# the issue date and the grade column (sub_grade is kept).
X = loan.drop(
    labels=[
        'loan_status',
        'loan_status_grouped',
        'emp_title',
        'purpose',
        'title',
        'issue_d',
        'grade',
    ],
    axis=1,
)

In [None]:
# Dimensions (rows, columns) of the feature matrix after the column drop.
X.shape

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [265]:
# Encode sub_grade into numerical values

# Score contributed by the grade letter (first character): 'A' is highest.
_GRADE_SCORES = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
# Offset contributed by the sub-grade digit (second character): '1' is highest.
_SUB_GRADE_OFFSETS = {'1': 0.8, '2': 0.6, '3': 0.4, '4': 0.2, '5': 0.0}


def sub_grades_encoding(x):
    """Convert a sub-grade label such as ``'B3'`` into a single number.

    The result is the grade-letter score plus the sub-grade offset, so
    ``'A1'`` maps to 7.8 and ``'G5'`` maps to 1.0.

    Parameters
    ----------
    x : str or any
        Sub-grade label. The first character is looked up as the grade
        letter and the second as the sub-grade digit.

    Returns
    -------
    number or the input itself
        * For strings: letter score + digit offset. An unknown or missing
          character contributes 0, so ``'A'`` gives 7 and ``''`` gives 0.
        * For non-strings (NaN, None, or an already-encoded number): the
          value is returned unchanged, so missing values stay missing and
          applying the function twice does not raise or corrupt the data.
    """
    if not isinstance(x, str):
        # Indexing a float NaN or None would raise TypeError; pass it through.
        return x

    # Slices (rather than x[0] / x[1]) yield '' for short strings instead of
    # raising IndexError; '' then falls back to the 0 default below.
    return _GRADE_SCORES.get(x[:1], 0) + _SUB_GRADE_OFFSETS.get(x[1:2], 0)

def encode_ordinal_by_magnitude(values):
    """Integer-encode a Series so that the codes follow the levels' magnitude.

    LabelEncoder numbers classes in sorted order, which for text labels is
    lexicographic (for example '10+ years' would sort before '2 years'), so
    its codes are not ordinal. Here each distinct level is ranked by the
    first number found in its label instead; a label starting with '<' is
    ranked just below its number. Numeric input is ranked by value, which
    gives the same codes LabelEncoder would produce.

    Parameters
    ----------
    values : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Codes 0..k-1 on the same index as ``values``. Labels containing no
        number receive the lowest codes. Missing values stay missing.
    """
    levels = pd.Series(values.dropna().unique())

    if pd.api.types.is_numeric_dtype(levels):
        label = levels.astype(str)
        magnitude = levels.astype(float)
    else:
        label = levels.astype(str).str.strip()
        magnitude = pd.to_numeric(
            label.str.extract(r'(\d+(?:\.\d+)?)', expand=False),
            errors='coerce',
        )
        # "< 1 ..." must rank below "1 ...".
        magnitude = magnitude.where(~label.str.startswith('<'), magnitude - 0.5)

    ranking = pd.DataFrame(
        {'level': levels, 'magnitude': magnitude, 'label': label}
    ).sort_values(['magnitude', 'label'], na_position='first')

    codes = {level: code for code, level in enumerate(ranking['level'])}
    return values.map(codes)


# Replace the sub_grade labels with their numeric score. The dtype guard keeps
# this cell re-runnable: once the column is numeric it is left alone.
if not pd.api.types.is_numeric_dtype(X['sub_grade']):
    X['sub_grade'] = X['sub_grade'].apply(sub_grades_encoding)

# Separate numerical and non-numerical columns.
# numerical_columns is captured here, before the ordinal/one-hot encoding, so
# only the float64 columns present at this point (sub_grade included) are
# standardised further down.
ordinal_columns = ['emp_length']
categorical_columns = ['home_ownership', 'term']
numerical_columns = X.select_dtypes(include=['float64']).columns

# Encode ordinal columns with order-preserving integer codes before splitting.
# NOTE(review): this assumes the ordering of emp_length is carried by the
# number inside each label - confirm against cleaned_loan.csv.
for col in ordinal_columns:
    X[col] = encode_ordinal_by_magnitude(X[col])

# One-hot encode the categorical columns before splitting, so the train and
# test frames share the same dummy columns.
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Now perform the train-test split (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Ensure the train and test sets have the same column structure
# (the dummies were created before the split, so this is a safeguard).
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Standardise the numerical columns: fit on the training split only, then
# apply the same transformation to the test split.
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
# One-hot columns that have to be cast to integers for the model.
boolean_columns = ['home_ownership_ANY','home_ownership_MORTGAGE', 'home_ownership_NONE','home_ownership_OTHER','home_ownership_OWN','home_ownership_RENT','term_ 36 months','term_ 60 months']

# Dropping variables to avoid multicollinearity and quasi-separation
dropped_columns = ['home_ownership_NONE', 'term_ 36 months', 'funded_amnt', 'installment', 'total_pymnt']


def finalize_split(features, target):
    """Apply the final, identical clean-up to one split (train or test).

    Steps: cast the one-hot columns to int, drop the excluded columns,
    coerce the target to numeric (unparseable values become NaN) and keep
    only the rows where every feature and the target are present.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows of the split; must contain ``boolean_columns`` and
        ``dropped_columns``.
    target : pandas.Series
        Target values sharing ``features``' index.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Cleaned features and target, row-aligned on the same index.
    """
    features = features.copy()
    features[boolean_columns] = features[boolean_columns].astype(int)
    features = features.drop(columns=dropped_columns)

    target = pd.to_numeric(target, errors='coerce')

    # A row is usable only if no feature and no target value is missing.
    complete = features.notna().all(axis=1) & target.notna()
    return features.loc[complete], target.loc[complete]


# The same cleaning is applied to BOTH splits. Cleaning only the training data
# would leave NaN rows in X_test (which yield NaN probabilities that are then
# silently counted as class 0) and an uncoerced y_test.
X_train, y_train = finalize_split(X_train, y_train)
X_test, y_test = finalize_split(X_test, y_test)

X_train

In [None]:
# Using statsmodels library to run Logit Regression

# Add an intercept column to the independent variables.
# has_constant='add' always inserts the 'const' column. With the default
# ('skip'), add_constant silently adds nothing when a split already contains a
# non-zero constant column, which can leave the train and test design matrices
# with different columns.
X_train_with_const = sm.add_constant(X_train, has_constant='add')
X_test_with_const = sm.add_constant(X_test, has_constant='add')

# The fitted coefficients are matched to the test matrix by column, so both
# matrices must have identical columns in identical order.
assert list(X_train_with_const.columns) == list(X_test_with_const.columns), (
    "Train and test design matrices have different columns"
)

# Set up the logistic regression model
logit_model = sm.Logit(y_train, X_train_with_const)

# Fit the model and obtain the result object
# (maxiter raised to 2000 to give the optimiser room to converge)
result = logit_model.fit(maxiter=2000)

# Print the summary of the logistic regression results
print(result.summary())

In [None]:
# Make Predictions
predicted_probs = result.predict(X_test_with_const)  # Predicted probability of class 1
predicted_classes = (predicted_probs >= 0.5).astype(int)  # Threshold at 0.5 to get binary classes

# Generate evaluation metrics.
# These must use predicted_classes from above; the name `y_pred` is not
# defined anywhere in this notebook and would raise NameError on a fresh
# kernel (or silently score a stale variable from an earlier session).
accuracy = accuracy_score(y_test, predicted_classes)
class_report = classification_report(y_test, predicted_classes)
conf_matrix = confusion_matrix(y_test, predicted_classes)

# Print the accuracy score and classification report
print(f'Accuracy: {accuracy:.2f}',"\n")
print("Classification Report:\n", class_report)

# Plot confusion matrix using seaborn heatmap
# (rows are the true labels, columns the predicted labels)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='.0f', cmap='Blues', cbar=True, xticklabels=['0', '1'], yticklabels=['0', '1'], ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()