# Cloud-Driven Loan Default Predictor using Machine Learning

## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample, shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import warnings
import boto3
from sagemaker import get_execution_role

# Silence every warning for the rest of the session: no category, module or
# message filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation and data-related warnings emitted
# by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'boto3'

## Task I - Data Loading

Instructions:
- Build the S3 path for the dataset loan_cleaned_data.csv using string formatting to concatenate the bucket name, folder name, and file key (i.e., the name of the dataset).
- Note: The bucket name is dataXYZXYZ (where XYZXYZ can be any random digits) and the folder name is cleaned data.
- Load the dataset into a pandas DataFrame.

Hint: Sample S3 URI - "s3://bucket_name/folder_name/file_name.csv"

In [None]:
# S3 coordinates of the cleaned loan dataset
bucket = 'data123456'  # Replace with your actual bucket name
folder_name = 'cleaned data'
data_key = 'loan_cleaned_data.csv'

# Assemble the object URI in the form s3://<bucket>/<folder>/<file>
data_location = 's3://{}/{}/{}'.format(bucket, folder_name, data_key)

# Load the CSV at that URI into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer=data_location)
print(data.head(n=5))

## Task II - Feature Engineering

Instructions:
- Convert the values in the categorical column 'purpose' into numerical format using One-hot Encoding.
- The datatype of the new columns should be int.

In [None]:
# Replace the categorical 'purpose' column with one integer (0/1) indicator
# column per category; all other columns are carried over unchanged.
data = pd.get_dummies(data=data, columns=['purpose'], dtype=int)

# Preview the first five rows to confirm the new indicator columns
print(data.head(n=5))

## Task III - Data Preprocessing

Instructions:
- Inspect the target column 'not_fully_paid' and identify the count of records belonging to the two classes.
- Filter out the majority and minority classes and store them separately.
- Handle the data imbalance by oversampling the minority class using the resample method so that the final count of records in both the classes becomes equal.
- Store the result in the variable df_minority_upsampled.
- Concatenate the upsampled minority data with the majority and assign the result to the new dataframe df.
- Inspect the target column of the new dataframe to verify that the data is balanced.

In [None]:
# Partition the rows by target label; label 0 is treated as the majority class
# and label 1 as the minority class.
df_majority = data.loc[data['not_fully_paid'] == 0]
df_minority = data.loc[data['not_fully_paid'] == 1]

# Draw minority rows with replacement until there are as many as majority rows.
# NOTE(review): this oversampling runs before the train/test split later in the
# notebook, so copies of one minority row can end up in both sets — confirm the
# resulting evaluation scores are not inflated by that overlap.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

# Stack both classes into a single balanced frame
df = pd.concat([df_majority, df_minority_upsampled], axis=0)

# Both labels should now report the same row count
print(df['not_fully_paid'].value_counts())

## Task IV - Model Training

Instructions:
- Drop the columns 'slno' and 'not_fully_paid' and create a dataframe of independent variables named X.
- Filter the dependent variable and store it in y.
- Split the data into training and test sets using 60:40 ratio. Use a random state equal to 42.
- Train a Random Forest Classifier model called rf using the training data. Use a random state equal to 42.

In [None]:
# Label vector: the target column. Feature matrix: every remaining column
# except 'slno', which is excluded from the predictors.
y = df['not_fully_paid']
X = df.drop(columns=['slno', 'not_fully_paid'])

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a random forest with default hyperparameters and a fixed seed
# (fit() returns the estimator itself, so rf is the trained model)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

print("Model training completed.")

## Task V - Model Evaluation

Instructions:
- Predict using the trained Random Forest Classifier model rf on the test data X_test.
- Evaluate the predictions by comparing it with the actual test data y_test.
- Print the classification report to determine the evaluation metric scores.

In [None]:
# Label the held-out rows with the fitted forest
pred = rf.predict(X_test)

# Build the text report comparing the predictions with the true test labels,
# then display it
report = classification_report(y_test, pred)
print(report)

## Task VI - Saving the Model to AWS S3

Instructions:
- Serialize the trained Random Forest model using joblib.
- Initialize the S3 client using the boto3 library.
- Save the serialized model to a temporary file using tempfile.
- Upload the model file to the specified S3 bucket named loan-data.
- Ensure the model is saved as model.pkl in the S3 bucket.

In [None]:
import tempfile
import joblib

# Destination of the serialized model: bucket 'loan-data', key 'model.pkl'
BUCKET_NAME = 'loan-data'
s3_client = boto3.client('s3')
model_name = 'model.pkl'

# Serialize the trained forest into an anonymous temporary file, then hand the
# open file object to boto3 so the bytes are streamed to S3 (with multipart
# transfer when the file is large) instead of being copied into memory a
# second time through fp.read().
with tempfile.TemporaryFile() as fp:
    joblib.dump(rf, fp)
    fp.seek(0)  # rewind: the upload reads from the current file position
    s3_client.upload_fileobj(fp, BUCKET_NAME, model_name)

print(f'Model saved to S3 as: {model_name}')