# Interactive notebook used to prepare code parts for Azure ML

### Install libraries

In [11]:
%pip install pandas numpy scikit-learn mlflow tensorflow imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     -------------------------------------- 226.0/226.0 KB 3.5 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Python310\python.exe -m pip install --upgrade pip' command.


### Load data

In [2]:
import pandas as pd
import numpy as np

In [16]:
# Load raw data
df = pd.read_csv("../data/lending_club.csv")

df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\r\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\r\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\r\nGreggshire, VA 11650"


In [25]:
# How well is label balanced?
df['loan_status'].groupby(df['loan_status']).count().sort_values(ascending=False)

KeyError: 'loan_status'

### Prepare data

In [17]:
# Create binary label
if 'loan_status' in df:
    print("loan_status column exists, create binary label loan_repaid")
    df['loan_repaid'] = np.where(df['loan_status'] == 'Fully Paid', 1, 0)
    df = df.drop(labels='loan_status', axis=1)
else:
    print("loan_status column does not exist, ignore")

# Drop features
df = df.drop(labels=['grade', 'issue_d'] , axis=1)

# Fill-in mort acc
mortmeans = df.groupby(by='total_acc')['mort_acc'].mean()
def myfill(total,mort):
    if pd.isna(mort):
        return mortmeans[total]
    else:
        return mort

df['mort_acc'] = df.apply(lambda x: myfill(x.total_acc, x.mort_acc), axis=1)

# Keep top 30 most frequent values for emp_title
top_values = df['emp_title'].groupby(df['emp_title']).count().sort_values(ascending=False).head(30).index.tolist()
df['emp_title'] = df['emp_title'].apply(lambda x: 'Other' if x not in top_values else x)

# Keep top 30 most frequent values for title
top_values = df['title'].groupby(df['title']).count().sort_values(ascending=False).head(30).index.tolist()
df['title'] = df['title'].apply(lambda x: 'Other' if x not in top_values else x)

# Drop rows with missing data
df = df.dropna()

# Convert term feature to number
def convert_term(x):
    if x == ' 36 months': return 36
    if x == ' 60 months': return 60

df['term'] = df['term'].apply(lambda x: convert_term(x))

# Home ownership feature - join categories
def homeownership(x):
    if (x == 'NONE') or (x == 'ANY'): return 'OTHER'
    else: return x

df['home_ownership'].apply(lambda x: homeownership(x)).unique()

# Parse zip
df['address'] = df['address'].str[-5:]

# Parse year
df['earliest_cr_year'] = pd.to_numeric(df['earliest_cr_line'].str[-4:])
df = df.drop(labels='earliest_cr_line', axis=1)

df.head()

loan_status column exists, create binary label loan_repaid


Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,...,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address,loan_repaid,earliest_cr_year
0,10000.0,36,11.44,329.48,B4,Other,10+ years,RENT,117000.0,Not Verified,...,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,22690,1,1990
1,8000.0,36,11.99,265.68,B5,Other,4 years,MORTGAGE,65000.0,Not Verified,...,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,5113,1,2004
2,15600.0,36,10.49,506.97,B3,Other,< 1 year,RENT,43057.0,Source Verified,...,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,5113,1,2007
3,7200.0,36,6.49,220.65,A2,Other,6 years,RENT,54000.0,Not Verified,...,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,813,1,2006
4,24375.0,60,17.27,609.33,C5,Other,9 years,MORTGAGE,55000.0,Verified,...,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,11650,0,1999


In [18]:
# Get dummies (OHE)
df = pd.get_dummies(columns=['sub_grade', 'verification_status', 'application_type', 'initial_list_status', 'purpose', 'home_ownership', 'address', 'emp_title', 'title', 'emp_length'], data=df,drop_first=True)

df.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       ...
       'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years',
       'emp_length_4 years', 'emp_length_5 years', 'emp_length_6 years',
       'emp_length_7 years', 'emp_length_8 years', 'emp_length_9 years',
       'emp_length_< 1 year'],
      dtype='object', length=150)

In [19]:
df.count(axis=1)

0         150
1         150
2         150
3         150
4         150
         ... 
396025    150
396026    150
396027    150
396028    150
396029    150
Length: 376929, dtype: int64

In [21]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
df2 = sm.fit_resample(df, df['loan_repaid'])

### Split data

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [36]:
# Get features and labels
X = df.drop("loan_repaid",axis=1).values
y = df["loan_repaid"].values

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [38]:
# Oversample train data to get more balanced label
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

In [9]:
# Scale features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
import joblib
joblib.dump(scaler, "./scaler.save")

['./scaler.save']

In [10]:
np.savetxt("./X_train.csv", X_train, delimiter=",")
np.savetxt("./X_test.csv", X_test, delimiter=",")
np.savetxt("./y_train.csv", y_train, delimiter=",")
np.savetxt("./y_test.csv", y_test, delimiter=",")

In [11]:
df.drop("loan_repaid",axis=1).columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'mort_acc', 'pub_rec_bankruptcies', 'earliest_cr_year', 'sub_grade_A2',
       'sub_grade_A3', 'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1',
       'sub_grade_B2', 'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5',
       'sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4',
       'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3',
       'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2',
       'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1',
       'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5',
       'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4',
       'sub_grade_G5', 'verification_status_Source Verified',
       'verification_status_Verified', 'application_type_INDIVIDUAL',
       'application_type_JOINT', 'initial_list_status_w',
       'purpose_c