# Interactive notebook used to prepare code parts for Azure ML

### Install libraries

In [48]:
%pip install pandas numpy scikit-learn mlflow tensorflow

Collecting mlflow
  Using cached mlflow-2.0.1-py3-none-any.whl (16.5 MB)
Collecting tensorflow
  Using cached tensorflow-2.11.0-cp310-cp310-win_amd64.whl (1.9 kB)
Collecting alembic<2
  Using cached alembic-1.8.1-py3-none-any.whl (209 kB)
Collecting cloudpickle<3
  Using cached cloudpickle-2.2.0-py3-none-any.whl (25 kB)
Collecting docker<7,>=4.0.0
  Using cached docker-6.0.1-py3-none-any.whl (147 kB)
Collecting Jinja2<4,>=3.0
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting gitpython<4,>=2.1.0
  Using cached GitPython-3.1.29-py3-none-any.whl (182 kB)
Collecting shap<1,>=0.40
  Using cached shap-0.41.0-cp310-cp310-win_amd64.whl (435 kB)
Collecting markdown<4,>=3.3
  Using cached Markdown-3.4.1-py3-none-any.whl (93 kB)
Collecting pyarrow<11,>=4.0.0
  Using cached pyarrow-10.0.1-cp310-cp310-win_amd64.whl (20.2 MB)
Collecting protobuf<5,>=3.12.0
  Using cached protobuf-4.21.11-cp310-abi3-win_amd64.whl (527 kB)
Collecting pyyaml<7,>=5.1
  Using cached PyYAML-6.0-cp310-cp310-

ERROR: Could not install packages due to an OSError: [WinError 2] Systém nemůže nalézt uvedený soubor: 'c:\\Python310\\Scripts\\tabulate.exe' -> 'c:\\Python310\\Scripts\\tabulate.exe.deleteme'

You should consider upgrading via the 'c:\Python310\python.exe -m pip install --upgrade pip' command.


### Load data

In [35]:
import pandas as pd
import numpy as np

In [36]:
# Load raw data
df = pd.read_csv("../data/lending_club.csv")

df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\r\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\r\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\r\nGreggshire, VA 11650"


### Prepare data

In [37]:
# Create binary label
df['loan_repaid'] = np.where(df['loan_status'] == 'Fully Paid', 1, 0)
df = df.drop(labels='loan_status', axis=1)

# Drop features
df = df.drop(labels=['emp_title', 'emp_length', 'title', 'grade', 'issue_d'] , axis=1)

# Fill-in mort acc
mortmeans = df.groupby(by='total_acc')['mort_acc'].mean()
def myfill(total,mort):
    if pd.isna(mort):
        return mortmeans[total]
    else:
        return mort

df['mort_acc'] = df.apply(lambda x: myfill(x.total_acc, x.mort_acc), axis=1)

# Drop rows with missing data
df = df.dropna()

# Convert term feature to number
def convert_term(x):
    if x == ' 36 months': return 36
    if x == ' 60 months': return 60

df['term'] = df['term'].apply(lambda x: convert_term(x))

# Home ownership feature - join categories
def homeownership(x):
    if (x == 'NONE') or (x == 'ANY'): return 'OTHER'
    else: return x

df['home_ownership'].apply(lambda x: homeownership(x)).unique()

# Parse zip and create dummy vars
df['address'] = df['address'].str[-5:]

# Parse year
df['earliest_cr_year'] = pd.to_numeric(df['earliest_cr_line'].str[-4:])
df = df.drop(labels='earliest_cr_line', axis=1)

df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,home_ownership,annual_inc,verification_status,purpose,dti,...,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address,loan_repaid,earliest_cr_year
0,10000.0,36,11.44,329.48,B4,RENT,117000.0,Not Verified,vacation,26.24,...,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,22690,1,1990
1,8000.0,36,11.99,265.68,B5,MORTGAGE,65000.0,Not Verified,debt_consolidation,22.05,...,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,5113,1,2004
2,15600.0,36,10.49,506.97,B3,RENT,43057.0,Source Verified,credit_card,12.79,...,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,5113,1,2007
3,7200.0,36,6.49,220.65,A2,RENT,54000.0,Not Verified,credit_card,2.6,...,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,813,1,2006
4,24375.0,60,17.27,609.33,C5,MORTGAGE,55000.0,Verified,credit_card,33.95,...,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,11650,0,1999


In [38]:
# Get dummies (OHE)
df = pd.get_dummies(columns=['sub_grade', 'verification_status', 'application_type', 'initial_list_status', 'purpose', 'home_ownership', 'address'], data=df,drop_first=True)

df.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'mort_acc', 'pub_rec_bankruptcies', 'loan_repaid', 'earliest_cr_year',
       'sub_grade_A2', 'sub_grade_A3', 'sub_grade_A4', 'sub_grade_A5',
       'sub_grade_B1', 'sub_grade_B2', 'sub_grade_B3', 'sub_grade_B4',
       'sub_grade_B5', 'sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3',
       'sub_grade_C4', 'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2',
       'sub_grade_D3', 'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1',
       'sub_grade_E2', 'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5',
       'sub_grade_F1', 'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4',
       'sub_grade_F5', 'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3',
       'sub_grade_G4', 'sub_grade_G5', 'verification_status_Source Verified',
       'verification_status_Verified', 'application_type_INDIVIDUAL',
       'application_type_JOINT', 'initial_list_status_w',
  

In [39]:
df.count(axis=1)

0         81
1         81
2         81
3         81
4         81
          ..
396025    81
396026    81
396027    81
396028    81
396029    81
Length: 395219, dtype: int64

### Split data

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Collecting scikit-learn
  Using cached scikit_learn-1.2.0-cp310-cp310-win_amd64.whl (8.2 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Using cached scipy-1.9.3-cp310-cp310-win_amd64.whl (40.1 MB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 scipy-1.9.3 threadpoolctl-3.1.0


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [49]:
# Get features and labels
X = df.drop("loan_repaid",axis=1).values
y = df["loan_repaid"].values

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [51]:
np.savetxt("./X_train.csv", X_train, delimiter=",")
np.savetxt("./X_test.csv", X_test, delimiter=",")
np.savetxt("./y_train.csv", y_train, delimiter=",")
np.savetxt("./y_test.csv", y_test, delimiter=",")