In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Peer-to-peer lending
## [LendingClub](https://www.lendingclub.com/) data: https://www.kaggle.com/wendykan/lending-club-loan-data/
## Regression problem: predicting interest rates

In [2]:
# Number of rows kept for modelling; the frame is truncated to this many rows
# after the rows with missing values have been dropped.
num_examples = 30000

In [3]:
# Load the LendingClub loan extract into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The truncated warning left in this cell's output
# looks like the mixed-dtype warning that chunked inference produces.
# NOTE(review): confirm the warning type by re-running; passing an explicit
# dtype= mapping for the affected columns is the stricter alternative.
df = pd.read_csv('loan_head.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first rows of the raw frame (head() shows 5 rows, as in the output below).
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [5]:
# Narrow the raw frame down to the six candidate features plus the regression
# target (int_rate); all other columns are ignored from here on.
df1 = df[[
    'loan_amnt',
    'funded_amnt',
    'term',
    'grade',
    'emp_title',
    'emp_length',
    'int_rate',
]]
df1.head()

Unnamed: 0,loan_amnt,funded_amnt,term,grade,emp_title,emp_length,int_rate
0,2500,2500,36 months,C,Chef,10+ years,13.56
1,30000,30000,60 months,D,Postmaster,10+ years,18.94
2,5000,5000,36 months,D,Administrative,6 years,17.97
3,4000,4000,36 months,D,IT Supervisor,10+ years,18.94
4,30000,30000,60 months,C,Mechanic,10+ years,16.14


### Drop rows with missing values
Sometimes the fact that a value is missing is itself informative, and in that case we would keep those rows and impute the missing values. Here we simply drop every row that has a missing value, to keep things simple.

In [6]:
# (rows, columns) of the column selection before any rows are dropped
df1.shape

(99999, 7)

In [7]:
# Boolean mask of present values for the first ten rows; False marks a missing cell.
df1.notna().head(10)

Unnamed: 0,loan_amnt,funded_amnt,term,grade,emp_title,emp_length,int_rate
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True
9,True,True,True,True,False,True,True


In [8]:
# Keep only the rows in which every selected column has a value
# (axis=0 / how='any' are the dropna() defaults, spelled out for clarity).
df2 = df1.dropna(axis=0, how='any')

In [21]:
# First ten surviving rows. The original index labels are kept, so gaps in the
# labels show where rows were dropped.
df2.head(10)

Unnamed: 0,loan_amnt,funded_amnt,term,grade,emp_title,emp_length,int_rate
0,2500,2500,36 months,C,Chef,10+ years,13.56
1,30000,30000,60 months,D,Postmaster,10+ years,18.94
2,5000,5000,36 months,D,Administrative,6 years,17.97
3,4000,4000,36 months,D,IT Supervisor,10+ years,18.94
4,30000,30000,60 months,C,Mechanic,10+ years,16.14
5,5550,5550,36 months,C,Director COE,10+ years,15.02
6,2000,2000,36 months,D,Account Manager,4 years,17.97
7,6000,6000,36 months,C,Assistant Director,10+ years,13.56
8,5000,5000,36 months,D,Legal Assistant III,10+ years,17.97
11,28000,28000,60 months,B,Consultant,2 years,11.31


In [9]:
# dropna() removed every row with at least one missing value in the selected
# columns: 83463 of the original 99999 rows remain in the run shown here
df2.shape

(83463, 7)

In [10]:
# ...and we are going to use even fewer examples: only the first num_examples
# of the remaining rows are kept (positional slice).
df2 = df2.iloc[:num_examples]

## Producing training and test data

In [11]:
# Regression target: the interest-rate column as a plain NumPy array.
y = df2['int_rate'].values
y[:10]

array([13.56, 18.94, 17.97, 18.94, 16.14, 15.02, 17.97, 13.56, 17.97,
       11.31])

In [12]:
# one target value per retained row
y.shape

(30000,)

### Encoding the features

In [18]:
# Lazily turn each row of df2 into a {feature name: value} dict.
# X_raw is a generator, so it can be iterated only once.
feature_columns = ['loan_amnt', 'funded_amnt', 'term', 'grade', 'emp_title', 'emp_length']
X_raw = (dict(zip(feature_columns, row))
         for row in df2.loc[:, feature_columns].values)

In [19]:
# Peek at one record. NOTE: next() consumes this item from the generator, so
# X_raw has to be rebuilt before it is handed to the vectorizer.
next(X_raw)

{'loan_amnt': 2500,
 'funded_amnt': 2500,
 'term': ' 36 months',
 'grade': 'C',
 'emp_title': 'Chef',
 'emp_length': '10+ years'}

In [20]:
# we have to do it again as we've consumed one data point from the generator above
X_raw = ({'loan_amnt': loan_amnt,
          'funded_amnt': funded_amnt,
          'term': term,
          'grade': grade,
          'emp_title': emp_title,
          'emp_length': emp_length}
         for loan_amnt, funded_amnt, term, grade, emp_title, emp_length in
         df2.loc[:, ['loan_amnt', 'funded_amnt', 'term', 'grade', 'emp_title', 'emp_length']].values)

In [21]:
# DictVectorizer maps the feature dicts to a numeric matrix. Judging by the
# outputs further down (15347 columns, 6 stored values per row), each distinct
# string value gets its own indicator column while numbers keep their value.
vectorizer = DictVectorizer()

In [22]:
# Fit the vectorizer on the records and encode them; this consumes the X_raw generator.
X = vectorizer.fit_transform(X_raw)

In [23]:
# The repr reports shape, dtype and number of stored elements of the sparse matrix.
X

<30000x15347 sparse matrix of type '<class 'numpy.float64'>'
	with 180000 stored elements in Compressed Sparse Row format>

In [25]:
# Densify just the first row to inspect it; X itself stays sparse.
X[0].toarray()

array([[0.0e+00, 1.0e+00, 0.0e+00, ..., 2.5e+03, 1.0e+00, 0.0e+00]])

In [26]:
# Hold out 10% of the rows as a test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition (and therefore comparable evaluation numbers); without it every
# run draws a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42)

In [27]:
# Sanity check: the training features and targets must have the same number of rows.
X_train.shape, y_train.shape

((27000, 15347), (27000,))

## Building the model and training

In [28]:
# Convert the sparse splits to dense NumPy arrays for model.fit / model.evaluate.
# The hasattr guard keeps this cell idempotent: once X_train / X_test are already
# NumPy arrays (which have no .toarray()), re-running the cell is a no-op
# instead of an AttributeError.
# Memory: a dense 27000 x 15347 float64 training matrix is roughly 3.3 GB.
if hasattr(X_train, 'toarray'):
    X_train = X_train.toarray()
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()

In [29]:
# Feed-forward regressor: two hidden ReLU layers of 40 units each, followed by
# a single output unit.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(40, activation=tf.keras.activations.relu))
model.add(tf.keras.layers.Dense(40, activation=tf.keras.activations.relu))
# no activation on the output layer: this is regression, the output is unbounded
model.add(tf.keras.layers.Dense(1))

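The model is trained by minimising the mean squared error (MSE); the mean absolute error (MAE) is tracked as an additional, easier-to-interpret metric. Here $y_i$ is the true interest rate of example $i$, $\hat{y}_i$ is the model's prediction and $n$ is the number of examples.

$\operatorname{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$

$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i - \hat{y}_i\right|$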

In [30]:
# Optimise mean squared error with Adam (step size 1e-4) and report both MSE
# and MAE as metrics during training and evaluation.
model.compile(
    loss='mean_squared_error',
    metrics=['mean_squared_error', 'mean_absolute_error'],
    optimizer=tf.keras.optimizers.Adam(1e-4),
)

In [31]:
# Train for 50 epochs. validation_split=0.1 sets aside 10% of the training rows
# for validation (2700 of 27000 in the run logged below). The call returns a
# History object, which is what the cell displays after the epoch log.
model.fit(X_train, y_train, epochs=50, validation_split=0.1)

Train on 24300 samples, validate on 2700 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50


Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1a3003a940>

In [32]:
# Score the held-out test split. The returned list follows the compile() call:
# [loss, mean_squared_error, mean_absolute_error]; the first two coincide
# because the loss is itself the mean squared error.
model.evaluate(X_test, y_test)



[1.791646253267924, 1.791646253267924, 1.0323849862416585]

# try:
1. other features
2. different neural network architectures
3. more data points
4. different problems (for example, is a loan good or bad?)