# Kiva Project - Example Notebook

Predict whether a Kiva loan application will default.

- Stephen W. Thomas
- Used for MMAI 891.

# Preliminaries: Inspect and Set up environment

In [None]:
!which python

/usr/local/bin/python


In [None]:
!python --version

Python 3.7.10


In [None]:
!echo $PYTHONPATH

/env/python


In [None]:
pip install unidecode textstat



In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import string

In [None]:
import datetime
print(datetime.datetime.now())

2021-02-24 14:22:02.193844


# Read Data

We'll read the data from the links that Uncle Steve provided.

In [None]:
# The labeled training data
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1dzzVbgHphbCf7kvq9IKiIhwzmxPbuH4s")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6138 entries, 0 to 6137
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   loan_id    6138 non-null   int64 
 1   en_clean   6138 non-null   object
 2   defaulted  6138 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 144.0+ KB


# EDA

In [None]:
df.head()

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0


In [None]:
df['defaulted'].value_counts()

0    3102
1    3036
Name: defaulted, dtype: int64

In [None]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# We can use spacy to show all the named entities in a given document.

doc = nlp(df.iloc[4001].en_clean)
displacy.render(doc, style="ent", jupyter=True)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the loan description text; the target is the default flag.
X, y = df['en_clean'], df['defaulted']

# Hold out part of the labeled data as a validation set so that the tuned
# model can be assessed on documents it never saw during training.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed keeps the split reproducible across runs
)

# Feature Engineering and Extraction Pipeline

In [None]:
import re
import unidecode

# A nice preprocessing function that we can pass to CountVectorizer/TfidfVectorizer
def my_preprocess(doc):
    """Normalize a single raw document before the vectorizer tokenizes it.

    The text is lowercased and then passed through ``unidecode.unidecode`` to
    get rid of any "weird" (non-ASCII) characters.

    Parameters
    ----------
    doc : str
        One document from the corpus.

    Returns
    -------
    str
        The lowercased, unidecode-processed text.
    """
    # TODO: What else?
    return unidecode.unidecode(doc.lower())

In [None]:
# These functions will calculate additional features on the document.
# They will be put into the Pipeline, called via the FunctionTransformer() function.
# Each one takes an entire corpus (as a list of documents), and should return
# an array of feature values (one for each document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners. Hopefully the names of the functions will make them self-explanatory.

def doc_length(corpus):
    """Return the length (in characters) of every document as a column vector.

    Parameters
    ----------
    corpus : iterable of str
        The documents to measure.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_documents, 1) holding ``len(doc)`` for each document.
    """
    lengths = list(map(len, corpus))
    # Add a trailing axis so the result is 2-D: one row per document.
    return np.asarray(lengths)[:, np.newaxis]

def num_exclamation_marks(corpus):
    """Count the '!' characters in every document, returned as a column vector.

    Parameters
    ----------
    corpus : iterable of str
        The documents to scan.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_documents, 1) with the number of '!' per document.
    """
    bang_counts = []
    for text in corpus:
        bang_counts.append(text.count('!'))
    # One row per document, a single feature column.
    return np.array(bang_counts).reshape(-1, 1)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import FunctionTransformer

# TF-IDF bag-of-words extractor; every document goes through my_preprocess
# before tokenization.
vectorizer = TfidfVectorizer(
    preprocessor=my_preprocess,
    max_features=100,
    use_idf=True,
)

# The classifier that consumes the combined feature matrix.
rf = RandomForestClassifier(criterion='entropy', random_state=223)

# One branch per feature source. The step names ('bow', 'vectorizer', ...)
# are what hyperparameter keys are built from, so they must not change.
bow_branch = Pipeline(steps=[('vectorizer', vectorizer)])
doc_length_branch = FunctionTransformer(doc_length, validate=False)
exclamation_branch = FunctionTransformer(num_exclamation_marks, validate=False)

# "Union" the BOW features with the custom-created features from the cell above.
feature_processing = FeatureUnion(transformer_list=[
    ('bow', bow_branch),
    ('doc_length', doc_length_branch),
    ('num_exclamation_marks', exclamation_branch),
])

# Full model: feature extraction followed by the random forest.
pipe = Pipeline(steps=[('features', feature_processing), ('clf', rf)])

# Model Training/Tuning/Cross Validation


In [None]:
from sklearn.model_selection import GridSearchCV

# The names of the hyperparameters may look a bit funny; they mirror how the
# steps are nested in the Pipeline object above, with each level separated by
# double underscores (features -> bow -> vectorizer -> max_features).
param_grid = {
    'features__bow__vectorizer__max_features': [500, 1000],
    'features__bow__vectorizer__use_idf': [True, False],
    'clf__n_estimators': [10, 100],
}

# Exhaustive search over the grid with 10-fold cross validation, scored by
# macro-averaged F1. Train scores are kept so they can be compared against
# the test scores afterwards.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=10,
    n_jobs=5,
    return_train_score=True,
    verbose=2,
).fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   52.7s
[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed:  4.5min finished


In [None]:
print("Best parameter (CV score: %0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score: 0.83892):
{'clf__n_estimators': 100, 'features__bow__vectorizer__max_features': 1000, 'features__bow__vectorizer__use_idf': True}


In [None]:
# Print out the results of hyperparameter tuning

def cv_results_to_df(cv_results):
    """Summarize a grid-search ``cv_results_`` mapping as a ranked DataFrame.

    Parameters
    ----------
    cv_results : mapping
        Must contain ``'params'`` (a sequence of parameter dicts, one per
        candidate). Any of the timing/score summary entries listed below are
        copied over when present.

    Returns
    -------
    pandas.DataFrame
        One row per candidate: the parameter values first, then the available
        summary columns, sorted by ``mean_test_score`` from best to worst.
    """
    summary_columns = (
        'mean_fit_time',
        'mean_score_time',
        'mean_train_score',
        'std_train_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score',
    )

    results = pd.DataFrame(list(cv_results['params']))
    for column in summary_columns:
        # The train-score entries are absent unless the search was run with
        # return_train_score=True; skip missing entries instead of raising
        # KeyError.
        if column in cv_results:
            results[column] = cv_results[column]

    if 'mean_test_score' in results.columns:
        results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
results

Unnamed: 0,clf__n_estimators,features__bow__vectorizer__max_features,features__bow__vectorizer__use_idf,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
6,100,1000,True,22.585341,0.378492,0.999807,9.7e-05,0.838921,0.009245,1
7,100,1000,False,22.197029,0.322462,0.999807,9.7e-05,0.835037,0.016848,2
5,100,500,False,23.220338,0.347646,0.999807,9.7e-05,0.825753,0.014224,3
4,100,500,True,23.761497,0.356861,0.999807,9.7e-05,0.824482,0.011324,4
3,10,1000,False,4.347159,0.275395,0.991742,0.000989,0.782153,0.019094,5
2,10,1000,True,4.417197,0.28137,0.992008,0.001077,0.778472,0.016039,6
0,10,500,True,4.538613,0.257739,0.992032,0.000794,0.772137,0.017156,7
1,10,500,False,4.423592,0.275062,0.991959,0.00108,0.763592,0.02343,8


# Model Assessment

In [None]:
y_val_pred = search.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(confusion_matrix(y_true = y_val, y_pred = y_val_pred))

class_names = [str(x) for x in search.best_estimator_.classes_]
print(classification_report(y_true = y_val, y_pred = y_val_pred, target_names=class_names))

[[687  89]
 [190 569]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       776
           1       0.86      0.75      0.80       759

    accuracy                           0.82      1535
   macro avg       0.82      0.82      0.82      1535
weighted avg       0.82      0.82      0.82      1535



# Kaggle Predictions

In [None]:
# Read in the unlabeled testing data (for the Kaggle competition)
df_test = pd.read_csv("https://drive.google.com/uc?export=download&id=1EVWfyqQOd_W2uTKrr4JTD2iFrEZHoOHT")

In [None]:
# Use our pipeline to make predictions; then output predictions to a CSV file.

pred_test = search.predict(df_test['en_clean'])
my_submission = pd.DataFrame({'id': df_test['loan_id'], 'predicted': pred_test})
my_submission.head()

# This command will save the file to the local cloud instance; it will be deleted
# as soon as this notebook session ends.
my_submission.to_csv('my_submission.csv', index=False)

Unnamed: 0,id,predicted
0,6607,0
1,154,1
2,7402,0
3,2617,1
4,6464,0


In [None]:
# Download predictions file to your local computer

# google.colab only exists inside a Colab runtime. Guard the import so that
# running the notebook anywhere else does not fail on this cell; in that case
# the CSV written by the previous cell is already on the local disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'my_submission.csv' was saved to the current working directory.")
else:
    # Triggers a browser download of the predictions file.
    files.download('my_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(datetime.datetime.now())

2021-02-24 14:26:42.500223
