In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "svm"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [2]:
kiva = pd.read_csv("C:/FinalProject/kiva_loans.csv")
mpi = pd.read_csv("C:/FinalProject/MPI_subnational.csv")

In [3]:
kiva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666977 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

In [4]:
kiva.describe()

Unnamed: 0,id,funded_amount,loan_amount,partner_id,term_in_months,lender_count
count,671205.0,671205.0,671205.0,657698.0,671205.0,671205.0
mean,993248.6,785.995061,842.397107,178.199616,13.739022,20.590922
std,196611.3,1130.398941,1198.660073,94.247581,8.598919,28.459551
min,653047.0,0.0,25.0,9.0,1.0,0.0
25%,823072.0,250.0,275.0,126.0,8.0,7.0
50%,992780.0,450.0,500.0,145.0,13.0,13.0
75%,1163653.0,900.0,1000.0,204.0,14.0,24.0
max,1340339.0,100000.0,100000.0,536.0,158.0,2986.0


In [5]:
mpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
ISO country code                     984 non-null object
Country                              984 non-null object
Sub-national region                  984 non-null object
World region                         984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


In [6]:
mpi.rename(columns = {'Sub-national region':'region'}, inplace = True)

In [7]:
combo = pd.merge(kiva,mpi)

In [8]:
combo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Data columns (total 27 columns):
id                                   60158 non-null int64
funded_amount                        60158 non-null float64
loan_amount                          60158 non-null float64
activity                             60158 non-null object
sector                               60158 non-null object
use                                  60157 non-null object
country_code                         60158 non-null object
country                              60158 non-null object
region                               60158 non-null object
currency                             60158 non-null object
partner_id                           60158 non-null float64
posted_time                          60158 non-null object
disbursed_time                       60158 non-null object
funded_time                          55350 non-null object
term_in_months                       60158 non-null float64
len

In [9]:
#combo.to_csv("combo.csv", sep=',')

In [10]:
corr_matrix = combo.corr()

In [11]:
corr_matrix["MPI Regional"].sort_values(ascending=False)

MPI Regional                         1.000000
Headcount Ratio Regional             0.988552
Intensity of deprivation Regional    0.920503
MPI National                         0.803208
partner_id                           0.373586
id                                   0.077235
term_in_months                       0.048235
lender_count                        -0.169813
funded_amount                       -0.194878
loan_amount                         -0.197346
Name: MPI Regional, dtype: float64

In [12]:
combo["funded_amount"].equals(combo["loan_amount"]) #change one of them to int and try again!!!

False

In [13]:
combo = combo.drop(["date", "posted_time", "disbursed_time", "term_in_months", "use", "country_code", "partner_id", "funded_time",
            "currency", "lender_count", "tags", "date", "ISO country code", "Country", "id","Intensity of deprivation Regional" ], axis = 1)

In [14]:
data = pd.get_dummies(combo, columns = ["activity", "sector", "country", "region", "borrower_genders", "repayment_interval", "World region"])

In [15]:
data.info()# what is uint8?

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Columns: 1281 entries, funded_amount to World region_Sub-Saharan Africa
dtypes: float64(5), uint8(1276)
memory usage: 76.0 MB


In [16]:
data.describe()

Unnamed: 0,funded_amount,loan_amount,MPI National,MPI Regional,Headcount Ratio Regional,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,...,"borrower_genders_male, male, male, male, male, male, male, male, male, male, male",repayment_interval_bullet,repayment_interval_irregular,repayment_interval_monthly,World region_Arab States,World region_East Asia and the Pacific,World region_Europe and Central Asia,World region_Latin America and Caribbean,World region_South Asia,World region_Sub-Saharan Africa
count,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,...,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0
mean,827.266531,880.164733,0.150316,0.128607,26.47133,0.033661,3.3e-05,0.005585,5e-05,0.001313,...,1.7e-05,0.219971,0.157585,0.622444,0.012251,0.205908,0.023272,0.489262,0.000399,0.268909
std,1316.200651,1376.57021,0.114735,0.117477,20.568533,0.180357,0.005766,0.074526,0.007062,0.036215,...,0.004077,0.41423,0.364354,0.48478,0.110005,0.404367,0.150768,0.499889,0.01997,0.443396
min,0.0,25.0,0.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,200.0,200.0,0.072,0.03,7.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,525.0,600.0,0.113,0.09,20.9,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1000.0,1043.75,0.251,0.183,42.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
max,50000.0,50000.0,0.552,0.744,98.1,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
data['funded_amount'] = data["funded_amount"].astype(int)
data['loan_amount'] = data['loan_amount'].astype(int)
data['MPI National'] = data['MPI National'].astype(int)
data['MPI Regional'] = data['MPI Regional'].astype(int)
data['Headcount Ratio Regional'] = data['Headcount Ratio Regional'].astype(int)
#data['Intensity of deprivation Regional'] = data['Intensity of deprivation Regional'].astype(int)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Columns: 1281 entries, funded_amount to World region_Sub-Saharan Africa
dtypes: int32(5), uint8(1276)
memory usage: 74.8 MB


In [20]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=.6, random_state = 42)

In [21]:
train_x = train_set.drop("MPI Regional", axis = 1)
train_y = train_set["MPI Regional"]

In [22]:
test_x = test_set.drop("MPI Regional", axis = 1)
test_y = test_set["MPI Regional"]

In [23]:
#how do I do multiclass?

In [37]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(train_x,train_y)
#y_pred_rf = rnd_clf.predict(test_x)

In [40]:
from sklearn.model_selection import GridSearchCV
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_cla = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(forest_cla, param_grid, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(train_x,train_y )
print (grid_search.best_params_)

{'max_features': 2, 'n_estimators': 3}


In [41]:
rnd_clf = RandomForestClassifier(n_estimators=3, max_features=2, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(rnd_clf, train_x, train_y, cv=3)

In [34]:
from sklearn.metrics import precision_score, recall_score

In [43]:
precision_score(train_y, y_train_pred)

0.0

In [44]:
recall_score(train_y,y_train_pred)

0.0