In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "svm"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [2]:
kiva = pd.read_csv("C:/kiva_loans.csv")
mpi = pd.read_csv("C:/MPI_subnational.csv")

In [3]:
kiva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666977 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

In [4]:
kiva.describe()

Unnamed: 0,id,funded_amount,loan_amount,partner_id,term_in_months,lender_count
count,671205.0,671205.0,671205.0,657698.0,671205.0,671205.0
mean,993248.6,785.995061,842.397107,178.199616,13.739022,20.590922
std,196611.3,1130.398941,1198.660073,94.247581,8.598919,28.459551
min,653047.0,0.0,25.0,9.0,1.0,0.0
25%,823072.0,250.0,275.0,126.0,8.0,7.0
50%,992780.0,450.0,500.0,145.0,13.0,13.0
75%,1163653.0,900.0,1000.0,204.0,14.0,24.0
max,1340339.0,100000.0,100000.0,536.0,158.0,2986.0


In [5]:
mpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
ISO country code                     984 non-null object
Country                              984 non-null object
Sub-national region                  984 non-null object
World region                         984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


In [6]:
mpi.rename(columns = {'Sub-national region':'region'}, inplace = True)

In [7]:
combo = pd.merge(kiva,mpi)

In [8]:
combo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Data columns (total 27 columns):
id                                   60158 non-null int64
funded_amount                        60158 non-null float64
loan_amount                          60158 non-null float64
activity                             60158 non-null object
sector                               60158 non-null object
use                                  60157 non-null object
country_code                         60158 non-null object
country                              60158 non-null object
region                               60158 non-null object
currency                             60158 non-null object
partner_id                           60158 non-null float64
posted_time                          60158 non-null object
disbursed_time                       60158 non-null object
funded_time                          55350 non-null object
term_in_months                       60158 non-null float64
len

In [9]:
#combo.to_csv("combo.csv", sep=',')

In [10]:
corr_matrix = combo.corr()

In [11]:
corr_matrix["MPI Regional"].sort_values(ascending=False)

MPI Regional                         1.000000
Headcount Ratio Regional             0.988552
Intensity of deprivation Regional    0.920503
MPI National                         0.803208
partner_id                           0.373586
id                                   0.077235
term_in_months                       0.048235
lender_count                        -0.169813
funded_amount                       -0.194878
loan_amount                         -0.197346
Name: MPI Regional, dtype: float64

In [74]:
combo["funded_amount"].equals(combo["loan_amount"])

False

In [13]:
combo = combo.drop(["date", "posted_time", "disbursed_time", "term_in_months", "use", "country_code", "partner_id", "funded_time",
            "currency", "lender_count", "tags", "date", "ISO country code", "Country", "id","Intensity of deprivation Regional" ], axis = 1)

In [14]:
data = pd.get_dummies(combo, columns = ["activity", "sector", "country", "region", "borrower_genders", "repayment_interval", "World region"])

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Columns: 1281 entries, funded_amount to World region_Sub-Saharan Africa
dtypes: float64(5), uint8(1276)
memory usage: 76.0 MB


In [16]:
data.describe()

Unnamed: 0,funded_amount,loan_amount,MPI National,MPI Regional,Headcount Ratio Regional,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,...,"borrower_genders_male, male, male, male, male, male, male, male, male, male, male",repayment_interval_bullet,repayment_interval_irregular,repayment_interval_monthly,World region_Arab States,World region_East Asia and the Pacific,World region_Europe and Central Asia,World region_Latin America and Caribbean,World region_South Asia,World region_Sub-Saharan Africa
count,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,...,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0,60158.0
mean,827.266531,880.164733,0.150316,0.128607,26.47133,0.033661,3.3e-05,0.005585,5e-05,0.001313,...,1.7e-05,0.219971,0.157585,0.622444,0.012251,0.205908,0.023272,0.489262,0.000399,0.268909
std,1316.200651,1376.57021,0.114735,0.117477,20.568533,0.180357,0.005766,0.074526,0.007062,0.036215,...,0.004077,0.41423,0.364354,0.48478,0.110005,0.404367,0.150768,0.499889,0.01997,0.443396
min,0.0,25.0,0.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,200.0,200.0,0.072,0.03,7.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,525.0,600.0,0.113,0.09,20.9,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1000.0,1043.75,0.251,0.183,42.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
max,50000.0,50000.0,0.552,0.744,98.1,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Shows the different values within MPI Regional 

In [17]:
mpi_info = data['MPI Regional']
mpi_info.value_counts()

0.311    10000
0.030     4907
0.062     4067
0.021     3802
0.090     2383
0.121     2225
0.076     2149
0.107     2048
0.077     2048
0.120     1853
0.017     1729
0.182     1532
0.020     1338
0.031     1325
0.160     1146
0.026     1121
0.106     1073
0.043     1067
0.204      999
0.038      776
0.099      732
0.221      719
0.047      655
0.009      576
0.035      572
0.105      544
0.006      521
0.048      443
0.130      411
0.193      404
         ...  
0.100       11
0.195        9
0.050        7
0.243        6
0.373        6
0.431        6
0.161        5
0.322        5
0.010        4
0.093        4
0.301        4
0.007        4
0.000        3
0.089        3
0.293        3
0.051        3
0.134        2
0.086        2
0.087        1
0.025        1
0.124        1
0.013        1
0.332        1
0.005        1
0.003        1
0.041        1
0.016        1
0.171        1
0.075        1
0.123        1
Name: MPI Regional, Length: 117, dtype: int64

In [18]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data["MPI Regional"])

LabelEncoder()

In [19]:
le.classes_

array([ 0.   ,  0.003,  0.004,  0.005,  0.006,  0.007,  0.008,  0.009,
        0.01 ,  0.011,  0.012,  0.013,  0.014,  0.016,  0.017,  0.02 ,
        0.021,  0.024,  0.025,  0.026,  0.027,  0.03 ,  0.031,  0.035,
        0.036,  0.038,  0.041,  0.042,  0.043,  0.046,  0.047,  0.048,
        0.05 ,  0.051,  0.062,  0.065,  0.067,  0.069,  0.071,  0.073,
        0.075,  0.076,  0.077,  0.081,  0.082,  0.086,  0.087,  0.089,
        0.09 ,  0.093,  0.094,  0.099,  0.1  ,  0.103,  0.105,  0.106,
        0.107,  0.108,  0.112,  0.113,  0.114,  0.12 ,  0.121,  0.123,
        0.124,  0.127,  0.13 ,  0.134,  0.145,  0.16 ,  0.161,  0.168,
        0.171,  0.174,  0.182,  0.183,  0.184,  0.187,  0.193,  0.195,
        0.2  ,  0.204,  0.205,  0.207,  0.211,  0.215,  0.221,  0.243,
        0.244,  0.254,  0.259,  0.267,  0.283,  0.292,  0.293,  0.301,
        0.302,  0.306,  0.311,  0.314,  0.316,  0.322,  0.325,  0.332,
        0.334,  0.357,  0.373,  0.379,  0.388,  0.41 ,  0.431,  0.449,
      

In [20]:
data["MPI Regional"] = le.transform(data["MPI Regional"])

In [21]:
data["MPI Regional"]

0        22
1        22
2        22
3        22
4        22
5        22
6        22
7        22
8        22
9        22
10       22
11       22
12       22
13       22
14       22
15       22
16       22
17       22
18       22
19       22
20       22
21       22
22       22
23       22
24       22
25       22
26       22
27       22
28       22
29       22
         ..
60128    84
60129    84
60130    84
60131    84
60132    84
60133    84
60134    84
60135    84
60136    84
60137    84
60138    84
60139    84
60140    84
60141    84
60142    84
60143    84
60144    84
60145    84
60146    84
60147    84
60148    84
60149    84
60150    84
60151    84
60152    95
60153    95
60154    95
60155    95
60156     6
60157    13
Name: MPI Regional, Length: 60158, dtype: int64

### Converting floats to ints

In [22]:
data['funded_amount'] = data["funded_amount"].astype(int)
data['loan_amount'] = data['loan_amount'].astype(int)
#data['MPI National'] = data['MPI National'].astype(int)
#data['MPI Regional'] = data['MPI Regional'].astype(int)
data['Headcount Ratio Regional'] = data['Headcount Ratio Regional'].astype(int)
#data['Intensity of deprivation Regional'] = data['Intensity of deprivation Regional'].astype(int)

In [75]:
sample_incomplete_rows = data[data.isnull().any(axis=1)]
sample_incomplete_rows

Unnamed: 0,funded_amount,loan_amount,MPI National,MPI Regional,Headcount Ratio Regional,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,...,"borrower_genders_male, male, male, male, male, male, male, male, male, male, male",repayment_interval_bullet,repayment_interval_irregular,repayment_interval_monthly,World region_Arab States,World region_East Asia and the Pacific,World region_Europe and Central Asia,World region_Latin America and Caribbean,World region_South Asia,World region_Sub-Saharan Africa


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Columns: 1281 entries, funded_amount to World region_Sub-Saharan Africa
dtypes: float64(1), int32(3), int64(1), uint8(1276)
memory usage: 77.8 MB


### Train Test Split

In [49]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=.5, random_state = 42)

In [50]:
train_x = train_set.drop("MPI Regional", axis = 1)
train_y = train_set["MPI Regional"]

In [28]:
test_x = test_set.drop("MPI Regional", axis = 1)
test_y = test_set["MPI Regional"]

In [26]:
train_y.shape

(30079,)

In [27]:
train_y.value_counts()

98     5000
21     2422
34     1992
16     1939
48     1173
62     1111
41     1059
42     1031
56     1028
61      931
14      890
74      747
15      669
22      649
69      601
19      563
55      531
28      528
81      510
25      403
51      357
86      344
30      327
23      305
54      296
7       277
4       267
31      215
66      212
78      211
       ... 
97       14
9        11
76       10
39        9
104       9
88        8
52        8
108       7
90        7
79        5
87        4
32        3
95        3
70        2
67        2
8         2
5         2
110       2
106       2
49        2
33        2
26        1
63        1
3         1
101       1
94        1
46        1
72        1
13        1
0         1
Name: MPI Regional, Length: 109, dtype: int64

## Random Forest Classifier

In [73]:
# from sklearn.model_selection import GridSearchCV#'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
# param_grid = [
# {'n_estimators': [400, 500, 600], 'max_features': [4, 6, 8]},
# ]
# forest_cla = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(forest_cla, param_grid, cv=5)#scoring='neg_mean_squared_error'
# grid_search.fit(train_x,train_y )
# print (grid_search.best_params_)

In [66]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=2, max_leaf_nodes=16, n_jobs=-1, class_weight = 'balanced', random_state = 42)
rnd_clf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=16, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [67]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(rnd_clf, train_x, train_y, cv=3)



In [68]:
from sklearn.metrics import precision_score, recall_score

In [69]:
precision_score(train_y, y_train_pred, average = 'weighted')

  'precision', 'predicted', average, warn_for)


0.95870355160190801

In [70]:
recall_score(train_y,y_train_pred, average = 'weighted')

0.94105522125070651

In [71]:
y_test_pred = cross_val_predict(rnd_clf, test_x, test_y, cv=3)
precision_score(test_y, y_test_pred, average = 'weighted')

  'precision', 'predicted', average, warn_for)


0.95282330210269284

In [72]:
recall_score(test_y,y_test_pred, average = 'weighted')

0.93118122278001259