In [35]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "svm"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [36]:
kiva = pd.read_csv("C:/kiva_loans.csv")
mpi = pd.read_csv("C:/MPI_subnational.csv")

In [37]:
kiva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666977 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

In [38]:
kiva.describe()

Unnamed: 0,id,funded_amount,loan_amount,partner_id,term_in_months,lender_count
count,671205.0,671205.0,671205.0,657698.0,671205.0,671205.0
mean,993248.6,785.995061,842.397107,178.199616,13.739022,20.590922
std,196611.3,1130.398941,1198.660073,94.247581,8.598919,28.459551
min,653047.0,0.0,25.0,9.0,1.0,0.0
25%,823072.0,250.0,275.0,126.0,8.0,7.0
50%,992780.0,450.0,500.0,145.0,13.0,13.0
75%,1163653.0,900.0,1000.0,204.0,14.0,24.0
max,1340339.0,100000.0,100000.0,536.0,158.0,2986.0


In [39]:
mpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
ISO country code                     984 non-null object
Country                              984 non-null object
Sub-national region                  984 non-null object
World region                         984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


In [40]:
mpi.rename(columns = {'Sub-national region':'region'}, inplace = True)

In [41]:
combo = pd.merge(kiva,mpi)

In [42]:
combo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Data columns (total 27 columns):
id                                   60158 non-null int64
funded_amount                        60158 non-null float64
loan_amount                          60158 non-null float64
activity                             60158 non-null object
sector                               60158 non-null object
use                                  60157 non-null object
country_code                         60158 non-null object
country                              60158 non-null object
region                               60158 non-null object
currency                             60158 non-null object
partner_id                           60158 non-null float64
posted_time                          60158 non-null object
disbursed_time                       60158 non-null object
funded_time                          55350 non-null object
term_in_months                       60158 non-null float64
len

In [43]:
#combo.to_csv("combo.csv", sep=',')

In [44]:
corr_matrix = combo.corr()

In [45]:
corr_matrix["MPI Regional"].sort_values(ascending=False)

MPI Regional                         1.000000
Headcount Ratio Regional             0.988552
Intensity of deprivation Regional    0.920503
MPI National                         0.803208
partner_id                           0.373586
id                                   0.077235
term_in_months                       0.048235
lender_count                        -0.169813
funded_amount                       -0.194878
loan_amount                         -0.197346
Name: MPI Regional, dtype: float64

In [46]:
combo["funded_amount"].equals(combo["loan_amount"]) #change one of them to int and try again!!!

False

In [50]:
combo = combo.drop(["date", "posted_time", "disbursed_time", "term_in_months", "use", "country_code", "partner_id", "funded_time",
            "currency", "lender_count", "tags", "date", "ISO country code", "Country", "id"], axis = 1)

In [51]:
data = pd.get_dummies(combo, columns = ["activity", "sector", "country", "region", "borrower_genders", "repayment_interval", "World region"])

In [64]:
data.info()# what is uint8?

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60158 entries, 0 to 60157
Columns: 1282 entries, funded_amount to World region_Sub-Saharan Africa
dtypes: float64(6), uint8(1276)
memory usage: 76.4 MB


In [55]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=.6, random_state = 42)

In [61]:
train_x = train_set.drop("MPI Regional", axis = 1)
train_y = train_set["MPI Regional"]

In [62]:
test_x = test_set.drop("MPI Regional", axis = 1)
test_y = test_set["MPI Regional"]

In [56]:
#how do I do multiclass?

In [63]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(train_x,train_y)

ValueError: Unknown label type: 'continuous'