In [1]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import graphlab as gl
import csv
import collections
import datetime
from graphlab.toolkits.model_parameter_search import model_parameter_search
gl.canvas.set_target('ipynb')

In [2]:
client = MongoClient()
kiva = client.kiva
mongo_loans = kiva.loans
mongo_lenders_loans = kiva.lenders_loans

##Feature Engineering
This iteration will take into account several simple features: **activity, loan_amount, country, posted_date, sector**.

###Create lender loan matrix

In [4]:
# get loans_lenders
cursor_lenders_loans = mongo_lenders_loans.find({}, {'_id':0}).sort('lender_id', 1).limit(100000)
sample_lenders_loans = pd.DataFrame(list(cursor_lenders_loans))
sample_lenders_loans.dropna(inplace=True)

In [94]:
# create lender-loan pairs
with open('data/lenders_loans_100000.csv', 'w') as f:
    wr = csv.writer(f, delimiter=',')
    for r in sample_lenders_loans.iterrows():
        for l in r[1]['loan_ids']:
            wr.writerow([r[1]['lender_id'], l])

In [3]:
sf = gl.SFrame.read_csv('data/lenders_loans_100000.csv', header=False, delimiter=',', verbose=False)
sf.rename({'X1':'lender_id', 'X2':'loan_id'})
sf.sort('loan_id')

[INFO] Start server at: ipc:///tmp/graphlab_server-2743 - Server binary: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1425765020.log
[INFO] GraphLab Server Version: 1.3.0


------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/lenders_loans_100000.csv
PROGRESS: Parsing completed. Parsed 1218194 lines in 0.80929 secs.


lender_id,loan_id
NkInteractClub,280242
am3914,280741
ake8008,280758
alistair8793,280758
barbara4752,280766
adrian9794,280766
am3914,280766
aaron83574977,280766
alistair8793,280776
andrew4225,280781


###Create side features

In [4]:
# start getting features

# get rid of loans in lender_loans but not in loans
feat_loan_id = sf['loan_id'].unique().sort()
ids = list(feat_loan_id)
temp_cursor = mongo_loans.find({'id': {'$in': ids}}, {'id': 1, '_id': 0})
loans_ids_exists = list(temp_cursor)
loans_ids_exists = sorted([l['id'] for l in loans_ids_exists])
diff_ids = set(ids) - set(loans_ids_exists)
sf['loan_id'] = sf['loan_id'].apply(lambda x: x if x not in diff_ids else None)
sf = sf.dropna('loan_id')

In [5]:
c=mongo_loans.find({'id': {'$in': loans_ids_exists}}, 
                   {'id': 1, 'activity': 1, 'loan_amount': 1, 'sector': 1, 'location.country':1, 'posted_date': 1, '_id':0})\
                    .sort('id', 1)
raw_features = gl.SFrame(list(c))

In [6]:
raw_features['loan_id'] = raw_features.apply(lambda x: x['X1']['id']).astype(str)

In [7]:
raw_features['activity'] = raw_features.apply(lambda x: x['X1']['activity'])

In [8]:
# get sector
raw_features['sector'] = raw_features.apply(lambda x: x['X1']['sector'])

In [9]:
# get loan_amount
raw_features['loan_amount'] = raw_features.apply(lambda x: x['X1']['loan_amount'])

In [10]:
# get country
raw_features['country'] = raw_features.apply(lambda x: x['X1']['location']['country'])

In [11]:
# get posted_date
raw_features['day_of_year'] = raw_features.apply(
    lambda x: pd.to_datetime(x['X1']['posted_date'], format='%Y-%m-%dT%H:%M:%SZ').timetuple().tm_yday)

In [12]:
# drop unused columns
raw_features.remove_column('X1')

loan_id,activity,sector,loan_amount,country,day_of_year
280242,Services,Services,725,Peru,102
280741,General Store,Retail,350,Philippines,110
280758,Livestock,Agriculture,600,Philippines,110
280766,General Store,Retail,575,Philippines,110
280776,Poultry,Agriculture,400,Philippines,110
280781,Fish Selling,Food,475,Philippines,110
280785,Food Production/Sales,Food,350,Philippines,110
280872,Fish Selling,Food,700,Nicaragua,103
281685,Pharmacy,Health,700,Sierra Leone,112
281858,General Store,Retail,2700,Philippines,110


## Train the model

In [13]:
# convert features into SFrame
loan_feature = gl.SFrame({'loan_id': list(raw_features['loan_id']), 
                          'activity': list(raw_features['activity']), 
                          'sector': list(raw_features['sector']), 
                          'loan_amount': list(raw_features['loan_amount']), 
                          'country': list(raw_features['country']), 
                          'day_of_year': list(raw_features['day_of_year'])})

In [14]:
loan_feature

activity,country,day_of_year,loan_amount,loan_id,sector
Services,Peru,102,725,280242,Services
General Store,Philippines,110,350,280741,Retail
Livestock,Philippines,110,600,280758,Agriculture
General Store,Philippines,110,575,280766,Retail
Poultry,Philippines,110,400,280776,Agriculture
Fish Selling,Philippines,110,475,280781,Food
Food Production/Sales,Philippines,110,350,280785,Food
Fish Selling,Nicaragua,103,700,280872,Food
Pharmacy,Sierra Leone,112,700,281685,Health
General Store,Philippines,110,2700,281858,Retail


In [15]:
# split train test
train, test = gl.recommender.util.random_split_by_user(sf, user_id='lender_id', item_id='loan_id', item_test_proportion=0.2)

In [16]:
# Train the model
models = []
regs = [0.1, 0.01, 0.001]
num_factors = range(2, 5)
for n in num_factors:
    for r in regs:
        m = gl.recommender.ranking_factorization_recommender.create(train,
                                                                    user_id='lender_id', 
                                                                    item_id='loan_id',
                                                                    item_data=loan_feature,
                                                                    num_factors=n,
                                                                    regularization=r, 
                                                                    binary_target=True, 
                                                                    verbose=True)
        models.append(m)

PROGRESS: Recsys training: model = ranking_factorization_recommender
PROGRESS: Preparing data set.
PROGRESS:     Data has 1214724 observations with 86321 users and 803205 items.
PROGRESS:     Data prepared in: 1.68393s
PROGRESS: Training ranking_factorization_recommender for recommendations.
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | Parameter                      | Description                                      | Value    |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | num_factors                    | Factor Dimension                                 | 2        |
PROGRESS: | regularization                 | L2 Regularization on Factors                     | 0.1      |
PROGRESS: | solver                         | Solver used for training                         | adagrad  |
PROGRESS: | linear_regularization          | L2 Regularization on

In [17]:
# compare models
for i, m in enumerate(models):
    print '='*100
    print 'MODEL ', i
    m.evaluate(test, metric='precision_recall')

MODEL  0

Precision and recall summary statistics by cutoff
+--------+-------------------+------------------+
| cutoff |   mean_precision  |   mean_recall    |
+--------+-------------------+------------------+
|   5    |  0.00111317254174 | 0.00126998851489 |
|   10   | 0.000556586270872 | 0.00126998851489 |
|   15   | 0.000371057513915 | 0.00126998851489 |
+--------+-------------------+------------------+
[3 rows x 3 columns]

MODEL  1

Precision and recall summary statistics by cutoff
+--------+-------------------+------------------+
| cutoff |   mean_precision  |   mean_recall    |
+--------+-------------------+------------------+
|   5    |  0.00111317254174 | 0.00126998851489 |
|   10   | 0.000556586270872 | 0.00126998851489 |
|   15   | 0.000371057513915 | 0.00126998851489 |
+--------+-------------------+------------------+
[3 rows x 3 columns]

MODEL  2

Precision and recall summary statistics by cutoff
+--------+-------------------+------------------+
| cutoff |   mean_precisio

In [23]:
train.groupby(key_columns='lender_id', operations={'count': gl.aggregate.COUNT()}).sort('count', ascending=False)

lender_id,count
barbara5610,20782
amirali5409,17872
aaron83574977,10202
barrie7327,8346
andrewhoffman,6976
andrew5306,5694
JY1024,5196
anon23456,5106
ali9555,4912
FrequentMiler,4887
