In [2]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import graphlab as gl
import csv
# Point GraphLab Canvas at the 'ipynb' target so `.show()` output is sent to the notebook.
gl.canvas.set_target('ipynb')

In [3]:
# Connect with pymongo's default client settings and select the `kiva` database.
client = MongoClient()
kiva = client['kiva']

## Load data and features

In [4]:
# Sample loan -> lender documents from the `loans_lenders` collection.
loans_lenders = kiva['loans_lenders']
cursor_loans_lenders = loans_lenders.find(
    {'rnd': {'$lt': 0.309}},  # filter: keep documents whose `rnd` field is below 0.309
    {'_id': 0, 'rnd': 0},     # projection: leave `_id` and `rnd` out of the results
).limit(100000)               # cap the sample at 100,000 documents

# Drain the cursor into a list of dicts so pandas can build the frame in one go.
sample_loans_lenders = pd.DataFrame([doc for doc in cursor_loans_lenders])

KeyboardInterrupt: 

In [None]:
# Discard every row that has a missing value in any column.
sample_loans_lenders = sample_loans_lenders.dropna(axis=0, how='any')

In [None]:
# Print pandas' summary of the sample frame as a quick sanity check after dropping missing rows.
sample_loans_lenders.info()

In [None]:
# Flatten each loan's `lender_ids` list into one CSV row per (loan, lender) pair.
# The third column is the constant 1, marking that the lender funded the loan.
with open('data/loan_lender_100000.csv', 'w') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for _row_label, loan in sample_loans_lenders.iterrows():
        loan_id = loan['id']
        writer.writerows(
            [loan_id, lender_id, 1] for lender_id in loan['lender_ids']
        )

In [5]:
# Read the flattened pairs back as an SFrame. The file is read with header=False,
# and the X1..X3 columns are renamed before the Canvas preview is shown.
sf = gl.SFrame.read_csv(
    'data/loan_lender_100000.csv',
    header=False,
    delimiter=',',
    verbose=True
)
sf.rename({
    'X1': 'loan_id',
    'X2': 'lender_id',
    'X3': 'lended',
}).show()

[INFO] Start server at: ipc:///tmp/graphlab_server-2363 - Server binary: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1425504474.log
[INFO] GraphLab Server Version: 1.3.0


PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/loan_lender_100000.csv
PROGRESS: Parsing completed. Parsed 100 lines in 1.50567 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/loan_lender_100000.csv
PROGRESS: Parsing completed. Parsed 2295765 lines in 1.66516 secs.


<IPython.core.display.Javascript object>

In [6]:
# Split the interactions into train / test sets, grouping by lender (the "user")
# and holding out loans (the "items") with item_test_proportion=0.2.
# NOTE(review): the logged counts (2295765 rows parsed, 2294839 rows in training)
# imply the test set holds only ~926 rows -- presumably the helper samples a
# limited number of users by default; confirm against the GraphLab docs.
train, test = gl.recommender.util.random_split_by_user(
    sf,
    user_id='lender_id',
    item_id='loan_id',
    item_test_proportion=0.2
)

### Model selection
"If your data is implicit, i.e., you only observe interactions between users and items, without a rating, then use item_similarity_recommender with Jaccard similarity (default) or the ranking_factorization_recommender." [link](https://dato.com/learn/userguide/index.html#Modeling_data_Recommender_systems)

In [7]:
# `models` collects every fitted recommender; it is reset here so the baseline
# trained below always ends up at index 0.
models = []

# Baseline: item-similarity recommender fitted on the training split.
m_iter0 = gl.recommender.item_similarity_recommender.create(
    train,
    user_id='lender_id',
    item_id='loan_id',
    target='lended'
)
models += [m_iter0]

PROGRESS: Recsys training: model = item_similarity
PROGRESS: Preparing data set.
PROGRESS:     Data has 2294839 observations with 461730 users and 99491 items.
PROGRESS:     Data prepared in: 2.35078s
PROGRESS: Processing 2294839 observations:
PROGRESS:    Observations   Seconds
PROGRESS:         1000000   40.3618
PROGRESS:         2000000    83.933
PROGRESS:         2294839   96.0254
PROGRESS: Computing similar items for 99491 items using 8 partitions:
PROGRESS:  Partition     Items   Seconds
PROGRESS:          0     12446   97.2406
PROGRESS:          1     24883   98.6759
PROGRESS:          2     37320   100.197
PROGRESS:          3     49756    101.85
PROGRESS:          4     62185   103.294
PROGRESS:          5     74616   104.686
PROGRESS:          6     87052   106.122
PROGRESS:          7     99491   107.643
PROGRESS: Finished getting similar items for 99491 items in 107.644s.
PROGRESS: Finished training in 112.554s
PROGRESS: Finished prediction in 4.86731s


In [9]:
# Try factorization: fit one ranking-factorization recommender per L2
# regularization value and add each to `models` after the baseline.
regs = [0.1, 0.01, 0.001, 0.0001]

# Keep only the first entry of `models` (the item-similarity baseline) before
# appending. Without this, re-running the cell stacks four more models onto the
# list each time, which shifts the "MODEL i" indices printed by later cells.
del models[1:]

for r in regs:
    m = gl.recommender.ranking_factorization_recommender.create(
        train,
        user_id='lender_id',
        item_id='loan_id',
        target='lended',
        regularization=r,
        binary_target=True
    )
    models.append(m)

PROGRESS: Recsys training: model = ranking_factorization_recommender
PROGRESS: Preparing data set.
PROGRESS:     Data has 2294839 observations with 461730 users and 99491 items.
PROGRESS:     Data prepared in: 2.26169s
PROGRESS: Training ranking_factorization_recommender for recommendations.
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | Parameter                      | Description                                      | Value    |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | num_factors                    | Factor Dimension                                 | 32       |
PROGRESS: | regularization                 | L2 Regularization on Factors                     | 0.1      |
PROGRESS: | solver                         | Solver used for training                         | sgd      |
PROGRESS: | linear_regularization          | L2 Regularization on

In [10]:
# Evaluate every fitted model on the held-out split, printing a separator and
# the model's position in `models` before GraphLab's RMSE report.
# NOTE(review): every `lended` value written to the CSV is the constant 1, so
# RMSE here largely reflects each model's score scale rather than ranking
# quality -- consider a ranking metric as well; confirm with the author.
separator = '=' * 100
for i, m in enumerate(models):
    print(separator)
    print('MODEL  %d' % i)
    m.evaluate(test, metric='rmse')

MODEL  0
PROGRESS: Finished prediction in 0.32626s

Overall RMSE:  37.1190924049

Per User RMSE (best)
+-----------+-------+------+
| lender_id | count | rmse |
+-----------+-------+------+
|  ann5966  |   1   | 0.0  |
+-----------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+------------+-------+-------+
| lender_id  | count |  rmse |
+------------+-------+-------+
| bonnie5150 |   1   | 449.0 |
+------------+-------+-------+
[1 rows x 3 columns]


Per Item RMSE (best)
+---------+-------+------+
| loan_id | count | rmse |
+---------+-------+------+
|  588721 |   1   | 0.0  |
+---------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+---------+-------+-------+
| loan_id | count |  rmse |
+---------+-------+-------+
|  429961 |   1   | 449.0 |
+---------+-------+-------+
[1 rows x 3 columns]

MODEL  1

Overall RMSE:  0.498841606832

Per User RMSE (best)
+-------------+-------+----------------+
|  lender_id  | count |      rmse      |
+-------------+-------

In [11]:
# Compare the recommendations each model produces for one fixed lender,
# so the outputs can be compared model by model.
separator = '=' * 100
for i, m in enumerate(models):
    print(separator)
    print('MODEL  %d' % i)
    print(m.recommend(users=['andrew9629']))

MODEL  0
+------------+---------+---------------+------+
| lender_id  | loan_id |     score     | rank |
+------------+---------+---------------+------+
| andrew9629 |  557391 | 1.01470588235 |  1   |
| andrew9629 |  713925 | 1.01041666667 |  2   |
| andrew9629 |  440534 | 1.00729927007 |  3   |
| andrew9629 |  566724 |      1.0      |  4   |
| andrew9629 |  446847 |      1.0      |  5   |
| andrew9629 |  718438 |      1.0      |  6   |
| andrew9629 |  377780 |      1.0      |  7   |
| andrew9629 |  531239 |      1.0      |  8   |
| andrew9629 |  583313 |      1.0      |  9   |
| andrew9629 |  794188 |      1.0      |  10  |
+------------+---------+---------------+------+
[10 rows x 4 columns]

MODEL  1
+------------+---------+----------------+------+
| lender_id  | loan_id |     score      | rank |
+------------+---------+----------------+------+
| andrew9629 |  634057 | 0.508306141745 |  1   |
| andrew9629 |  798219 | 0.504968842912 |  2   |
| andrew9629 |  812994 | 0.503862603415 | 