In [1]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import graphlab as gl
import csv
import collections
import datetime
from graphlab.toolkits.model_parameter_search import model_parameter_search
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
# Point GraphLab Canvas at the 'ipynb' target (presumably so that
# visualisations render inside this notebook -- confirm against GraphLab docs).
gl.canvas.set_target('ipynb')

[INFO] Start server at: ipc:///tmp/graphlab_server-3320 - Server binary: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1425768281.log
[INFO] GraphLab Server Version: 1.3.0


In [2]:
# Open a MongoDB connection with the client's default settings and keep
# handles to the `kiva` database and the two collections used below.
client = MongoClient()
kiva = client['kiva']
mongo_loans = kiva['loans']
mongo_lenders_loans = kiva['lenders_loans']

## Feature Engineering
This iteration considers only text features, i.e. the TF-IDF representation of each loan's English description.

In [None]:
# Fetch the first 100000 lender documents ordered by lender_id (Mongo's _id
# field is excluded) and drop every row that has a missing value.
cursor_lenders_loans = (
    mongo_lenders_loans
    .find({}, {'_id': 0})
    .sort('lender_id', 1)
    .limit(100000)
)
sample_lenders_loans = pd.DataFrame(list(cursor_lenders_loans)).dropna()

In [None]:
# Flatten each lender's list of loan ids into one "lender_id,loan_id" CSV row
# per pair.
with open('data/lenders_loans_100000.csv', 'w') as f:
    wr = csv.writer(f, delimiter=',')
    lender_and_loans = zip(sample_lenders_loans['lender_id'],
                           sample_lenders_loans['loan_ids'])
    for lender_id, loan_ids in lender_and_loans:
        wr.writerows([lender_id, loan_id] for loan_id in loan_ids)

In [3]:
# Load the header-less pair file; GraphLab names the columns X1, X2.
sf = gl.SFrame.read_csv(
    'data/lenders_loans_100000.csv',
    header=False,
    delimiter=',',
    verbose=False
)
# rename() acts on `sf` itself (the sorted preview below shows the new names).
sf.rename({'X1': 'lender_id', 'X2': 'loan_id'})
# Preview only: the sorted result is displayed, not assigned back to `sf`.
sf.sort('loan_id')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/lenders_loans_100000.csv
PROGRESS: Parsing completed. Parsed 1218194 lines in 0.83129 secs.


lender_id,loan_id
NkInteractClub,280242
am3914,280741
ake8008,280758
alistair8793,280758
barbara4752,280766
adrian9794,280766
am3914,280766
aaron83574977,280766
alistair8793,280776
andrew4225,280781


In [5]:
# Text features use the English description only, so first discard every
# lender-loan pair whose loan has no English text stored in MongoDB.
feat_loan_id = sf['loan_id'].unique().sort()
ids = list(feat_loan_id)

# Ids (ascending) of the loans that do carry 'description.texts.en'.
temp_cursor = mongo_loans.find(
    {'id': {'$in': ids}, 'description.texts.en': {'$exists': True}},
    {'id': 1, '_id': 0}
)
loans_ids_exists = sorted(doc['id'] for doc in temp_cursor)

# Null out the loans without text, then drop those rows.
diff_ids = set(ids).difference(loans_ids_exists)
sf['loan_id'] = sf['loan_id'].apply(lambda x: None if x in diff_ids else x)
sf = sf.dropna('loan_id')

In [4]:
# Fetch id + English description for every retained loan, ordered by id.
# Depends on `loans_ids_exists` from the filtering cell directly above; running
# this cell before that one raises NameError.
c = mongo_loans.find(
    {'id': {'$in': loans_ids_exists}},
    {'id': 1, 'description.texts.en': 1, '_id': 0}
).sort('id', 1)
raw_features = pd.DataFrame(list(c))

NameError: name 'loans_ids_exists' is not defined

In [None]:
# define functions to lemmatize and vectorize text
def lemmatize_descriptions(descriptions):
    """Lemmatize every whitespace-separated token of each description.

    Parameters
    ----------
    descriptions : iterable of str
        Raw description texts.

    Returns
    -------
    list of str
        One string per input description, with each token replaced by its
        WordNet lemma and the tokens re-joined with single spaces.
    """
    lem = WordNetLemmatizer()
    lemmatized = []
    for desc in descriptions:
        lemmas = [lem.lemmatize(word) for word in desc.split()]
        lemmatized.append(" ".join(lemmas))
    return lemmatized

def get_vectorizer(descriptions, num_features=4000):
    """Fit a TF-IDF vectorizer on `descriptions` and return it.

    Parameters
    ----------
    descriptions : iterable of str
        Texts used to learn the vocabulary and IDF weights.
    num_features : int, optional
        Passed to the vectorizer as `max_features` (default 4000).

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer, configured with English stop words.
    """
    vectorizer = TfidfVectorizer(stop_words='english',
                                 max_features=num_features)
    vectorizer.fit(descriptions)
    return vectorizer

In [None]:
# Store the loan ids as strings.
raw_features['id'] = raw_features['id'].map(str)

In [None]:
# Replace each nested description document with its plain English text.
raw_features['description'] = [
    doc['texts']['en'] for doc in raw_features['description']
]

In [9]:
# Build the TF-IDF matrix: lemmatize the descriptions, fit the vectorizer on
# them, then transform the same texts into a dense DataFrame.
text = lemmatize_descriptions(raw_features['description'].values)
vectorizer = get_vectorizer(text)
tfidf = pd.DataFrame(vectorizer.transform(text).toarray())

In [10]:
# Attach the TF-IDF values as extra *columns*, one row per loan.
# DataFrame.append() would instead stack the matrix as new rows underneath the
# descriptions, leaving every loan without features. Both frames share the
# default 0..n-1 index, so a column-wise concat lines them up row by row.
# The integer column labels get a 'tfidf_' prefix so that every column name is
# a string (NOTE(review): GraphLab SFrames are expected to need string column
# names -- confirm).
raw_features = pd.concat([raw_features, tfidf.add_prefix('tfidf_')], axis=1)

In [18]:
# Display the assembled feature table.
raw_features

Unnamed: 0,description,id
0,Nancy is 28 years old. She is a single mother ...,280242
1,"Isabelita G. is from the village of Bayabo 2, ...",280741
2,"Jocelyn M. is from the village of Bayabo, Tumu...",280758
3,"Flora A. is from the village of Bayabo, Tumuin...",280766
4,"Julita V. is from the village of Bayabo, Tumui...",280776
5,"Rowena G. is from the village of Bayabo, Tumau...",280781
6,"Marites P. is from the village of Bayabo, Tumu...",280785
7,"Niger, who is single, has had his own business...",280872
8,This is a 34 years-old Abu Bakarr. He is marri...,281685
9,"Claro Q. is from the village of Amobocan,Cauay...",281858


## Train models

In [None]:
# Convert the pandas feature table into an SFrame; a DataFrame is built first
# because it is convenient to convert into side information.
feature_columns = raw_features.to_dict(orient='list')
loan_feature = gl.SFrame(feature_columns)

In [None]:
# Display the loan feature SFrame.
loan_feature

In [None]:
# Split the lender-loan pairs into train and test sets by lender
# (item_test_proportion=0.2).
train, test = gl.recommender.util.random_split_by_user(
    sf,
    user_id='lender_id',
    item_id='loan_id',
    item_test_proportion=0.2
)

In [None]:
# Train the model
# One ranking-factorization recommender is fitted for every combination of
# latent-factor count and regularisation strength, with the loan text features
# supplied as item side information. (The original cell had a bare
# `item_data=,`, which is a SyntaxError.)

# The side data has to be keyed like the interaction data: a column named
# 'loan_id' with the same type. `loan_feature` is built from a frame whose key
# column is 'id' and was cast to str, while `sf['loan_id']` was parsed as int.
# NOTE(review): confirm the matching-name/type requirement against the
# GraphLab docs.
if 'loan_id' not in loan_feature.column_names():
    loan_feature.rename({'id': 'loan_id'})  # in place, as with `sf` above
loan_feature['loan_id'] = loan_feature['loan_id'].astype(int)

models = []
regs = [0.1, 0.01, 0.001]
num_factors = range(2, 5)
for n in num_factors:
    for r in regs:
        m = gl.recommender.ranking_factorization_recommender.create(
            train,
            user_id='lender_id',
            item_id='loan_id',
            item_data=loan_feature,
            num_factors=n,
            regularization=r,
            binary_target=True,
            verbose=True
        )
        models.append(m)

In [4]:
# Load the loan/lender pair file, which has no header row.
loans_lenders_path = 'data/loans_lenders_100000.csv'
df = pd.read_csv(loans_lenders_path, header=None)

In [6]:
# Name the two header-less columns: loan id first, lender id second.
df.columns = ['loan_id', 'lender_id']

In [14]:
# Number of lenders per loan, largest counts first.
lenders_per_loan = df.groupby('loan_id').count()
group_by_loan = lenders_per_loan.sort('lender_id', ascending=False)
group_by_loan

Unnamed: 0_level_0,lender_id
loan_id,Unnamed: 1_level_1
340316,322
340449,318
345747,315
348881,311
379677,310
344229,309
300405,296
388284,296
389670,292
362482,291


In [58]:
# Average number of lenders per loan.
group_by_loan.lender_id.mean()

23.432178331946798

In [15]:
# Number of loans per lender, largest counts first.
loans_per_lender = df.groupby('lender_id').count()
group_by_lender = loans_per_lender.sort('loan_id', ascending=False)
group_by_lender

Unnamed: 0_level_0,loan_id
lender_id,Unnamed: 1_level_1
zx81,18398
trolltech4460,17699
gooddogg1,14497
jamesclayton9485,7463
stephen3863,4532
joinFITE,4265
barbara5610,4026
emofund,3189
don9212,3095
laurence5353,2786
