In [14]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import graphlab as gl
import csv
import collections
import datetime
from graphlab.toolkits.model_parameter_search import model_parameter_search
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
# Point GraphLab Canvas at the 'ipynb' target (presumably so visualizations
# render inside the notebook — confirm against the GraphLab Canvas docs).
gl.canvas.set_target('ipynb')

In [15]:
# Connect to MongoDB using MongoClient's default connection settings and keep
# handles to the `kiva` database and the two collections used below.
client = MongoClient()
kiva = client['kiva']
mongo_loans = kiva['loans']
mongo_lenders_loans = kiva['lenders_loans']

## Feature Engineering
This iteration considers only text features, i.e. the TF-IDF of each loan description.

In [None]:
# get lenders_loans
# Fetch lender documents ordered by lender_id (without Mongo's _id field) and
# load them into a DataFrame, dropping rows that contain any missing value.
# Fixed: `.limit` was referenced but never called, so `list(...)` below received
# a bound method instead of a cursor and raised TypeError.
# NOTE(review): the cap of 100000 is inferred from the output file name
# 'lenders_loans_100000.csv' — confirm the intended sample size.
cursor_lenders_loans = (
    mongo_lenders_loans.find({}, {'_id': 0})
    .sort('lender_id', 1)
    .limit(100000)
)
sample_lenders_loans = pd.DataFrame(list(cursor_lenders_loans))
sample_lenders_loans = sample_lenders_loans.dropna()

In [None]:
# create lender-loan pairs
# Flatten each lender's `loan_ids` list into one (lender_id, loan_id) CSV row
# per pair.
with open('data/lenders_loans_100000.csv', 'w') as out_file:
    pair_writer = csv.writer(out_file, delimiter=',')
    for _, lender_row in sample_lenders_loans.iterrows():
        lender = lender_row['lender_id']
        pair_writer.writerows([lender, loan] for loan in lender_row['loan_ids'])

In [16]:
# Load the header-less (lender_id, loan_id) pair file; the columns arrive as
# X1 / X2 and are given meaningful names.
sf = gl.SFrame.read_csv(
    'data/lenders_loans_100000.csv',
    header=False,
    delimiter=',',
    verbose=False
)
# The return value is discarded, so this relies on rename updating `sf` itself.
sf.rename({'X1': 'lender_id', 'X2': 'loan_id'})
# Shown as the cell output only; the sorted result is not assigned back to `sf`.
sf.sort('loan_id')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/lenders_loans_100000.csv
PROGRESS: Parsing completed. Parsed 1218194 lines in 0.781758 secs.


lender_id,loan_id
NkInteractClub,280242
am3914,280741
ake8008,280758
alistair8793,280758
barbara4752,280766
adrian9794,280766
am3914,280766
aaron83574977,280766
alistair8793,280776
andrew4225,280781


In [17]:
# Text features: only the English version of each loan description is used.
# First drop every lender-loan pair whose loan has no English description
# stored in MongoDB.
feat_loan_id = sf['loan_id'].unique().sort()
ids = list(feat_loan_id)
english_only = {'id': {'$in': ids}, 'description.texts.en': {'$exists': True}}
temp_cursor = mongo_loans.find(english_only, {'id': 1, '_id': 0})
loans_ids_exists = sorted(doc['id'] for doc in temp_cursor)
# Loans referenced by lenders but lacking an English description.
diff_ids = set(ids).difference(loans_ids_exists)
# Blank out those loan ids, then drop the blanked rows.
sf['loan_id'] = sf['loan_id'].apply(lambda loan: None if loan in diff_ids else loan)
sf = sf.dropna('loan_id')

In [29]:
# Fetch id + English description for every retained loan, ordered by id, and
# load the documents into a DataFrame.
loan_query = {'id': {'$in': loans_ids_exists}}
loan_fields = {'id': 1, 'description.texts.en': 1, '_id': 0}
c = mongo_loans.find(loan_query, loan_fields).sort('id', 1)
raw_features = pd.DataFrame(list(c))

In [30]:
# define functions to lemmatize and vectorize text
def lemmatize_descriptions(descriptions):
    """Lemmatize every whitespace-separated token of each description.

    Parameters
    ----------
    descriptions : iterable of str
        Raw description texts.

    Returns
    -------
    list of str
        One string per input description, with each token replaced by the
        result of ``WordNetLemmatizer.lemmatize`` and re-joined with single
        spaces.
    """
    lem = WordNetLemmatizer()
    lemmatized = []
    for desc in descriptions:
        words = [lem.lemmatize(word) for word in desc.split()]
        lemmatized.append(" ".join(words))
    return lemmatized

def get_vectorizer(descriptions, num_features=2000):
    """Fit a TF-IDF vectorizer on the given descriptions.

    Parameters
    ----------
    descriptions : iterable of str
        Texts used to fit the vectorizer.
    num_features : int, optional
        Passed to ``TfidfVectorizer`` as ``max_features`` (default 2000).

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer, configured with English stop words.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=num_features)
    vectorizer.fit(descriptions)
    return vectorizer

In [31]:
# Store loan ids as strings.
raw_features['id'] = raw_features['id'].map(str)

In [32]:
# Replace each nested description document with just its English text.
# Not re-runnable: after one execution the column holds plain strings.
english_text = raw_features['description'].map(lambda doc: doc['texts']['en'])
raw_features['description'] = english_text

In [33]:
# create tfidf features
# Lemmatize the descriptions, fit the vectorizer on them, then transform the
# same texts. `.toarray()` materializes a dense matrix, which can be slow and
# memory-hungry for many loans.
text = lemmatize_descriptions(raw_features['description'].values)
vectorizer = get_vectorizer(text)
tfidf = pd.DataFrame(vectorizer.transform(text).toarray())

KeyboardInterrupt: 

In [None]:
# Inspect the TF-IDF frame (one row per description passed to the vectorizer).
tfidf

In [10]:
# Attach the TF-IDF columns next to each loan's existing columns.
# Fixed: `DataFrame.append` stacks `tfidf` as extra ROWS under the loans, which
# leaves the feature columns empty for every real loan. Both frames were built
# in the same row order with a default 0..n-1 index, so a column-wise concat
# lines each TF-IDF row up with its loan.
# NOTE(review): the TF-IDF columns are labelled with integers; confirm that the
# later SFrame conversion accepts non-string column names.
raw_features = pd.concat([raw_features, tfidf], axis=1)

## Train models

In [None]:
# convert features into SFrame
# Go through a {column name: list of values} dict, which gl.SFrame accepts.
feature_columns = raw_features.to_dict(orient='list')
loan_feature = gl.SFrame(feature_columns)

In [None]:
# Inspect the item side-feature SFrame built from raw_features.
loan_feature

In [None]:
# split train test
# Per-user split of the lender-loan pairs; item_test_proportion=0.2 is the
# share of a user's items requested for the test set.
train, test = gl.recommender.util.random_split_by_user(
    sf,
    user_id='lender_id',
    item_id='loan_id',
    item_test_proportion=0.2
)

In [None]:
# Train the model
# Fit one ranking factorization recommender per (num_factors, regularization)
# combination and keep every fitted model in `models`.
# NOTE(review): the side-feature frame carries the loan key as a stringified
# 'id' column, while item_id here is 'loan_id' — confirm that the key column
# name and type match what the recommender expects before training.
models = []
regs = [0.1, 0.01, 0.001]
num_factors = range(2, 5)
for n in num_factors:
    for r in regs:
        m = gl.recommender.ranking_factorization_recommender.create(
            train,
            user_id='lender_id',
            item_id='loan_id',
            # Fixed: the side features are stored in `loan_feature`; the
            # previous `loan_features` was never defined (NameError).
            item_data=loan_feature,
            num_factors=n,
            regularization=r,
            binary_target=True,
            verbose=True
        )
        models.append(m)

In [4]:
# Load a header-less CSV of loan/lender pairs.
# NOTE(review): this reads 'loans_lenders_100000.csv', not the
# 'lenders_loans_100000.csv' file written earlier in this notebook — confirm
# which file is intended and where it is produced.
df = pd.read_csv('data/loans_lenders_100000.csv', header=None)

In [6]:
# Name the two columns; assumes the file's column order is
# (loan_id, lender_id) — TODO confirm against the file.
df.columns = ['loan_id', 'lender_id']

In [14]:
# Count lender rows per loan and list the loans with the most rows first.
# (`DataFrame.sort` is the legacy pandas spelling of `sort_values`.)
loan_counts = df.groupby('loan_id').count()
group_by_loan = loan_counts.sort('lender_id', ascending=False)
group_by_loan

Unnamed: 0_level_0,lender_id
loan_id,Unnamed: 1_level_1
340316,322
340449,318
345747,315
348881,311
379677,310
344229,309
300405,296
388284,296
389670,292
362482,291


In [58]:
# Mean of the per-loan lender-row counts held in group_by_loan.
group_by_loan['lender_id'].mean()

23.432178331946798

In [15]:
# Count loan rows per lender and list the lenders with the most rows first.
# (`DataFrame.sort` is the legacy pandas spelling of `sort_values`.)
lender_counts = df.groupby('lender_id').count()
group_by_lender = lender_counts.sort('loan_id', ascending=False)
group_by_lender

Unnamed: 0_level_0,loan_id
lender_id,Unnamed: 1_level_1
zx81,18398
trolltech4460,17699
gooddogg1,14497
jamesclayton9485,7463
stephen3863,4532
joinFITE,4265
barbara5610,4026
emofund,3189
don9212,3095
laurence5353,2786


In [37]:
# Number of lender-loan pairs per lender in the SFrame, largest first; prints
# up to 10000 rows.
pairs_per_lender = sf.groupby(key_columns='lender_id',
                              operations={'count': gl.aggregate.COUNT()})
pairs_per_lender.sort('count', ascending=False).print_rows(num_rows=10000)

+--------------------------+-------+
|        lender_id         | count |
+--------------------------+-------+
|       barbara5610        | 20779 |
|       amirali5409        | 17871 |
|      aaron83574977       | 10201 |
|        barrie7327        |  8345 |
|      andrewhoffman       |  6976 |
|        andrew5306        |  5694 |
|          JY1024          |  5196 |
|        anon23456         |  5106 |
|         ali9555          |  4912 |
|      FrequentMiler       |  4887 |
|          123321          |  4358 |
|       Oliver10303        |  4195 |
|         amin3839         |  4117 |
|         alan5513         |  4089 |
|        angela7509        |  4026 |
|          anuzis          |  3903 |
|    barryanddenise6754    |  3783 |
|       anthony5914        |  3344 |
|          94704           |  3231 |
|         andy1021         |  3199 |
|           aisk           |  2789 |
|  EntrepreneursForAfrica  |  2740 |
|           WHYu           |  2628 |
|        3beditions        |  2619 |
|