In [11]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import graphlab as gl
import csv
import collections
import datetime
from graphlab.toolkits.model_parameter_search import model_parameter_search
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
gl.canvas.set_target('ipynb')

In [2]:
client = MongoClient()
kiva = client.kiva
mongo_loans = kiva.loans
mongo_loans_lenders = kiva.loans_lenders

##Feature Engineering
This iteration will consider only text features, i.e. Tfidf of loan description.

In [3]:
sf = gl.SFrame.read_csv('data/loans_lenders_500000.csv', header=False, delimiter=',', verbose=False)
sf['X1'] = sf['X1'].astype(str)
sf.rename({'X1':'loan_id', 'X2':'lender_id'}).show()

[INFO] Start server at: ipc:///tmp/graphlab_server-6873 - Server binary: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1425600841.log
[INFO] GraphLab Server Version: 1.3.0


------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Read 2746869 lines. Lines per second: 760057
PROGRESS: Finished parsing file /Users/yulizhou/Documents/Projects/zipfian/Kipfian/data/loans_lenders_500000.csv
PROGRESS: Parsing completed. Parsed 11679811 lines in 8.10371 secs.


<IPython.core.display.Javascript object>

In [10]:
# start getting text
# we only use english version
# get loans_lenders first
cursor_loans_lenders = mongo_loans_lenders.find({}, {'id': 1, '_id':0}).sort('id', 1).limit(500000)
df = pd.DataFrame(list(cursor_loans_lenders))
df.dropna(inplace=True)

ids = list(df['id'].values)
c=mongo_loans.find({'id': {'$in': ids}}, 
                   {'description.texts.en': 1, '_id':0})\
                    .sort('id', 1)
df['features'] = pd.Series(list(c), index=df.index)

ValueError: Wrong number of items passed 499783, placement implies 500000

In [15]:
len(ids)
c=mongo_loans.find({'id': {'$in': ids}}, 
                   {'description.texts.en': 1, '_id':0})\
                    .sort('id', 1)
data = list(c)

In [30]:
s = 0
for d in data:
    if 'en' not in d['description']['texts'] or not d['description']['texts']['en']:
        s += 1
s

31666

In [None]:
def lemmatize_descriptions(descriptions):
    lem = WordNetLemmatizer()
    lemmatize = lambda d: " ".join(lem.lemmatize(word) for word in d.split())
    return [lemmatize(desc) for desc in descriptions]


def get_vectorizer(descriptions, num_features=5000):
    vect = TfidfVectorizer(max_features=num_features, stop_words='english')
    return vect.fit(descriptions)

In [32]:
# start getting features
cursor_loans_lenders = mongo_loans_lenders.find({}, {'id': 1, '_id':0}).sort('id', 1).limit(500000)
df = pd.DataFrame(list(cursor_loans_lenders))
df.dropna(inplace=True)
ids = list(df['id'].values)
c=mongo_loans.find({'id': {'$in': ids}}, 
                   {'id': 1, 'activity': 1, 'loan_amount': 1, 'sector': 1, 'location.country':1, 'posted_date': 1, '_id':0})\
                    .sort('id', 1)
df['features'] = pd.Series(list(c), index=df.index)
# get activity
df['activity'] = df['features'].map(lambda x: x['activity'])
# get sector
df['sector'] = df['features'].map(lambda x: x['sector'])
# get loan_amount
df['loan_amount'] = df['features'].map(lambda x: x['loan_amount'])
# get country
df['country'] = df['features'].map(lambda x: x['location']['country'])
# get posted_date
df['date'] = df['features'].map(lambda x: pd.to_datetime(x['posted_date'], format='%Y-%m-%dT%H:%M:%SZ'))
df['day_of_year'] = df['date'].map(lambda x: x.timetuple().tm_yday)
# drop unused columns
df = df.drop(['features', 'date'], axis=1)
# convert features into SFrame
loan_feature = gl.SFrame({'loan_id': ids, 
                          'activity': list(df['activity']), 
                          'sector': list(df['sector']), 
                          'loan_amount': list(df['loan_amount']), 
                          'country': list(df['country']), 
                          'day_of_year': list(df['day_of_year'])})
# split train test
train, test = gl.recommender.util.random_split_by_user(sf, user_id='lender_id', item_id='loan_id', item_test_proportion=0.2)
# train models
job = model_parameter_search(gl.recommender.ranking_factorization_recommender.create, 
                             training_set=train,
                             validation_set=test,
                             user_id='lender_id',
                             item_id='loan_id',
                             item_data=loan_feature,
                             num_factors=[1],
                             regularization=[0.1], 
                             binary_target=True, 
                             verbose=True)
job_result = job.get_results()

ValueError: Wrong number of items passed 499783, placement implies 500000