In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphlab as gl
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import NMF
import string
%matplotlib inline

In [22]:
# Load the raw loan records; the path is relative to the notebook's working directory.
loans = pd.read_csv('data/loans.csv')

In [50]:
# Keep only the columns used for the topic analysis.
# NOTE(review): this cell was originally commented as "pick loans which have
# over 400 lenders, since the lowest amount to lend is $25, 400*25=$10000",
# but no lender filter is applied in this cell and the filter applied later
# is lender_count > 100 -- confirm which threshold is intended.
df = loans[['sector', 'descriptions', 'use', 'lender_count', 'loan_amount']]
# Drop rows whose 'sector' value is the literal string 'sector'
# (presumably header lines repeated inside the CSV -- TODO confirm).
df = df[df['sector']!='sector']

In [51]:
# Drop incomplete rows before casting: astype(int) raises on NaN, so casting
# first could fail on rows that are about to be discarded anyway.
df = df.dropna()
# assign() builds a new frame instead of writing into a slice of `loans`,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
df = df.assign(
    lender_count=df['lender_count'].astype(int),
    loan_amount=df['loan_amount'].astype(int)
)
# Keep only loans funded by more than 100 lenders.
df = df[df['lender_count'] > 100]

In [53]:
# Summarise the filtered frame: number of rows, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18237 entries, 156 to 842804
Data columns (total 5 columns):
sector          18237 non-null object
descriptions    18237 non-null object
use             18237 non-null object
lender_count    18237 non-null int64
loan_amount     18237 non-null int64
dtypes: int64(2), object(3)
memory usage: 854.9+ KB


In [59]:
# Shared across calls: the vectorizer invokes tokenize() once per document,
# so rebuilding the stemmer and the punctuation set each time is wasted work.
_PUNCTUATION = frozenset(string.punctuation)
_STEMMERS = {}


def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Lowercase the document, split it with nltk's word_tokenize, drop every
    token that is exactly one punctuation character, and return the English
    Snowball stem of each remaining token (multi-character tokens such as
    "'s" are kept).
    '''
    snowball = _STEMMERS.get('english')
    if snowball is None:
        # Built lazily on first use and then reused for every later document.
        snowball = _STEMMERS['english'] = SnowballStemmer('english')
    return [snowball.stem(word)
            for word in word_tokenize(doc.lower())
            if word not in _PUNCTUATION]

def get_vectorizer(descriptions, num_features=2000):
    '''
    INPUT: iterable of strings, int
    OUTPUT: whatever TfidfVectorizer.fit returns for the given documents

    Build a TfidfVectorizer limited to num_features terms, using the
    'english' stop word list and the tokenize function, and fit it on
    descriptions.
    '''
    tfidf_options = {
        'max_features': num_features,
        'stop_words': 'english',
        'tokenizer': tokenize,
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    fitted = vectorizer.fit(descriptions)
    return fitted

## Description topics

In [116]:
# create tfidf of descriptions, ordered from most to fewest lenders so that
# the first rows of the matrix belong to the most widely funded loans
# (sort_values replaces DataFrame.sort, which pandas deprecated and later removed)
desc = df.sort_values('lender_count', ascending=False)['descriptions'].values
vec_desc = get_vectorizer(desc)
# .toarray() turns the sparse tf-idf matrix into a dense array
tfidf_desc = vec_desc.transform(desc).toarray()

In [137]:
# Display the tf-idf matrix in its sparse form to check its shape and number of stored values.
vec_desc.transform(desc)

<18237x2000 sparse matrix of type '<type 'numpy.float64'>'
	with 1116292 stored elements in Compressed Sparse Row format>

In [153]:
# run NMF and get topics
topic_words = []
# random_state pins any randomness in the factorisation so the topic order
# is the same on every run; the hand-written topic labels further down are
# matched to topics by index and would silently go stale otherwise.
# NOTE(review): after changing the seed, re-check those labels against the
# printed topics.
nmf_desc = NMF(n_components=15, random_state=0).fit(tfidf_desc)
# column j of the tf-idf matrix corresponds to feature_names[j]
feature_names = vec_desc.get_feature_names()

In [154]:
num_top_words = 25
# Rebuild the list from scratch instead of appending to a list created in
# another cell, so re-running this cell never duplicates topics.
# np.argsort is ascending: reverse it and keep the first num_top_words
# indices to get the highest-weighted terms of each NMF component.
topic_words = [
    [feature_names[i] for i in np.argsort(topic)[::-1][0:num_top_words]]
    for topic in nmf_desc.components_
]

In [155]:
# Display the top words of every description topic (one inner list per NMF component).
topic_words

[[u'busi',
  u'hope',
  u'shop',
  u'profit',
  u'open',
  u'custom',
  u'week',
  u'good',
  u'receiv',
  u'loan',
  u'oper',
  u'age',
  u'year',
  u'expand',
  u'develop',
  u'say',
  u'rdc',
  u'kinshasa',
  u'retail',
  u'store',
  u'manag',
  u'unit',
  u'mfi',
  u'start',
  u'run'],
 [u'pro',
  u'mujer',
  u'sale',
  u'sell',
  u'member',
  u'like',
  u'busi',
  u'invit',
  u'board',
  u'communal',
  u'cycl',
  u'institut',
  u'join',
  u'ask',
  u'benefit',
  u'bank',
  u'say',
  u'director',
  u'blanket',
  u'presid',
  u'capit',
  u'alto',
  u'generat',
  u'ago',
  u'allow'],
 [u'acr',
  u'farmer',
  u'fund',
  u'harvest',
  u'group',
  u'seed',
  u'repres',
  u'leader',
  u'fertil',
  u'\xbc',
  u'farm',
  u'season',
  u'kiva',
  u'sorghum',
  u'save',
  u'cost',
  u'millet',
  u'maiz',
  u'communiti',
  u'support',
  u'pictur',
  u'photo',
  u'profit',
  u'price',
  u'plant'],
 [u'loan',
  u'tanzania',
  u'tujijeng',
  u'hold',
  u'account',
  u'pay',
  u'abl',
  u'share',


In [156]:
# Which topics do the top loans belong to?
# Project every description onto the NMF topics, then keep the first 20 rows
# (the rows follow the lender_count ordering used to build tfidf_desc).
desc_topic_weights = nmf_desc.transform(tfidf_desc)
desc_transformed = desc_topic_weights[:20]

In [157]:
# Print the 15 highest-weighted words of each description topic on one line.
for topic_idx, words in enumerate(topic_words):
    top_terms = [w.encode('utf-8') for w in words[:15]]
    print("Topic {}: {}".format(topic_idx, ', '.join(top_terms)))

Topic 0: busi, hope, shop, profit, open, custom, week, good, receiv, loan, oper, age, year, expand, develop
Topic 1: pro, mujer, sale, sell, member, like, busi, invit, board, communal, cycl, institut, join, ask, benefit
Topic 2: acr, farmer, fund, harvest, group, seed, repres, leader, fertil, ¼, farm, season, kiva, sorghum, save
Topic 3: loan, tanzania, tujijeng, hold, account, pay, abl, share, daili, month, paid, previous, total, use, profit.thi
Topic 4: year, children, school, turam, old, group, capit, rice, buy, marri, sell, age, larg, increas, loan
Topic 5: villag, farm, husband, cultiv, agricultur, famili, incom, cow, livestock, pig, help, son, cattl, live, fertil
Topic 6: p, br, /p, translat, /strong, strong, volunt, spanish, member, kiva, group, sell, product, food, sale
Topic 7: esperanza, busi, group, coordin, invest, use, loan, excit, dominican, plan, structur, simpl, water, home, cloth
Topic 8: banc, group, villageoi, women, right, franc, hand, rais, creat, villag, photo, st

In [165]:
# Hand-written labels for the description topics. Entry i is looked up with
# NMF topic index i when the top loans are printed, so the order here must
# follow the topic order of the fitted model.
# NOTE(review): '[business] women group, small business' appears twice
# (positions 8 and 12) -- confirm both topics are meant to share a label.
concise_topics_desc = ['[business] open a store and payback',
                       '[business] formal business needs investment',
                       '[farming] preparing for the season',
                       'Tujijenge in Tanzania',
                       '[family] send children to school',
                       '[farming] farming and livestock in a village',
                       'With translation',
                       '[business] Latin America borrowing for business',
                       '[business] women group, small business',
                       '[family]Central Asians supporting family',
                       'Iraq',
                       '[farming] community buys and sells livestocks',
                       '[business] women group, small business',
                       '[business] initial funding to start business',
                       '[business] entrepreneur starts business to help children']

In [166]:
# For each of the top loans, print its 1-based rank and the label of the
# topic with the largest weight in its row.
for rank, weights in enumerate(desc_transformed, 1):
    # argsort is ascending, so the last index is the strongest topic
    best_topic = np.argsort(weights)[-1]
    print("{}: {}".format(rank, concise_topics_desc[best_topic]))

1: [business] Latin America borrowing for business
2: [business] initial funding to start business
3: [farming] preparing for the season
4: [business] open a store and payback
5: [farming] preparing for the season
6: [business] women group, small business
7: [farming] preparing for the season
8: [business] initial funding to start business
9: [farming] preparing for the season
10: [farming] preparing for the season
11: [business] Latin America borrowing for business
12: [business] open a store and payback
13: [business] women group, small business
14: Iraq
15: [business] women group, small business
16: With translation
17: [business] women group, small business
18: [business] women group, small business
19: [business] open a store and payback
20: [business] women group, small business


## 'Use' topics

In [108]:
# create tfidf of the 'use' texts, ordered from most to fewest lenders so that
# the first rows of the matrix belong to the most widely funded loans
# (sort_values replaces DataFrame.sort, which pandas deprecated and later removed)
use = df.sort_values('lender_count', ascending=False)['use'].values
vec_use = get_vectorizer(use)
# .toarray() turns the sparse tf-idf matrix into a dense array
tfidf_use = vec_use.transform(use).toarray()

In [109]:
# run NMF and get topics
topic_words_use = []
# random_state pins any randomness in the factorisation so the topic order
# is the same on every run; the hand-written 'use' topic labels further down
# are matched to topics by index and would silently go stale otherwise.
# NOTE(review): after changing the seed, re-check those labels against the
# printed topics.
nmf_use = NMF(n_components=15, random_state=0).fit(tfidf_use)
# column j of the tf-idf matrix corresponds to feature_names_use[j]
feature_names_use = vec_use.get_feature_names()

In [110]:
# Rebuild the list from scratch instead of appending to a list created in
# another cell, so re-running this cell never duplicates topics.
# np.argsort is ascending: reverse it and keep the first num_top_words
# indices to get the highest-weighted terms of each NMF component.
topic_words_use = [
    [feature_names_use[i] for i in np.argsort(topic)[::-1][0:num_top_words]]
    for topic in nmf_use.components_
]

In [111]:
# Display the top words of every 'use' topic (one inner list per NMF component).
topic_words_use

[[u'cloth',
  u'shoe',
  u'buy',
  u'cosmet',
  u'use',
  u'bale',
  u"'s",
  u'new',
  u'women',
  u'season',
  u'second-hand',
  u'winter',
  u'children',
  u'perfum',
  u'accessori',
  u'sale',
  u'bundl',
  u'miscellan',
  u'pant',
  u'men'],
 [u'purchas',
  u'materi',
  u'product',
  u'build',
  u'equip',
  u'inventori',
  u'addit',
  u'insul',
  u'suppli',
  u'raw',
  u'make',
  u'new',
  u'pesticid',
  u'bulk',
  u'cow',
  u'construct',
  u'machin',
  u'car',
  u'hous',
  u'thread'],
 [u'capit',
  u'work',
  u'oper',
  u'increas',
  u'larg',
  u'quantiti',
  u'order',
  u'earn',
  u'respect',
  u'area',
  u'reinforc',
  u'make',
  u'lot',
  u'money',
  u'banana',
  u'use',
  u'profit',
  u'mani',
  u'addit',
  u'expens'],
 [u'buy',
  u'fabric',
  u'suppli',
  u'product',
  u'materi',
  u'wholesal',
  u'thread',
  u'make',
  u'wool',
  u'sew',
  u'cosmet',
  u'machin',
  u'ingredi',
  u'shoe',
  u'beauti',
  u'groceri',
  u'food',
  u'cattl',
  u'salon',
  u'bulk'],
 [u'busi',
  

In [112]:
# Which topics do the top loans belong to?
# Project every 'use' text onto the NMF topics, then keep the first 20 rows
# (the rows follow the lender_count ordering used to build tfidf_use).
use_topic_weights = nmf_use.transform(tfidf_use)
use_transformed = use_topic_weights[:20]

In [114]:
# Print the 15 highest-weighted words of each 'use' topic on one line.
for topic_idx, words in enumerate(topic_words_use):
    top_terms = [w.encode('utf-8') for w in words[:15]]
    print("Topic {}: {}".format(topic_idx, ', '.join(top_terms)))

Topic 0: cloth, shoe, buy, cosmet, use, bale, 's, new, women, season, second-hand, winter, children, perfum, accessori
Topic 1: purchas, materi, product, build, equip, inventori, addit, insul, suppli, raw, make, new, pesticid, bulk, cow
Topic 2: capit, work, oper, increas, larg, quantiti, order, earn, respect, area, reinforc, make, lot, money, banana
Topic 3: buy, fabric, suppli, product, materi, wholesal, thread, make, wool, sew, cosmet, machin, ingredi, shoe, beauti
Topic 4: busi, invest, new, expand, start, suppli, inventori, respect, equip, grow, market, machin, improv, small, restock
Topic 5: merchandis, store, buy, general, groceri, raw, invest, season, restock, like, milk, christma, purchas, various, special
Topic 6: veget, fruit, meat, buy, season, condiment, bulk, oil, stall, wholesal, organ, varieti, sale, chicken, charcoal
Topic 7: resal, larg, shoe, quantiti, produc, buy, banana, bean, pair, earn, item, fatten, good, fish, beer
Topic 8: pay, fee, school, children, tuition, 

In [131]:
# Hand-written labels for the 'use' topics. Entry i is looked up with NMF
# topic index i when the top loans are printed, so the order here must follow
# the topic order of the fitted model.
# NOTE(review): '[business]expanding business' (positions 2 and 4) and
# '[business]resale larget quantity of products' (positions 7 and 11) are
# repeated, and 'larget' looks like a typo for 'large' -- confirm.
concise_topic_use = ['[family]buy clothes for women and children',
                     '[business]buy materials to make products',
                     '[business]expanding business',
                     '[living]buy materials to make fabric products to sell',
                     '[business]expanding business',
                     '[business]restock inventories',
                     '[business]buy/sell seasonal food products',
                     '[business]resale larget quantity of products',
                     '[family]pay for school',
                     '[living]raise livestock',
                     '[living]groceries for living',
                     '[business]resale larget quantity of products',
                     '[living]fertilize and farming operations',
                     '[living]selling food product',
                     '[business]grocery and retail business']

In [132]:
# For each of the top loans, print its rank and the label of the topic with
# the largest weight in its row. Ranks start at 1 so they line up with the
# numbering printed for the description topics (which also starts at 1);
# both listings walk the loans in the same lender_count ordering.
for rank, weights in enumerate(use_transformed, 1):
    # argsort is ascending: reverse it and take the first index
    top_topics = np.argsort(weights)[::-1][0]
    print("{}: {}".format(rank, concise_topic_use[top_topics]))

0: [living]fertilize and farming operations
1: [living]selling food product
2: [living]selling food product
3: [business]expanding business
4: [living]selling food product
5: [living]selling food product
6: [living]selling food product
7: [family]pay for school
8: [business]expanding business
9: [living]selling food product
10: [business]buy materials to make products
11: [business]grocery and retail business
12: [living]selling food product
13: [family]pay for school
14: [living]buy materials to make fabric products to sell
15: [living]buy materials to make fabric products to sell
16: [living]buy materials to make fabric products to sell
17: [living]buy materials to make fabric products to sell
18: [living]buy materials to make fabric products to sell
19: [family]pay for school


In [135]:
# Load the lender records; the path is relative to the notebook's working directory.
lenders = pd.read_csv('data/lenders.csv')