In [36]:
import pandas as pd
import graphlab as gl
from random import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [2]:
# clean pairs data
def clean_pair_date(sf):
    sf.rename({'X1': 'lender_id', 'X2': 'loan_id'})
    sf['loan_id'] = sf['loan_id'].astype(str)
    sf['lender_id'] = sf['lender_id'].astype(str)
    return sf


# clean loan data
def clean_loan_data(df):
    # drop columns
    df = df.drop(['paid_date', 'planned_expiration_date', 'languages'], axis=1)

    # drop duplicates
    df = df.drop_duplicates('id')

    # drop duplicate header
    df = df[df['activity'] != 'activity']

    # drop nas
    df = df.dropna(subset=['earliest_scheduled_payment', 'last_scheduled_payment',
                           'repayment_interval', 'posted_date',
                           'status', 'repayment_term', 'use'], how='any')

    # fill paid_amount's na with zero
    df['paid_amount'] = df['paid_amount'].fillna(0)

    # fill genders
    df['gender'] = df['gender'].map(lambda x: 'M' if random() <= 0.39 else 'F')
    df['gender'] = df['gender'].map(lambda x: 1 if x == 'F' else 'M')

    # fill null descriptions with empty string
    df['descriptions'] = df['descriptions'].fillna(0)

    # binaralize bonus credit
    df['bonus_credit_eligibility'] = df['bonus_credit_eligibility'].map(lambda x: 1 if x == 'True' else 0)

    # convert some columns to datetime
    df['earliest_scheduled_payment'] = df['earliest_scheduled_payment'].map(
        lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%SZ'))
    df['last_scheduled_payment'] = df['last_scheduled_payment'].map(
        lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%SZ'))
    df['posted_date'] = df['posted_date'].map(
        lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%SZ'))

    # convert some columns to int
    df['lender_count'] = df['lender_count'].astype('int64')
    df['loan_amount'] = df['loan_amount'].astype('int64')
    df['paid_amount'] = df['paid_amount'].astype('float64').astype('int64')
    df['repayment_term'] = df['repayment_term'].astype('float64')

    return df


def drop_unexsiting_loan_ids(sf, df):
    loan_ids_in_pairs = sorted(list(sf['loan_id'].unique()))
    loan_ids_in_loans = sorted(list(df['id'].values))
    loan_ids_intersection = set(loan_ids_in_loans) & set(loan_ids_in_pairs)
    # drop useless loan_ids in sf
    sf['loan_id'] = sf['loan_id'].apply(lambda x: x if x in loan_ids_intersection else None)
    sf = sf.dropna('loan_id')
    # drop useless loan_ids in df
    df['id'] = df['id'].map(lambda x: x if x in loan_ids_intersection else None)
    df = df.dropna()
    return sf, df

In [27]:
df = pd.read_csv('data/loans.csv', delimiter=',')
df = clean_loan_data(df)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 824505 entries, 0 to 842841
Data columns (total 21 columns):
activity                      824505 non-null object
bonus_credit_eligibility      824505 non-null int64
id                            824505 non-null object
lender_count                  824505 non-null int64
loan_amount                   824505 non-null int64
paid_amount                   824505 non-null int64
partner_id                    824505 non-null object
posted_date                   824505 non-null datetime64[ns]
sector                        824505 non-null object
status                        824505 non-null object
use                           824505 non-null object
gender                        824505 non-null object
family                        824505 non-null object
descriptions                  824505 non-null object
image_id                      824505 non-null object
image_template_id             824505 non-null object
country                       824505 

## Get features!!!!!!!
- bonus credit eligibility
- loan_amount
- posted_date =>
 - 4 seasons: dummy variable in {0, 1, 2, 3}
- Tfidf of use
- gender
- family
- country
- repayment term
- repayment interval => dummy variable in {0, 1, 2}

In [29]:
raw_features = df[['bonus_credit_eligibility', 'loan_amount', 'posted_date',
                   'use', 'gender', 'family', 'country', 'repayment_term',
                   'repayment_interval', 'id']]

In [34]:
# dummify posted_date to seasons
def convert_to_season(x):
    m = x.month
    if m < 4:
        return 'Spring'
    elif m < 7:
        return 'Summer'
    elif m < 11:
        return 'Fall'
    else:
        return 'Winter'

raw_features['season'] = raw_features['posted_date'].map(lambda x: convert_to_season(x))
raw_features = pd.concat([raw_features, pd.get_dummies(raw_features['season'], prefix='season_')], axis=1)
raw_features = raw_features.drop(['season'], axis=1)

In [51]:
# dummify repayment_interval
raw_features = pd.concat([raw_features,
                          pd.get_dummies(raw_features['repayment_interval'], prefix='repayment_interval_')],
                          axis=1)
raw_features = raw_features.drop(['repayment_interval'], axis=1)

In [37]:
# get tfidf of use
def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize and stem/lemmatize the document.
    '''
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in word_tokenize(doc.lower())]


def get_vectorizer(descriptions, num_features=300):
    vect = TfidfVectorizer(max_features=num_features, stop_words='english', tokenizer=tokenize)
    return vect.fit(descriptions)


text = raw_features['use'].values
tfidf = pd.DataFrame(get_vectorizer(text).transform(text).toarray())
# raw_features = raw_features.drop(['use'], axis=1)

In [61]:
print tfidf.shape
print tfidf[tfidf.isnull()].shape
print tfidf[tfidf.notnull()].shape

(824505, 300)
(824505, 300)
(824505, 300)


In [63]:
tfidf.isnull()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [49]:
pd.concat([raw_features, tfidf], axis=1)

Unnamed: 0,bonus_credit_eligibility,loan_amount,posted_date,use,gender,family,country,repayment_term,repayment_interval,id,...,290,291,292,293,294,295,296,297,298,299
0,0,950,2006-12-26 23:15:55,Enlarge his corral and buy feed and veterinary...,M,0,Togo,18,Monthly,3054,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
1,0,500,2007-03-30 00:36:41,"agriculture, livestock",M,0,Tajikistan,8,Monthly,7150,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
2,0,250,2007-06-05 21:21:01,Buy merchandise,M,0,Mexico,5,Monthly,11246,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
3,0,75,2007-08-02 01:40:03,growing green dragon (fruit) plants,M,0,Vietnam,7,Monthly,15342,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
4,1,1000,2007-09-28 00:05:04,to purchase a used car to operate a taxi business,1,0,Cambodia,20,Monthly,19438,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
5,0,950,2007-11-13 02:20:05,To purchase more cylinders of gas,1,0,Ecuador,12,Monthly,23535,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
6,1,600,2007-12-15 17:05:05,"To purchase peanut seeds, rice seeds, and fert...",1,0,Cambodia,20,Monthly,27631,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
7,0,975,2008-01-17 17:55:07,invest in and expand their various businesses,1,1,Peru,6,Monthly,31727,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0
8,0,1075,2008-04-01 05:20:14,Continue investing in seafood at wholesale prices,M,0,Ecuador,13,Monthly,35823,...,0,0,0.000000,0,0,0.466199,0,0.000000,0,0
9,0,550,2008-03-08 19:20:14,Purchase agricultural products,1,0,Nicaragua,14,Monthly,39919,...,0,0,0.000000,0,0,0.000000,0,0.000000,0,0


In [55]:
raw_features['use'].isnull()

0

In [59]:
tfidf.iloc[842827:842830]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299


In [60]:
type(tfidf)

pandas.core.frame.DataFrame