In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import * 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import csv
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../project/LC_data/LoanStats3a_securev1.csv')
print df['desc'].head(100)

0       Borrower added on 12/22/11 > I need to upgra...
1       Borrower added on 12/22/11 > I plan to use t...
2                                                   NaN
3       Borrower added on 12/21/11 > to pay for prop...
4       Borrower added on 12/21/11 > I plan on combi...
5                                                   NaN
6       Borrower added on 12/18/11 > I am planning o...
7       Borrower added on 12/16/11 > Downpayment for...
8       Borrower added on 12/21/11 > I own a small h...
9       Borrower added on 12/16/11 > I'm trying to b...
10      Borrower added on 12/15/11 > I had recived a...
11                                                  NaN
12      Borrower added on 12/15/11 > Plan to pay off...
13      Borrower added on 12/19/11 > I intend to pay...
14                                                  NaN
15                                                  NaN
16                                                  NaN
17      Borrower added on 12/15/11 > Payoff othe

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
desc_data = df[['desc', 'loan_status']]
def clean_text(x):
	if isinstance(x, basestring):
		x = x.strip() # drop the space at the begining and end
		x = re.sub(r'\d+', '', x) # delete the numbers
		x = re.sub(r'[^\w\s]',' ', x) # remove punctuation and replace with a space
		x = x.lower()
		if len(x.split()) == 0:
			# remove strings of only space
			x = np.nan
	return x


desc_data['desc'] = desc_data['desc'].apply(clean_text)
desc_data = desc_data.dropna()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
	stemmed = []
	for item in tokens:
		stemmed.append(stemmer.stem(item))
	return stemmed

def tokenize(text):
	tokens = word_tokenize(text)
	stems = stem_tokens(tokens, stemmer)
	filtered = [w for w in stems if not w in stopwords.words('english')]
	return filtered


In [5]:
bad_loan_desc = desc_data.desc.loc[desc_data['loan_status'].isin(['Charged Off', 'Default'])]
good_loan_desc = desc_data.desc.loc[desc_data['loan_status'].isin(['Fully Paid'])]

In [7]:
tfidf_bad = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
response_bad = tfidf_bad.fit_transform(bad_loan_desc)
bad_feature_names = tfidf_bad.get_feature_names()

tfidf_good = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
response_good = tfidf_good.fit_transform(good_loan_desc)
good_feature_names = tfidf_good.get_feature_names()

In [8]:
bad_tf_idf_sum = np.sum(response_bad, axis=0).tolist()[0] # get sum of scores of all bad loans text for each word
good_tf_idf_sum = np.sum(response_good, axis=0).tolist()[0] # get sum of scores of all good loans text for each word

bad_tf_idf_normalized = [float(i)/sum(bad_tf_idf_sum) for i in bad_tf_idf_sum]
good_tf_idf_normalized = [float(i)/sum(good_tf_idf_sum) for i in good_tf_idf_sum]


all_words = list(set(good_feature_names + bad_feature_names))

good_tf_idf_normalized += [0]*len(list(set(all_words)-set(good_feature_names)))
good_feature_names += list(set(all_words)-set(good_feature_names))
dict_good = dict(zip(good_feature_names, good_tf_idf_normalized))

bad_tf_idf_normalized += [0]*len(list(set(all_words)-set(bad_feature_names)))
bad_feature_names += list(set(all_words)-set(bad_feature_names))
dict_bad = dict(zip(bad_feature_names, bad_tf_idf_normalized))

In [9]:
# get absolute difference of the two
diff = {key: abs(dict_bad[key]-dict_good.get(key, 0)) for key in dict_bad.keys()}

sorted_diff = sorted(((v,k) for k, v in diff.iteritems()), reverse=True) # sort(descending)
desc_features = []
# get the top 70 distinct words
for i in sorted_diff[:70]:
	desc_features.append(str(i[1]))

print desc_features
# save the file

['busi', 'rate', 'ad', 'br', 'need', 'borrow', 'help', 'consolid', 'time', 'money', 'motorcycl', 'new', 'thank', 'payment', 'account', 'open', 'apr', 'purchas', 'year', 'save', 'minimum', 'small', 'loan', 'car', 'refin', 'plan', 'wed', 'better', 'medic', 'low', 'im', 'ring', 'equip', 'monthli', 'balanc', 'person', 'expand', 'make', 'colleg', 'posit', 'expens', 'kid', 'problem', 'cosolid', 'graduat', 'salari', 'financi', 'store', 'recent', 'restaur', 'lend', 'school', 'franchis', 'start', 'firm', 'job', 'invest', 'realli', 'half', 'consolod', 'histori', 'request', 'work', 'divorc', 'locat', 'room', 'late', 'finish', 'revolv', 'ga']


In [10]:
# save as csv
with open('desc_tfidf.csv', 'wb') as csv_file:
	writer = csv.writer(csv_file)
	for i in sorted_diff:
		writer.writerow(i)
