In [None]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Exploratory Data Analysis

## Data Overview

In [None]:
df=pd.read_csv('Consumer_complaints.csv')

In [None]:
df.head().T

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df=df.dropna(subset=['Consumer complaint narrative'])

In [None]:
df['Consumer complaint narrative'].value_counts().T

In [None]:
#df=df1.sample(frac=0.1,replace=True)

In [None]:
df.shape

In [None]:
df['Company'].value_counts(dropna=False).shape

In [None]:
df['Product'].value_counts(dropna=False)

In [None]:
df['Sub-issue'].value_counts().shape

In [None]:
df['Issue'].value_counts()

In [None]:
df['Company public response'].value_counts(dropna=False)

In [None]:
df['Company response to consumer'].value_counts(dropna=False)

In [None]:
df['Tags'].value_counts(dropna=False)

In [None]:
df['Date received'].max()

In [None]:
df['Date received'].min()

In [None]:
df['Submitted via'].value_counts(dropna=False)

In [None]:
df['Timely response?'].value_counts(dropna=False)

In [None]:
df['Consumer disputed?'].value_counts(dropna=False)

In [None]:
temp= pd.crosstab(df['Product'], df['Consumer disputed?'])

In [None]:
temp.plot(kind='bar',figsize=(8,6))## The disputed percentages are about the same between
###the "consent provided" and "consent not provided" groups of complaint narratives.
###NOTE(review): this chart breaks disputes down by Product, not by consent -- confirm the remark belongs here.

In [None]:
temp1= pd.crosstab(df['Company response to consumer'], df['Consumer disputed?'])

In [None]:
temp1

In [None]:
temp1.plot(kind='bar',figsize=(8,6)) ###Most cases fall under "Closed with explanation"

In [None]:
temp3= pd.crosstab(df['Product'], df['Consumer disputed?'])

In [None]:
temp3.plot(kind='bar',figsize=(8,6))

In [None]:
##plt.hist(np.log(df['Company'].value_counts()))
##plt.xlabel(df['Company'].value_counts().index)

In [None]:
df['State'].value_counts().shape

In [None]:
# Parse the MM/DD/YYYY strings with pd.to_datetime, which is the function that takes an
# explicit `format`; pd.DatetimeIndex does not. `.dt.date` stores plain datetime.date
# objects in both columns.
df['Date received']=pd.to_datetime(df['Date received'],format='%m/%d/%Y').dt.date
df['Date sent to company']=pd.to_datetime(df['Date sent to company'],format='%m/%d/%Y').dt.date

In [None]:
df[df['Date received']!=df['Date sent to company']].shape

## Missing Value Handling

In [None]:
df[pd.isnull(df['Issue'])]

In [None]:
# Each fill is assigned back to the column: calling fillna(..., inplace=True) on a
# column selected with df[...] may act on a temporary copy and leave df unchanged.
df['Sub-product']=df['Sub-product'].fillna('Not Provided')
df['Sub-issue']=df['Sub-issue'].fillna('Not Provided')
df['Consumer complaint narrative']=df['Consumer complaint narrative'].fillna('None or Not Provided')
###Combine missing values of "Company public response" with "Company chooses not to provide"
df['Company public response']=df['Company public response'].fillna('Company chooses not to provide')

###Combine missing values of "Issue" with "Other"
df['Issue']=df['Issue'].fillna('Other')

### Replace missing values of 'Tags' with 'Unknown'
df['Tags']=df['Tags'].fillna('Unknown')

### Replace missing values of 'Submitted via' with 'Other'
df['Submitted via']=df['Submitted via'].fillna('Other')

###Combine missing values, "Other" and "Consent withdrawn" of "Consumer consent provided?"
###with "Consent not provided", since a complaint narrative is only published
###when the consumer's consent was provided
df['Consumer consent provided?']=df['Consumer consent provided?'].fillna('Consent not provided')
df['Consumer consent provided?']=df['Consumer consent provided?'].replace(
    {'Other': 'Consent not provided', 'Consent withdrawn': 'Consent not provided'})

In [None]:
### Fill missing 'State' info using valid ZIP codes.
from pyzipcode import ZipCodeDatabase
# Named zip_db so the builtin zip() is not shadowed for the rest of the notebook.
zip_db=ZipCodeDatabase()
state_missing_with_zip=pd.isnull(df['State'])&pd.notnull(df['ZIP code'])
for i in df.index[state_missing_with_zip]:
    try:
        # Write through df.loc so the value lands in df itself; chained indexing
        # (df['State'][i] = ...) may assign into a temporary copy.
        df.loc[i,'State']=str(zip_db[df.loc[i,'ZIP code']].state)
    except Exception:
        # Best effort: ZIP codes the database cannot resolve keep their missing state.
        # Catching Exception (not a bare except) still lets Ctrl-C interrupt the loop.
        continue

In [None]:
df[pd.isnull(df['State'])&pd.isnull(df['ZIP code'])].shape ###Still 4268 users has no state info

In [None]:
# Assign the filled columns back: fillna(..., inplace=True) on a column selected with
# df[...] may act on a temporary copy and leave df unchanged.
df['State']=df['State'].fillna('Not provided')
df['ZIP code']=df['ZIP code'].fillna('Not Provided')

In [None]:
df['Consumer consent provided?'].value_counts(dropna=False)

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Function-call form of print: valid on Python 3 and, with a single argument,
# prints the same text on Python 2.
print(df.groupby(df['Consumer disputed?'])['Date received'])

# Feature Engineering

## Creating label

In [None]:
replace={'Yes':True, 'No':False}

In [None]:
df['Consumer disputed?']= df['Consumer disputed?'].apply(lambda x: replace[x])

In [None]:
replace1={'Consent provided':True, 'Consent not provided':False}
df['Consumer consent provided?']= df['Consumer consent provided?'].apply(lambda x: replace1[x])

## Feature creating

In [None]:
##Process time: days between the date the CFPB received the complaint and the date
##the complaint was sent to the company on behalf of the consumer.
##Both columns go through pd.to_datetime so the subtraction is a timedelta64 Series
##whether they hold datetime.date objects, timestamps or date strings; .dt.days then
##gives the whole number of days.
df['Process time']=(pd.to_datetime(df['Date sent to company'])-pd.to_datetime(df['Date received'])).dt.days.astype(int)

In [None]:
#Create a binned variable of process time: [0,3], (3,10], (10,30], (30,290] days
#(right-closed intervals; include_lowest=True makes the first bin include 0).
#NOTE(review): pd.cut is documented to return NaN for values outside the bin edges,
#so a process time below 0 or above 290 days would get no bin -- confirm the data's range.
process_days_bin = [0,3, 10, 30, 290]
process_days_cut = pd.cut(df['Process time'], process_days_bin, right=True, include_lowest=True)
df['process_days_bin'] = process_days_cut

In [None]:
df['Timely response?'] = df['Timely response?'].apply(lambda x: replace[x])

In [None]:
company_complaitns_counts = df['Company'].value_counts()
df['company_complaint_counts'] = df['Company'].apply(lambda x: company_complaitns_counts[x])

In [None]:
# Create a binned variable for the number of complaints of each company:
# [1,1000], (1000,2000], (2000,3000], (3000,3750] (right-closed; first bin includes 1).
# NOTE(review): the upper edge 3750 is hard-coded; pd.cut is documented to return NaN for
# values outside the edges, so a company with more than 3750 complaints would get no bin.
# Presumably 3750 covers the largest count in this dataset -- confirm if the data changes.
company_user_bin = [1,1000, 2000, 3000, 3750]
company_cut = pd.cut(df['company_complaint_counts'], company_user_bin, right=True, include_lowest=True)
df['company_complaint_counts_bin'] = company_cut

In [None]:
##Build dummy variable for all selected category variables in the dataset
def get_dummy_table(data,column_names):
    '''
    Build one-hot (dummy) columns for the selected categorical columns.

    data: DataFrame holding the columns to encode (it is not modified)
    column_names: list of column names of `data` to encode
    OUTPUT: DataFrame indexed like `data`, holding the dummy columns of every
            requested column side by side, in the order the names were given

    NOTE: dummy columns are named after the category values only, so two source
    columns that share a value produce duplicate column names in the result.
    '''
    # pd.get_dummies encodes each column directly; no prior cast to 'category' is needed.
    dummy_frames=[pd.get_dummies(data[name]) for name in column_names]
    if not dummy_frames:
        # pd.concat raises on an empty list; return a table with no columns instead.
        return DataFrame(index=data.index)
    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(dummy_frames, axis=1)

In [None]:
dummy_for_model=['Product', 'Sub-product','Issue','Sub-issue', 'Company public response','Tags',
                 'Submitted via','Company response to consumer','company_complaint_counts_bin','process_days_bin']

In [None]:
##Concatenate the dummy table with the other selected features to build the final feature table
df_model= get_dummy_table(df,dummy_for_model)

In [None]:
df_model=pd.concat([df_model,df['Process time']],axis=1)

In [None]:
df_model=pd.concat([df_model,df['Consumer consent provided?']],axis=1)

In [None]:
df_model=pd.concat([df_model,df['Timely response?']],axis=1)

In [None]:
##Create features about complaint submitted time
df_model['Date_received_year'] = df['Date received'].apply(lambda x: x.year)
df_model['Date_received_month'] = df['Date received'].apply(lambda x: x.month)
df_model['Date_received_day'] = df['Date received'].apply(lambda x: x.day)

In [None]:
#Create features from the 'Consumer complaint narrative' text

from string import punctuation, ascii_letters

def process_text_field(text):
    '''
    Summarise one piece of free text.

    text: string
    OUTPUT: int, int, float, float
            (character length, word count, uppercase rate, punctuation rate)

    A word is a maximal run of ASCII letters. Both rates are counts divided by
    (length + 1), so an empty string yields 0.0 rather than a division by zero.
    '''
    n_chars = len(text)

    # A word starts wherever a letter follows a non-letter (or the start of the text).
    n_words = 0
    prev_is_letter = False
    for ch in text:
        is_letter = ch in ascii_letters
        if is_letter and not prev_is_letter:
            n_words += 1
        prev_is_letter = is_letter

    n_punct = sum(1 for ch in text if ch in punctuation)
    n_upper = sum(1 for ch in text if ch.isupper())

    denominator = float(n_chars + 1)
    return n_chars, n_words, n_upper / denominator, n_punct / denominator

def process_text_column(df, fieldname):
    '''
    Apply process_text_field to every value of one text column.

    df: DataFrame
    fieldname: name of the text column in df
    OUTPUT: list, list, list, list
            (lengths, word counts, uppercase rates, punctuation rates),
            each in the row order of df
    '''
    length_list = []
    word_count_list = []
    punctuation_rate_list = []
    uppercase_count_rate_list = []
    # Walk the column's values directly instead of looking each row up by index label:
    # a per-row label lookup is slow and returns several rows at once when the index
    # holds duplicate labels (e.g. after sampling with replacement).
    for text in df[fieldname]:
        length, word_count, uppercase_count_rate, punctuation_rate = process_text_field(text)
        length_list.append(length)
        word_count_list.append(word_count)
        uppercase_count_rate_list.append(uppercase_count_rate)
        punctuation_rate_list.append(punctuation_rate)
    return length_list, word_count_list, uppercase_count_rate_list, punctuation_rate_list

In [None]:
length_list, word_count_list, uppercase_count_rate_list, punctuation_rate_list = process_text_column(df, 'Consumer complaint narrative')

In [None]:
df_model['complaint_length'] = length_list
df_model['complaint_wordcount'] = word_count_list
df_model['complaint_uppercaserate'] = uppercase_count_rate_list
df_model['complaint_punctuationrate'] = punctuation_rate_list

In [None]:
df_model.head().T

# Modeling

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
import sklearn.metrics as skm
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from scipy import interp
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [None]:
stemmer = SnowballStemmer("english")

def stem_tokens(tokens, stemmer):
    '''Return a list with stemmer.stem(token) for every token, in the original order.'''
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    '''Split text with word_tokenize and stem every token with the module-level stemmer.'''
    return stem_tokens(word_tokenize(text), stemmer)

In [None]:
X_word = df['Consumer complaint narrative'].values
y = df['Consumer disputed?'].values

In [None]:
X_train_word, X_test_word, y_train, y_test = train_test_split(X_word, y, test_size=0.20, random_state=67)

In [None]:
#Vectorize the narratives. The TF-IDF vocabulary and IDF weights are learned from the
#training split only, so the held-out scores below are not influenced by test text.
#NOTE(review): lowercase=False keeps capitalised words distinct, so capitalised stop
#words (e.g. "The") are presumably not removed by stop_words='english' -- confirm intent.
vectorizer = TfidfVectorizer(stop_words='english',lowercase=False, min_df=0.001, max_df = 0.2,
                             max_features=5000)
words_matrix_train = vectorizer.fit_transform(X_train_word)
words_matrix_test = vectorizer.transform(X_test_word)
#All rows, in df order, encoded with the training vocabulary
words_matrix = vectorizer.transform(X_word)

#scale= MaxAbsScaler().fit(words_matrix_train)
#X_train_word_scale=scale.transform(words_matrix_train)
#X_test_word_scale=scale.transform(words_matrix_test)

model = MultinomialNB()
model.fit(words_matrix_train, y_train)


#Combine MultinomialNB() model of 'Consumer complaint narrative' prediction probability with other features
#NOTE(review): for rows in the training split this probability comes from a model that
#was fitted on those same rows, so the feature is optimistic there; out-of-fold
#predictions would avoid that.
df_model['bayes_prob_narrative']= model.predict_proba(words_matrix)[:,1]

##Model evaluation via 'Consumer complaint narrative' only
##Shown as (accuracy, ROC AUC) on the held-out split so both values are displayed.
(model.score(words_matrix_test,y_test),
 skm.roc_auc_score(y_test, model.predict_proba(words_matrix_test)[:, 1]))
#skm.recall_score(y_test, model1.predict(X_test_word_scale))



In [None]:
#Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    '''
    Print the highest-weighted features of every topic of a fitted model.

    model: object exposing components_ (one row of feature weights per topic)
    feature_names: sequence mapping a feature index to its name
    n_top_words: number of features to list per topic
    '''
    for topic_no, weights in enumerate(model.components_):
        ranked = weights.argsort()[::-1]  # feature indices, largest weight first
        top_names = [feature_names[ix] for ix in ranked[:n_top_words]]
        print("Topic #%d:" % topic_no)
        print(" ".join(top_names))
    print()

In [None]:
#nmf topic extraction
#RandomForestClassifier is first used in this cell, so it is imported here; without the
#import the cell raises NameError when the notebook is run top to bottom.
from sklearn.ensemble import RandomForestClassifier

#For each number of NMF components: factorise the TF-IDF matrix, train a random forest
#on the topic weights and record its ROC AUC on the held-out split.
aucls=[]
for n in [20,100,500,1000]:
    nmf = NMF(n_components = n, random_state=1, alpha=.1, l1_ratio=.5).fit(words_matrix)
    X_nmf = nmf.transform(words_matrix)
    X_train_nmf, X_test_nmf, y_train, y_test = train_test_split(X_nmf, y, test_size=0.20, random_state=67)
    rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='auto')
    rfc.fit(X_train_nmf, y_train)
    auc = skm.roc_auc_score(y_test, rfc.predict_proba(X_test_nmf)[:, 1])
    aucls.append(auc)
aucls

In [None]:
#tfidf_feature_names = vectorizer.get_feature_names()
#print_top_words(nmf, tfidf_feature_names, n_top_words = 20)

In [None]:
# Shape of the topic-weight matrix from the last NMF fit (nmf.transform is a method
# and has no .shape attribute).
X_nmf.shape

In [None]:
X_train_nmf, X_test_nmf, y_train, y_test = train_test_split(X_nmf, y, test_size=0.20, random_state=67)

In [None]:
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='auto')
rfc.fit(X_train_nmf, y_train)

In [None]:
rfc.score(X_test_nmf, y_test)

In [None]:
skm.roc_auc_score(y_test, rfc.predict_proba(X_test_nmf)[:, 1])

In [None]:
#LatentDirichletAllocation topic extraction
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(X_word)

In [None]:
lda = LatentDirichletAllocation(n_topics=10, max_iter=20,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

In [None]:
lda.fit(tf)

In [None]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words = 20)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

In [None]:
X = df_model.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=67)

In [None]:
def roc_curve(probabilities, labels):
    '''
    INPUT: numpy array, numpy array
    OUTPUT: list, list, list

    Take a numpy array of the predicted probabilities and a numpy array of the
    true labels (True/1 for the positive class).
    Return the True Positive Rates, False Positive Rates and Thresholds for the
    ROC curve. Every predicted probability is used as a threshold, in ascending
    order, and an instance is predicted positive when its probability is >= the
    threshold.

    The rates are computed from one sort plus cumulative sums instead of
    rescanning all instances for every threshold.
    '''
    probabilities = np.asarray(probabilities)
    labels = np.asarray(labels)

    # Sort instances by score; thresholds are the sorted scores themselves.
    order = np.argsort(probabilities, kind='mergesort')
    thresholds = probabilities[order]
    labels_sorted = labels[order]

    num_positive_cases = labels.sum()
    num_negative_cases = len(labels) - num_positive_cases

    # positives_below[k] = number of positives among the k lowest-scored instances.
    positives_below = np.concatenate(([0], np.cumsum(labels_sorted)))
    # Position of the first score >= each threshold (ties all share the same position).
    first_at_or_above = np.searchsorted(thresholds, thresholds, side='left')

    predicted_positives = len(thresholds) - first_at_or_above
    true_positives = num_positive_cases - positives_below[first_at_or_above]
    false_positives = predicted_positives - true_positives

    tprs = true_positives / float(num_positive_cases)
    fprs = false_positives / float(num_negative_cases)

    return tprs.tolist(), fprs.tolist(), thresholds.tolist()

def plot_roc(probs, y_true, title, xlabel, ylabel):
    '''
    Plot the ROC curve of a set of predictions together with the 45-degree
    reference line.

    probs: numpy array of predicted probabilities of the positive class
    y_true: numpy array of the true labels
    title, xlabel, ylabel: strings used to label the figure
    '''
    # ROC, computed from the arguments (not from notebook-level variables) so the
    # function plots whatever predictions it is given.
    tpr, fpr, _ = roc_curve(probs, y_true)

    # Successive plt.plot calls draw on the same axes by default, so no explicit
    # "hold" switch is needed to overlay the reference line.
    plt.plot(fpr, tpr)

    # 45 degree line
    xx = np.linspace(0, 1.0, 20)
    plt.plot(xx, xx, color='red')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    plt.show()

## First Logistic Regression Model

In [None]:
lr = LogisticRegression(class_weight='auto')
lr.fit(X_train, y_train)

In [None]:
lr.score(X_test,y_test)

In [None]:
v_probs = lr.predict_proba(X_test)[:, 1]

In [None]:
plot_roc(v_probs, y_test, "ROC plot of  complaint dispute", 
         "False Positive Rate (1 - Specificity)", "True Positive Rate (Sensitivity, Recall)")

In [None]:
import sklearn.metrics as skm
skm.roc_auc_score(y_test, v_probs)

In [None]:
skm.recall_score(y_test, lr.predict(X_test))

In [None]:
pd.crosstab(y_test, lr.predict(X_test))

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(model, X_test, y_true):
    '''
    Print and draw the confusion matrix of a fitted model.

    model: fitted estimator with a predict method
    X_test: features passed to model.predict
    y_true: true labels matching the rows of X_test
    '''
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_true, predictions)

    print(cm)

    # Draw the matrix as a colour-coded image
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

In [None]:
plot_confusion_matrix(lr, X_test, y_test)

In [None]:
# Feature indices ordered from the largest to the smallest coefficient. lr.coef_ has one
# row per model, so the row is selected first; reversing the 2-D argsort result would only
# reverse that single row axis and leave the order ascending.
np.argsort(lr.coef_[0])[::-1]

In [None]:
# Names of the 245 features with the largest coefficients. lr.coef_[0] is the 1-D
# coefficient row, so [::-1] really reverses the sort into descending order.
sub_column = df_model.columns[np.argsort(lr.coef_[0])[::-1]][:245]

In [None]:
sub_column.shape

In [None]:
X_sub_column = df_model[sub_column]
X_sub_train, X_sub_test, y_train, y_test = train_test_split(X_sub_column, y, test_size=0.20, random_state=67)

In [None]:
lr_sub = LogisticRegression(class_weight='auto')
lr_sub.fit(X_sub_train, y_train)

In [None]:
lr_sub.score(X_sub_test,y_test)

In [None]:
skm.roc_auc_score(y_test,lr_sub.predict_proba(X_sub_test)[:, 1])

## Now try decision-tree-based models

In [None]:
#Gradient Boosting Classifier
#from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#gbc = GradientBoostingClassifier(n_estimators=1000, max_depth=8, subsample=0.5, 
                             #   max_features='auto', learning_rate=0.01)
#gbc.fit(X_train, y_train)

In [None]:
#gbc.score(X_test,y_test)

In [None]:
#pd.crosstab(y_test, gbc.predict(X_test))

In [None]:
#plot_confusion_matrix(gbc, X_test, y_test)

In [None]:
#skm.roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])

In [None]:
## Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=3000, n_jobs=-1, class_weight='auto')
rfc.fit(X_train, y_train)

In [None]:
rfc.score(X_test, y_test)

In [None]:
pd.crosstab(y_test, rfc.predict(X_test))

In [None]:
skm.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
skm.recall_score(y_test, rfc.predict(X_test))

In [None]:
plot_confusion_matrix(rfc, X_test, y_test)

In [None]:
def plot_importance(clf, X, max_features=10):
    '''
    Draw a horizontal bar chart of the most important features of a fitted model.

    clf: fitted estimator exposing feature_importances_
    X: DataFrame whose columns name the features, in the order clf was trained on
    max_features: number of top features to show
    '''
    importances = clf.feature_importances_
    # Express every importance as a percentage of the largest one
    relative = 100.0 * (importances / importances.max())

    ascending = np.argsort(relative)
    top = ascending[-max_features:]  # indices of the most important features, smallest first
    bar_positions = (np.arange(ascending.shape[0]) + .5)[-max_features:]

    plt.barh(bar_positions, relative[top], align='center')
    plt.yticks(bar_positions, X.columns[top])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
In [None]:
plot_importance(rfc, df_model, max_features=20)