In [None]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random
%matplotlib inline

# Exploratory Data Analysis

## Data Overview

In [None]:
df = pd.read_csv('Consumer_complaints.csv')

In [None]:
df.columns

In [None]:
# Work on a 1% random subset to keep the notebook fast.
# Sample WITHOUT replacement: drawing with replacement duplicates complaints, and a
# duplicated row can then end up in both the train and the test split (leakage).
# The fixed seed makes Restart & Run All reproducible.
df=df.sample(frac=0.01,replace=False,random_state=67)

In [None]:
bank=pd.read_csv('bank_rank.csv')

In [None]:
# Number of complaints per company (the call parentheses were missing, so the cell
# displayed the bound method instead of the counts).
df['Company'].value_counts()

In [None]:
df.shape

In [None]:
bank.info()

In [None]:
df=df.merge(bank,on='Company',how='left')

In [None]:
df.Company

In [None]:
df['rank'].value_counts(dropna=True)

In [None]:
df.info()

In [None]:
df['Issue'].value_counts(dropna=False).shape

In [None]:
df['Product'].value_counts(dropna=False)

In [None]:
df['Sub-issue'].value_counts()

In [None]:
df['Issue'].value_counts()

In [None]:
df['Company public response'].value_counts(dropna=False)

In [None]:
df['Company response to consumer'].value_counts(dropna=False)

In [None]:
df['Tags'].value_counts(dropna=False)

In [None]:
df['Date received'].max()

In [None]:
df['Date received'].min()

In [None]:
df['Submitted via'].value_counts(dropna=False)

In [None]:
df['Timely response?'].value_counts(dropna=False)

In [None]:
df['Consumer disputed?'].value_counts(dropna=False)

In [None]:
# Dispute counts by consumer-consent status. The note on the plot of `temp` compares
# "Consent" vs. "Consent not" provided, while the original crosstab here was an exact
# duplicate of `temp1` (company response vs. dispute) -- NOTE(review): confirm intent.
temp= pd.crosstab(df['Consumer consent provided?'], df['Consumer disputed?'])

In [None]:
temp.plot(kind='bar',figsize=(8,6))## The disputed percentages are about same between 
###Consent and Consent Not "complaint narrative text".

In [None]:
temp1= pd.crosstab(df['Company response to consumer'], df['Consumer disputed?'])

In [None]:
temp1

In [None]:
temp1.plot(kind='bar',figsize=(8,6)) ###Most cases are fall in closed with explaination

In [None]:
temp3= pd.crosstab(df['Product'], df['Consumer disputed?'])

In [None]:
temp3.plot(kind='bar',figsize=(8,6))

In [None]:
##plt.hist(np.log(df['Company'].value_counts()))
##plt.xlabel(df['Company'].value_counts().index)

In [None]:
df['State'].value_counts().shape

In [None]:
# Parse the MM/DD/YYYY strings into datetime.date objects.
# pd.to_datetime is the parser that accepts a `format` argument (pd.DatetimeIndex does not);
# `.dt.date` keeps the columns as plain date objects, as the rest of the notebook expects.
df['Date received']=pd.to_datetime(df['Date received'],format='%m/%d/%Y').dt.date
df['Date sent to company']=pd.to_datetime(df['Date sent to company'],format='%m/%d/%Y').dt.date

In [None]:
df[df['Date received']!=df['Date sent to company']].shape

## Missing Value Handling

In [None]:
df[pd.isnull(df['Issue'])]

In [None]:
### Fill missing categorical values.
### Each column is re-assigned instead of calling fillna(..., inplace=True) on a column
### selection: the in-place form may act on a temporary copy and leave df unchanged.
df['Sub-product']=df['Sub-product'].fillna('Not Provided')
df['Sub-issue']=df['Sub-issue'].fillna('Not Provided')
df['Consumer complaint narrative']=df['Consumer complaint narrative'].fillna('None or Not Provided')

### Combine missing "Company public response" values with "Company chooses not to provide"
df['Company public response']=df['Company public response'].fillna('Company chooses not to provide')

### Combine missing values of "Issue" with "Other"
df['Issue']=df['Issue'].fillna('Other')

### Replace missing values of 'Tags' with 'Unknown'
df['Tags']=df['Tags'].fillna('Unknown')

### Replace missing values of 'Submitted via' with 'Other'
df['Submitted via']=df['Submitted via'].fillna('Other')

### Missing "Consumer consent provided?" values are kept as their own 'Unknown' level.
### (An earlier alternative merged missing / 'Other' / 'Consent withdrawn' into
### 'Consent not provided', since a complaint narrative is only published when consent
### is provided; that variant is not used.)
df['Consumer consent provided?']=df['Consumer consent provided?'].fillna('Unknown')

In [None]:
### Fill missing 'State' info using valid ZIP codes.
from pyzipcode import ZipCodeDatabase
# Named zipcode_db rather than `zip` so the builtin zip() is not shadowed for later cells.
zipcode_db=ZipCodeDatabase()
rows_to_fill=df[pd.isnull(df['State'])&pd.notnull(df['ZIP code'])].index
for i in rows_to_fill:
    try:
        # .loc writes into df itself; df['State'][i]=... is chained assignment and may
        # silently modify a temporary copy instead.
        df.loc[i,'State']=str(zipcode_db[df.loc[i,'ZIP code']].state)
    except Exception:
        # Best effort: ZIP codes the database cannot resolve (masked or malformed values)
        # are left without a state. `Exception` instead of a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        continue

In [None]:
df[pd.isnull(df['State'])&pd.isnull(df['ZIP code'])].shape ###Still 4268 users has no state info

In [None]:
# Label whatever is still missing. Re-assigned rather than filled in place on a column
# selection, which may leave df unchanged.
df['State']=df['State'].fillna('Not provided')
df['ZIP code']=df['ZIP code'].fillna('Not Provided')

In [None]:
df['Consumer consent provided?'].value_counts(dropna=False)

In [None]:
df.info()

In [None]:
df.head()

# Feature Engineering

## Creating label

In [None]:
replace={'Yes':0, 'No':1}

In [None]:
df['Consumer disputed?']= df['Consumer disputed?'].apply(lambda x: replace[x])

In [None]:
#replace1={'Consent provided':True, 'Consent not provided':False}
#f['Consumer consent provided?']= df['Consumer consent provided?'].apply(lambda x: replace1[x])

## Feature creating

In [None]:
## Process time is the number of days between the date the CFPB received the complaint
## and the date the complaint was sent to the company on behalf of the consumer.
## Both columns are converted with pd.to_datetime first so the difference is a proper
## timedelta series (works whether they hold date objects, Timestamps or date strings);
## `.dt.days` then yields whole days as integers.
df['Process time']=(pd.to_datetime(df['Date sent to company'])-pd.to_datetime(df['Date received'])).dt.days

In [None]:
df['Process time'].groupby(df['Consumer disputed?']).mean()

In [None]:
df['Timely response?'].value_counts()

In [None]:
df['Timely response?']= df['Timely response?'].apply(lambda x: replace[x])

In [None]:
dummy_for_model=['Product','Sub-product','Issue','Sub-issue', 'Company public response','Tags',
                 'Submitted via','State','Consumer consent provided?','Timely response?']

In [None]:
##Build dummy variable for all selected category variables in the dataset
def get_dummy_table(data,column_names):
    '''
    Build one-hot (dummy) indicator columns for the selected categorical columns.

    INPUT: DataFrame, list of column names in `data`
    OUTPUT: DataFrame with the dummy columns of every requested column, side by side

    Each dummy column is named "<source column>_<level>". Without the prefix, levels
    shared by several source columns (e.g. 'Other', 'Unknown', 'Not Provided') produce
    duplicate column names, which makes the features indistinguishable afterwards.
    `data` itself is not modified.
    '''
    # Collect the per-column dummy frames first and concatenate once, instead of growing
    # a DataFrame inside the loop.
    dummy_frames=[pd.get_dummies(data[name],prefix=name) for name in column_names]
    if not dummy_frames:
        # Nothing requested: return an empty frame (pd.concat rejects an empty list).
        return pd.DataFrame()
    return pd.concat(dummy_frames,axis=1)

In [None]:
##Cancat the created dummy table with other selected feature to build final feature table
df_model= get_dummy_table(df,dummy_for_model)

In [None]:
#df_model=pd.concat([df_model,df['Process time']],axis=1)

In [None]:
#df_model=pd.concat([df_model,df['Consumer consent provided?']],axis=1)

In [None]:
#df_model=pd.concat([df_model,df['Timely response?']],axis=1)

In [None]:
df_model['Date_received_year'] = df['Date received'].apply(lambda x: x.year)

In [None]:
df_model['Date_received_month'] = df['Date received'].apply(lambda x: x.month)

In [None]:
df_model['Date_received_day'] = df['Date received'].apply(lambda x: x.day)


In [None]:
df_model.head()

# Modeling

In [None]:
X=df_model

In [None]:
y=df['Consumer disputed?']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# GridSearchCV and train_test_split live in sklearn.model_selection; the old
# sklearn.grid_search / sklearn.cross_validation modules were removed from scikit-learn.
# The fallback keeps the notebook importable on installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split

In [None]:
def roc_curve(probabilities, labels):
    '''
    INPUT: numpy array, numpy array
    OUTPUT: list, list, list

    Compute the points of a ROC curve from predicted probabilities and the matching
    true 0/1 labels. Every predicted probability, in ascending order, is used once as
    the decision threshold (an instance is predicted positive when its probability is
    >= the threshold).

    Returns, in this order: the True Positive Rates, the False Positive Rates and the
    thresholds, all aligned element by element.
    '''
    n_pos = sum(labels)
    n_neg = len(labels) - n_pos

    cutoffs = np.sort(probabilities)

    points = []
    for cutoff in cutoffs:
        # Instances predicted positive at this cutoff
        flagged = probabilities >= cutoff
        # Predicted positive AND actually positive
        hits = np.sum(flagged * labels)
        # Predicted positive but actually negative
        false_alarms = np.sum(flagged) - hits
        points.append((hits / float(n_pos), false_alarms / float(n_neg)))

    tprs = [tpr for tpr, _ in points]
    fprs = [fpr for _, fpr in points]

    return tprs, fprs, cutoffs.tolist()

def plot_roc(probs, y_true, title, xlabel, ylabel):
    '''
    INPUT: numpy array, numpy array, string, string, string
    OUTPUT: None (draws and shows a matplotlib figure)

    Plot the ROC curve for the predicted probabilities `probs` against the true labels
    `y_true`, together with the 45-degree reference line, using the given title and
    axis labels.
    '''
    # Use the arguments, not the notebook globals v_probs / y_test: the original ignored
    # its parameters and always plotted whatever those globals happened to hold.
    tpr, fpr, thresholds = roc_curve(probs, y_true)

    # No plt.hold(True): successive plot calls already draw on the same axes, and
    # plt.hold is no longer available in current matplotlib.
    plt.plot(fpr, tpr)

    # 45 degree line
    xx = np.linspace(0, 1.0, 20)
    plt.plot(xx, xx, color='red')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    plt.show()

## First Logistic Regression Model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=67)

In [None]:
from sklearn.preprocessing import MaxAbsScaler
scale=MaxAbsScaler()

In [None]:
X_train_scale=scale.fit_transform(X_train)
X_test_scale=scale.transform(X_test)

In [None]:
# class_weight='balanced' re-weights the classes inversely to their frequency; it is the
# supported spelling of the former 'auto' option, which scikit-learn no longer accepts.
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train_scale, y_train)

In [None]:
lr.score(X_test_scale,y_test)

In [None]:
v_probs = lr.predict_proba(X_test_scale)[:, 1]

In [None]:
plot_roc(v_probs, y_test, "ROC plot of  complaint dispute", 
         "False Positive Rate (1 - Specificity)", "True Positive Rate (Sensitivity, Recall)")

In [None]:
import sklearn.metrics as skm
skm.roc_auc_score(y_test, v_probs)

In [None]:
pd.crosstab(y_test, lr.predict(X_test_scale))

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(model, X_test, y_true):
    '''
    Print and plot the confusion matrix of `model` on a test set.

    INPUT: fitted model exposing .predict, feature matrix, true labels
    OUTPUT: None (prints the matrix and shows a matplotlib figure)

    The matrix is computed by confusion_matrix(y_true, model.predict(X_test)).
    '''
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_true, predictions)

    print(cm)

    # matshow draws the matrix in a figure of its own
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
plot_confusion_matrix(lr, X_test_scale, y_test)

## Now try decision-tree-based models

In [None]:
##Gradiend Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# max_features='sqrt' replaces the former 'auto' option, which current scikit-learn no
# longer accepts (for this classifier 'auto' was documented as sqrt(n_features)).
# random_state pins the row sub-sampling (subsample=0.5) and feature sub-sampling so the
# scores below are reproducible.
gbc = GradientBoostingClassifier(n_estimators=2000, max_depth=4, subsample=0.5,
                                 max_features='sqrt', learning_rate=0.01, random_state=67)
gbc.fit(X_train_scale, y_train)

In [None]:
gbc.score(X_test_scale,y_test)

In [None]:
pd.crosstab(y_test, gbc.predict(X_test_scale))

In [None]:
plot_confusion_matrix(gbc, X_test_scale, y_test)

In [None]:
skm.roc_auc_score(y_test, gbc.predict_proba(X_test_scale)[:, 1])

In [None]:
def plot_importance(clf, X, max_features=10):
    '''
    Draw a horizontal bar chart of the most important features of a fitted model.

    clf          -- fitted estimator exposing feature_importances_
    X            -- DataFrame whose columns name the features, in training order
    max_features -- how many of the top-ranked features to show

    Importances are rescaled so that the largest one equals 100.
    '''
    raw_importance = clf.feature_importances_
    relative = 100.0 * (raw_importance / raw_importance.max())

    # Ascending order, so the most important features sit at the end
    order = np.argsort(relative)
    top = order[-max_features:]

    # Bar positions of the selected features within the full ranking
    bar_pos = (np.arange(order.shape[0]) + .5)[-max_features:]

    plt.barh(bar_pos, relative[top], align='center')
    plt.yticks(bar_pos, X.columns[top])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')

In [None]:
plot_importance(gbc, X, max_features=20)

In [None]:
## Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# class_weight='balanced' is the supported spelling of the former 'auto' option;
# random_state makes the forest (bootstrap samples, feature draws) reproducible.
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced',
                             random_state=67)
rfc.fit(X_train_scale, y_train)

In [None]:
rfc.score(X_test_scale, y_test)

In [None]:
pd.crosstab(y_test, rfc.predict(X_test_scale))

In [None]:
skm.roc_auc_score(y_test, rfc.predict_proba(X_test_scale)[:, 1])

In [None]:
plot_confusion_matrix(rfc, X_test_scale, y_test)

In [None]:
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost experiment, disabled. Every line of the call is commented out: previously only
# the first line was, leaving the continuation lines as live code (IndentationError).
#adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
#                         algorithm="SAMME",
#                         n_estimators=5000)

#adb.fit(X_train, y_train)

In [None]:
#adb.score(X_test, y_test)

In [None]:
#skm.roc_auc_score(y_test, adb.predict_proba(X_test)[:, 1])

In [None]:
#from sklearn.svm import SVC
#svc = SVC()
#svc.fit(X_train, y_train) 

In [None]:
#svc.score(X_test, y_test)

In [None]:
#skm.roc_auc_score(y_test, svc.predict_proba(X_test)[:, 1])

## Optimize the parameter by GridSearchCV

Gradient Boosting GridSearch

In [None]:
#gbc_grid = {'learning_rate': [0.05, 0.01],'max_depth': [3, 8],
  # 'n_estimators': [500, 1000],'subsample': [0.5, 0.75, 1.0]}


In [None]:
#gbc_grid_cv = GridSearchCV(GradientBoostingClassifier(), gbc_grid, n_jobs=-1)

In [None]:
#gbc_grid_cv.fit(X_train, y_train)

In [None]:
#best_model = gbc_grid_cv.best_estimator_

In [None]:
#best_params = gbc_grid_cv.best_params_
#best_params

In [None]:
#gbc_grid_cv.best_score_

In [None]:
#skm.roc_auc_score(y_test, predict_proba(X_test)[:, 1])

In [None]:
#def plot_importance(clf, X, max_features=10):
    #'''Plot feature importance'''
    #feature_importance = clf.feature_importances_
    # make importances relative to max importance
    #feature_importance = 100.0 * (feature_importance / feature_importance.max())
    #sorted_idx = np.argsort(feature_importance)
    #pos = np.arange(sorted_idx.shape[0]) + .5
    
    # Show only top features
    #pos = pos[-max_features:]
   # feature_importance = (feature_importance[sorted_idx])[-max_features:]
   # feature_names = (X.columns[sorted_idx])[-max_features:]
    
   # plt.barh(pos, feature_importance, align='center')
    #plt.yticks(pos, feature_names)
    #plt.xlabel('Relative Importance')
    #plt.title('Variable Importance')

In [None]:
# Disabled together with the grid search: best_model is only defined in the
# commented-out grid-search cells, so this call raised NameError on a fresh run.
#plot_importance(best_model, X_train, max_features=16)

In [None]:
# Disabled helper. The docstring line is commented out too: previously it was left as
# live, indented code under a commented-out def (IndentationError).
#def plot_loss(clf, params):
    #'''Plot training deviance.  Stolen from sklearn documentation'''
    # compute test set deviance
   # test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

    #for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
       # test_score[i] = clf.loss_(y_test, y_pred)

    #plt.title('Deviance')
    #plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
             #label='Training Set Deviance')
    #plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
             #label='Test Set Deviance')
    #plt.legend(loc='upper right')
    #plt.xlabel('Boosting Iterations')
   # plt.ylabel(clf.loss)

In [None]:
#plot_loss(best_model, best_params)

In [None]:
#plot_confusion_matrix(best_model, X_test, y_test)

Random Forest Grid Search

In [None]:
#rf_grid = {'max_depth': [4, 8, None],'max_features': ['sqrt', 'log2', None],'min_samples_split': [1, 2, 4],
    #'min_samples_leaf': [1, 2, 4],'bootstrap': [True], # Mandatory with oob_score=True,
           #'n_estimators': [50, 100, 200, 400],'random_state': [67],'oob_score': [True],'n_jobs': [-1] }

In [None]:
#rf_grid_cv = GridSearchCV(RandomForestClassifier(),rf_grid,n_jobs=-1,verbose=True,scoring='roc_auc')

In [None]:
#rf_grid_cv.fit(X_train, y_train)

In [None]:
#rf_grid_cv.best_params_

In [None]:
#rf_grid_cv.best_score_

In [None]:
#best_model = rf_grid_cv.best_estimator_

In [None]:
#best_model.oob_score_

In [None]:
#skm.roc_auc_score(y_test, best_model.predict(X_test))

In [None]:
#plot_confusion_matrix(best_model, X_test, y_test)