In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# Configure seaborn for every plot that follows, with fonts scaled to 1.3x.
sns.set(font_scale=1.3)

## First Classification Model
- Predict each complaint's 'Product' category from its cleaned text.

In [None]:
# Read the complaints table (with its pre-computed cleaned text) from disk.
file = '../data/with_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Show how many complaints fall under each product to gauge class balance.
product_counts = df['Product'].value_counts()
product_counts

In [None]:
# Keep only the six product codes being modelled, and only rows whose
# cleaned text is present and at least 10 characters long.
valid_set = ['CR', 'DC', 'MO', 'CC', 'BS', 'SL']

in_valid_set = df['Abbrev'].isin(valid_set)
df = df.loc[in_valid_set].dropna(subset=['cleaned_text'])

has_enough_text = df['cleaned_text'].str.len() >= 10
df = df.loc[has_enough_text]

In [None]:
# Restrict the frame to the columns the modelling cells below rely on.
model_columns = ['Product', 'Issue', 'Complaint ID', 'cleaned_text', 'Abbrev']
model_df = df[model_columns]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Two-stage split with identical settings: first hold out 20% of the rows as
# the test set, then hold out 20% of what remains as the validation set.
split_kwargs = dict(test_size=0.2, random_state=42)
train_df, test_df = train_test_split(model_df, **split_kwargs)
train_df, val_df = train_test_split(train_df, **split_kwargs)

In [None]:
# Pull the text (features) and product codes (labels) out as NumPy arrays.
X_train = train_df['cleaned_text'].values
X_val = val_df['cleaned_text'].values

y_train = train_df['Abbrev'].values
y_val = val_df['Abbrev'].values

In [None]:
# Word-level unigram TF-IDF capped at 1000 features. The vectorizer is fitted
# on the training texts only; the other splits are merely transformed later.
tfidf_params = {
    'analyzer': 'word',
    'max_features': 1000,
    'ngram_range': (1, 1),
}
tfidf_vect = TfidfVectorizer(**tfidf_params)
tfidf_vect.fit(X_train)

In [None]:
# Vectorise the training and validation texts with the fitted vectorizer.
X_train_tfidf, X_val_tfidf = (
    tfidf_vect.transform(texts) for texts in (X_train, X_val)
)

### Logistic regression

In [None]:
# Class-weighted logistic regression trained on the TF-IDF features.
lr_params = dict(
    C=1.0,
    max_iter=500,
    class_weight='balanced',
    multi_class='auto',
    solver='lbfgs',
    n_jobs=3,
)
lr = LogisticRegression(**lr_params)
lr.fit(X_train_tfidf, y_train)

### Random Forest classifier (TODO: needs hyperparameter tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Small, shallow forest used as an untuned baseline. `random_state` is pinned
# (to the same seed as the data splits) so that bootstrap sampling and feature
# selection are reproducible after a kernel restart.
rf = RandomForestClassifier(
    n_estimators=10,
    n_jobs=3,
    max_depth=10,
    max_features=10,
    random_state=42,
)
rf.fit(X_train_tfidf, y_train)

In [None]:
# Choose which fitted estimator the evaluation cells below should score.
model = lr

# `predict_proba` returns one column per entry of `model.classes_`, in that
# order. Take the label list from the fitted model rather than from the labels
# that happen to occur in the validation split, so the probability columns are
# always named correctly even if a class is absent from `val_df`.
targets = [str(label) for label in model.classes_]

### Results on the validation data

In [None]:
# Score the validation split with the currently selected `model`.
pred = model.predict(X_val_tfidf)
pred_proba = model.predict_proba(X_val_tfidf)

# `assign` builds a new frame instead of writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
val_df = val_df.assign(pred=pred)

# Probability columns follow `model.classes_`, so name them from the fitted
# model rather than from the labels present in this split.
proba_df = pd.DataFrame(
    pred_proba,
    columns=['pred_' + str(label) for label in model.classes_],
)
proba_df['max_prob'] = proba_df.max(axis=1)
proba_df['Complaint ID'] = val_df['Complaint ID'].values

# Row i of `proba_df` already corresponds to row i of `val_df`, so attach the
# probabilities by position. A key-based merge on 'Complaint ID' would
# multiply rows (and mis-pair probabilities) if an ID ever appeared twice.
merged = pd.concat(
    [val_df.reset_index(drop=True), proba_df.drop(columns=['Complaint ID'])],
    axis=1,
)

# Confidence cut-off on the top predicted probability. With a cut-off of 0
# every row is kept; raise it to inspect only confident predictions.
threshold = merged[merged['max_prob'] > 0]

### Generate Test output

In [None]:
# Vectorise and score the held-out test split with the selected `model`.
X_test_tfidf = tfidf_vect.transform(test_df['cleaned_text'].values)
test_pred = model.predict(X_test_tfidf)
pred_proba = model.predict_proba(X_test_tfidf)

# `assign` builds a new frame instead of writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
test_df = test_df.assign(pred=test_pred)

# Probability columns follow `model.classes_`, so name them from the fitted
# model rather than from labels observed in some other split.
proba_df = pd.DataFrame(
    pred_proba,
    columns=['pred_' + str(label) for label in model.classes_],
)
proba_df['max_prob'] = proba_df.max(axis=1)
proba_df['Complaint ID'] = test_df['Complaint ID'].values

# Row i of `proba_df` already corresponds to row i of `test_df`, so attach the
# probabilities by position. A key-based merge on 'Complaint ID' would
# multiply rows (and mis-pair probabilities) if an ID ever appeared twice.
merged = pd.concat(
    [test_df.reset_index(drop=True), proba_df.drop(columns=['Complaint ID'])],
    axis=1,
)

# Apply the confidence cut-off (0 keeps every row), drop the bulky text
# column, and write the predictions to disk.
threshold = merged[merged['max_prob'] > 0].drop(columns=['cleaned_text'])
threshold.to_csv('../output/predictions.csv', index=False)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
test_report = classification_report(test_df['Abbrev'], test_pred)
print(test_report)

### Look at some wrong predictions

In [None]:
# Test rows whose true code is 'CC' but which were predicted as something else.
actual_is_cc = threshold['Abbrev'] == 'CC'
predicted_is_cc = threshold['pred'] == 'CC'
wrong_pl = threshold[actual_is_cc & ~predicted_is_cc]

In [None]:
# Distribution of the (incorrect) codes predicted for true-'CC' complaints.
wrong_cc_predictions = wrong_pl['pred']
wrong_cc_predictions.hist()

In [None]:
# Join the predictions back onto the full table so the original complaint
# narrative is available. Columns present in both frames get `_x`
# (predictions) and `_y` (full table) suffixes.
threshold_big = pd.merge(threshold, df, on=['Complaint ID'])

# True 'CC' complaints that the model labelled 'BS'.
cc_predicted_bs = threshold_big[
    (threshold_big['Abbrev_x'] == 'CC') & (threshold_big['pred'] == 'BS')
]

for idx, row in cc_predicted_bs.iterrows():
    print(idx)
    print()
    # The value shown is the raw narrative column, so label it as such
    # (it was previously mislabelled as the cleaned text).
    print('Complaint narrative: ', row['Consumer complaint narrative'])
    print()
    print('Issue: ', row['Issue_x'])
    print('Actual type: {}, Predicted type: {}'.format(row['Abbrev_x'], row['pred']))
    print()
    print()