## Checking whether classification results change when using the summarized text
- With TF-IDF vectorization, we'd expect that reducing the amount of text decreases the model's accuracy.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and enable mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import numpy as np
import pandas as pd

In [184]:
# Load the complaints together with their summaries.
# 'summarized' has to be listed in usecols: the preprocessing cell further down
# reads df['summarized'], which would raise a KeyError if the column were skipped here.
# NOTE(review): assumes the CSV stores the summary under the column name
# 'summarized' (the name used downstream) -- confirm against the file header.
file = '../output/with_summarized_10_sent.csv'
df = pd.read_csv(
    file,
    usecols=['Complaint ID', 'Product', 'Issue', 'Consumer complaint narrative', 'summarized'],
)

## Machine Learning Models
Build a simple logistic regression model with Tf-IDF vectorization

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [105]:
import pandas as pd
import spacy
# Load the `en_core_web_sm` spaCy pipeline; it is used below (via nlp.pipe)
# to tokenize and lemmatize the cleaned narratives.
nlp = spacy.load("en_core_web_sm")

### preprocess and tokenize using regex and spaCy

In [186]:
import re
# Matches URLs (http.../www...), any run of characters that are not ASCII
# letters or spaces, and runs of two or more "x".
# NOTE(review): matches are deleted, not replaced by a space, so words separated
# only by punctuation or a newline end up glued together.
special = re.compile(r'http\S+|www\S+|[^a-zA-Z ]+|xx+')


def _clean(text):
    """Lower-case ``text``, delete everything ``special`` matches, collapse whitespace."""
    stripped = special.sub('', text.lower())
    return ' '.join(stripped.split())


docs_orig = list(map(_clean, df['Consumer complaint narrative'].values))
docs_summ = list(map(_clean, df['summarized'].values))

In [187]:
def _lemmatize(texts):
    """Return, for each text, its kept tokens as one space-joined string of lower-cased lemmas.

    Stop words, whitespace tokens, punctuation and number-like tokens are dropped.
    The spaCy pipeline is run with the tagger, parser and NER components disabled.
    """
    lemmatized = []
    for parsed in nlp.pipe(texts, disable=['tagger', 'parser', 'ner']):
        kept = [
            tok.lemma_.lower()
            for tok in parsed
            if not (tok.is_stop or tok.is_space or tok.is_punct or tok.like_num)
        ]
        lemmatized.append(" ".join(kept))
    return lemmatized


tokenized_orig = _lemmatize(docs_orig)
tokenized_summ = _lemmatize(docs_summ)

In [188]:
# The cleaned-but-untokenized texts are no longer needed; drop both references.
del docs_orig, docs_summ

In [189]:
# Attach the lemmatized texts as new columns on df.
df['orig'], df['summ'] = tokenized_orig, tokenized_summ

# Write a copy without rows containing missing values; df itself keeps all rows.
tokenized_csv = '../data/with_summarized_tokenized_10_sent.csv'
df.dropna().to_csv(tokenized_csv, index=False)

### Now some ML
We don't predict the `other services` category because there are too few entries.

In [190]:
# Product categories kept as prediction targets.
valid_targets = [
    'Debt collection',
    'Mortgage',
    'Student loan',
    'Payday loan, title loan, or personal loan',
    'Credit card or prepaid card',
    'Consumer Loan',
    'Money transfer, virtual currency, or money service',
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Bank account or service',
]

# Keep only the rows whose Product is one of the targets above.
is_valid_product = df['Product'].isin(valid_targets)
valid_df = df[is_valid_product]

# Alphabetically sorted labels that actually occur in the filtered data.
targets = sorted(valid_df['Product'].unique())

### Train-test split & vectorization

In [191]:
# Two successive 80/20 splits with the same seed: first hold out the test set,
# then carve the validation set out of the remaining rows.
split_kwargs = dict(test_size=0.2, random_state=42)
train_val_df, test_df = train_test_split(valid_df, **split_kwargs)
train_df, val_df = train_test_split(train_val_df, **split_kwargs)

In [192]:
# Lemmatized original narratives.
X_train_orig = train_df['orig'].values
X_val_orig = val_df['orig'].values

# Lemmatized summaries of the same rows.
X_train_summ = train_df['summ'].values
X_val_summ = val_df['summ'].values

# Labels are shared by both text variants.
y_train = train_df['Product'].values
y_val = val_df['Product'].values

In [203]:
# Spot-check one training example (index 179) of the lemmatized original text.
X_train_orig[179]

'enter agreement mortgage refinance home interest rate go provide credit issue occur appraisal order appraiser send week late originally intend speak individual sell product inform result delay rate lock extension require fault incur cost pass new good faith estimate send confirm rate lock extension document indicate lender credit reduce cost rate lock extension immediately ask loan processor advise worry sign document sake time credit restore late date extension late require concern document send sign indicate pay time give explanation time send email attempt confirm original credit confirm phone respond email despite send communication email day close tell absolute well provide credit justification not well deal interest rate rise significantly fault party appraisal company fault justification hold bear unethical change term agreement minute base market change sign close deal true not well price month pass interest rate rise okay essentially lie steal customer fault case tell process

In [204]:
# Same training example (index 179), this time from the lemmatized summary.
X_train_summ[179]

'speak individual sell product inform result delay rate lock extension require fault incur cost pass extension late require concern document send sign indicate pay time give explanation time day close tell absolute well provide credit justification not well deal interest rate rise significantly fault party appraisal company fault immediately ask loan processor advise worry sign document sake time credit restore late date new good faith estimate send confirm rate lock extension document indicate lender credit reduce cost rate lock extension sign close deal true not well price month pass interest rate rise okay essentially lie steal customer justification hold bear unethical change term agreement minute base market change enter agreement mortgage refinance home interest rate go provide credit fault case tell process go incur cost issue occur appraisal order appraiser send week late originally intend'

Original data

In [155]:
# Learn a word-level TF-IDF vocabulary (capped at 1000 features) from the
# original training texts only, then project both splits with it.
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=1000).fit(X_train_orig)
X_train_orig_tfidf, X_val_orig_tfidf = (
    tfidf_vect.transform(split) for split in (X_train_orig, X_val_orig)
)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Now for the summarized text

In [205]:
# Separate word-level TF-IDF vocabulary (capped at 1000 features) learned from
# the summarized training texts only, then applied to both splits.
tfidf_vect2 = TfidfVectorizer(analyzer='word', max_features=1000).fit(X_train_summ)
X_train_summ_tfidf, X_val_summ_tfidf = (
    tfidf_vect2.transform(split) for split in (X_train_summ, X_val_summ)
)

### Logistic Regression

In [158]:
from sklearn.linear_model import LogisticRegression

#### Model built from original data

In [159]:
# Class-weighted logistic regression fitted on the TF-IDF features of the
# original text; fit() returns the estimator, so construction and fitting chain.
lr_orig = LogisticRegression(
    C=1.0,
    max_iter=500,
    class_weight='balanced',
    multi_class='auto',
    solver='lbfgs',
    n_jobs=3,
).fit(X_train_orig_tfidf, y_train)

# Predictions for the validation split.
pred_orig = lr_orig.predict(X_val_orig_tfidf)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=3, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [209]:
# Per-class precision/recall/F1/support on the validation split, one row per class.
orig_metrics = classification_report(y_val, pred_orig, output_dict=True)
orig_report_df = pd.DataFrame(orig_metrics).transpose()

Unnamed: 0,Bank account or service,Consumer Loan,Credit card or prepaid card,"Credit reporting, credit repair services, or other personal consumer reports",Debt collection,"Money transfer, virtual currency, or money service",Mortgage,"Payday loan, title loan, or personal loan",Student loan,accuracy,macro avg,weighted avg
precision,0.762457,0.474791,0.737965,0.903889,0.812405,0.534171,0.895101,0.285767,0.764322,0.787896,0.685652,0.813922
recall,0.780098,0.679923,0.785111,0.757324,0.771271,0.81581,0.902716,0.675109,0.884636,0.787896,0.783555,0.787896
f1-score,0.771177,0.559136,0.760808,0.824141,0.791304,0.645612,0.898892,0.401558,0.82009,0.787896,0.719191,0.795722
support,4884.0,2590.0,7576.0,23756.0,15267.0,1303.0,9169.0,1145.0,3710.0,0.787896,69400.0,69400.0


#### Model built from summaries only

In [206]:
# Same classifier configuration as for the original text, fitted on the
# TF-IDF features of the summaries; fit() returns the estimator itself.
lr_summ = LogisticRegression(
    C=1.0,
    max_iter=500,
    class_weight='balanced',
    multi_class='auto',
    solver='lbfgs',
    n_jobs=3,
).fit(X_train_summ_tfidf, y_train)

# Predictions for the validation split.
pred_summ = lr_summ.predict(X_val_summ_tfidf)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=3, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [208]:
# Per-class precision/recall/F1/support for the summary-based model, one row per class.
summ_metrics = classification_report(y_val, pred_summ, output_dict=True)
report_summ = pd.DataFrame.from_dict(summ_metrics).transpose()

Unnamed: 0,precision,recall,f1-score,support
Bank account or service,0.752633,0.775594,0.763941,4884.0
Consumer Loan,0.452551,0.661004,0.537267,2590.0
Credit card or prepaid card,0.728854,0.774551,0.751008,7576.0
"Credit reporting, credit repair services, or other personal consumer reports",0.902238,0.751726,0.820134,23756.0
Debt collection,0.808282,0.767145,0.787176,15267.0
"Money transfer, virtual currency, or money service",0.521182,0.811972,0.634863,1303.0
Mortgage,0.889386,0.889192,0.889289,9169.0
"Payday loan, title loan, or personal loan",0.27849,0.663755,0.392359,1145.0
Student loan,0.750925,0.875202,0.808315,3710.0
accuracy,0.780346,0.780346,0.780346,0.780346


### Summary:
There is a slight decrease in model performance, but it is not large enough to warrant serious attention: accuracy drops from 79% to 78%, even though we only keep the top-ranked sentences of each narrative (the input loaded above is the 10-sentence summary file).

Save models

In [160]:
import pickle

# Bundle both classifiers with the vectorizer each one was trained on, so a
# loaded model can be paired with its matching TF-IDF vocabulary.
# The original-text classifier is bound to `lr_orig` in this notebook; no
# variable named `lr` exists, so referencing it raised a NameError on a clean run.
models = {'lr_orig': lr_orig, 'lr_summ': lr_summ, 'tfidf_orig': tfidf_vect, 'tfidf_summ': tfidf_vect2}

# NOTE(review): the file name says "5_sent" while the data loaded at the top is
# the 10-sentence summary, and the directory says "linear_regression" although
# these are logistic-regression models -- path left unchanged in case other code
# loads it; confirm and rename if appropriate.
with open('../models/linear_regression/lr_5_sent.pickle', 'wb') as f:
    pickle.dump(models, f, pickle.HIGHEST_PROTOCOL)