In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.3)

In [37]:
# Path to the input CSV, relative to the notebook's working directory.
file = "../data/debt_only.csv"
# Read the full CSV into a DataFrame; later cells pick the columns they need.
df = pd.read_csv(filepath_or_buffer=file)

In [38]:
# Keep only the id, text and label columns, and drop any row with a missing value.
model_df = df[['Complaint ID', 'tokenized_text', 'Issue']].dropna()
# Discard rows whose text has fewer than 10 characters. The explicit .copy() makes
# model_df an independent frame, so the column assignment performed later on it
# does not raise pandas' SettingWithCopyWarning (or write to a temporary view).
model_df = model_df[model_df['tokenized_text'].str.len() >= 10].copy()

In [39]:
# Number of rows per raw 'Issue' label (displayed as the cell output).
model_df['Issue'].value_counts()

Attempts to collect debt not owed                                23867
Cont'd attempts collect debt not owed                            17376
Communication tactics                                            12600
Written notification about debt                                  10689
False statements or representation                                9510
Disclosure verification of debt                                   7580
Took or threatened to take negative or legal action               6611
Taking/threatening an illegal action                              2934
Improper contact or sharing of info                               2910
Threatened to contact someone or share information improperly     1702
Name: Issue, dtype: int64

In [40]:
# Short code for every raw 'Issue' label that appears in the data.
# NOTE(review): two different issues share the code 'IC' -- presumably an
# intentional merge of equivalent categories; confirm.
abbrev_map = {
    "Attempts to collect debt not owed": "DNO",
    "Communication tactics": "CT",
    "Cont'd attempts collect debt not owed": "CDNO",
    "Disclosure verification of debt": "DV",
    "False statements or representation": "FS",
    "Improper contact or sharing of info": "IC",
    "Taking/threatening an illegal action": "TIA",
    "Threatened to contact someone or share information improperly": "IC",
    "Took or threatened to take negative or legal action": "TNA",
    "Written notification about debt": "WN",
}
# Direct dict indexing (rather than .get) so an unmapped issue raises KeyError
# instead of silently producing a missing target.
model_df['target'] = model_df['Issue'].apply(lambda issue: abbrev_map[issue])

In [41]:
# Codes kept for modelling; rows with any other code are dropped.
valid_types = ['DNO', 'CT', 'WN', 'FS', 'DV', 'TNA']
model_df = model_df.loc[lambda frame: frame['target'].isin(valid_types)]
# Alphabetically sorted list of the codes that remain after filtering.
targets = sorted(set(model_df['target']))

In [75]:
# Share of rows belonging to each target code (counts divided by total rows).
model_df['target'].value_counts().div(len(model_df))

DNO    0.336833
CT     0.177823
WN     0.150853
FS     0.134214
DV     0.106976
TNA    0.093301
Name: target, dtype: float64

In [46]:
from keras.callbacks import EarlyStopping
from keras.layers import Dense, LSTM, SpatialDropout1D, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [47]:
# Vocabulary cap handed to the tokenizer (num_words) and reused as the
# embedding layer's input dimension.
MAX_WORDS = 50000
# Every document is padded/truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 250

# Size of each learned word vector.
EMBEDDING_DIM = 100

# Characters stripped from the text before splitting into tokens. The backslash
# is written as an explicit '\\' escape: the previous '\]' was an invalid escape
# sequence that only worked by accident and emits a warning on modern Python.
# The resulting runtime string is unchanged.
tokenizer = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(model_df['tokenized_text'].values)
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 44091 unique tokens.


In [49]:
# Convert each text to a list of token ids, then pad/truncate every list to
# MAX_SEQUENCE_LENGTH so the result is a rectangular array.
X = pad_sequences(
    tokenizer.texts_to_sequences(model_df['tokenized_text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor: ', X.shape)

Shape of data tensor:  (70857, 250)


In [52]:
# One-hot encode the target codes; one column per distinct code.
y = pd.get_dummies(model_df['target']).to_numpy()
print('Shape of label tensor: ', y.shape)

Shape of label tensor:  (70857, 6)


In [53]:
# Hold out 10% of the rows for final evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(63771, 250) (63771, 6)
(7086, 250) (7086, 6)


In [55]:
from keras.layers import Embedding

In [59]:
# Embedding -> spatial dropout -> single LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot label column, taken from the label tensor so the
# layer stays in sync with the class list instead of a hard-coded 6.
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 606       
Total params: 5,081,006
Trainable params: 5,081,006
Non-trainable params: 0
_________________________________________________________________
None


In [60]:
# Training schedule.
epochs = 5
batch_size = 64

# Stop when validation loss has not improved by at least min_delta for
# `patience` consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

# validation_split sets aside 10% of the training data for monitoring val_loss.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)


Train on 57393 samples, validate on 6378 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
# Model outputs for the held-out set: one row per sample, one column per class.
pred = model.predict(x=X_test)

In [68]:
# Translate each one-hot test label back to its target code via the index of
# its largest entry.
y_true = [targets[class_idx] for class_idx in y_test.argmax(axis=1)]

In [72]:
# Predicted target code = the class with the highest model output per sample.
y_pred = [targets[class_idx] for class_idx in pred.argmax(axis=1)]

In [73]:
# Per-class precision / recall / F1 on the held-out set. Passing labels=targets
# pins the report rows to the full class list, so a class that happens to be
# absent from both y_true and y_pred still gets a row instead of being dropped.
print(classification_report(y_true, y_pred, labels=targets))

              precision    recall  f1-score   support

          CT       0.70      0.72      0.71      1187
         DNO       0.60      0.71      0.65      2462
          DV       0.43      0.31      0.36       760
          FS       0.50      0.22      0.30       995
         TNA       0.35      0.46      0.40       657
          WN       0.43      0.47      0.45      1025

    accuracy                           0.54      7086
   macro avg       0.50      0.48      0.48      7086
weighted avg       0.54      0.54      0.53      7086

