<a href="https://colab.research.google.com/github/thamsuppp/FRED-Visualization/blob/master/DL_Percentile_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import re
import time
import math

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from tqdm import tqdm
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import sklearn

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
# NOTE(review): no cell visible in this notebook calls `progress_apply` - confirm it is still needed.
tqdm.pandas()

Last Updated 2 Dec

**Housekeeping**

1. Install tensorflow_gpu (enables much quicker training)
2. Install eli5
3. Install scikit-learn==0.21.3 (enables the text-highlighting visualization of the eli5 explanations; see https://github.com/TeamHG-Memex/eli5/issues/361)

**Workflow**

1. Preprocessing raw text data
2. Loading existing word embeddings to create embedding matrix
3. Train RNN model (GRU) to regress on percentiles
4. Evaluating Model (MSE)
5. Explainable Model Insights (contribution of each word to prediction)

In [2]:
# Fetch the NLTK resources needed by the preprocessing step:
# 'punkt' (tokenizer data) and 'stopwords' (stop-word corpus).
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Check that TensorFlow can see a GPU (training on CPU is much slower).
# `tf.test.is_gpu_available()` is deprecated; listing the physical GPU devices is the
# replacement recommended by TensorFlow's own deprecation notice.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
# Load the training table; the path is relative to the working directory
# (presumably a mounted Google Drive in Colab - confirm before running elsewhere).
train = pd.read_csv('drive/MyDrive/CIS520 Project/train.csv')
# Raw text of the 'content' column as a plain Python list.
content = train['content'].tolist()

In [5]:
def preprocessing(content_list):
    """Tokenize and clean a collection of raw documents.

    Each document is tokenized with ``word_tokenize``, lower-cased, stripped of
    ``string.punctuation`` characters, reduced to purely alphabetic tokens and
    finally filtered against the English stop-word list.

    Parameters
    ----------
    content_list : iterable of str
        Raw documents.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input document, in input order.
    """
    # Loop-invariant helpers: build them once instead of once per document
    # (loading the stop-word list on every iteration is needlessly expensive).
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    processed_list = []

    for line in tqdm(content_list):
        tokens = word_tokenize(line)
        # Convert to lower case
        tokens = [w.lower() for w in tokens]
        # Remove punctuation
        stripped = [w.translate(punctuation_table) for w in tokens]
        # Keep alphabetic tokens only, then filter out stopwords
        words = [w for w in stripped if w.isalpha() and w not in stop_words]

        processed_list.append(words)

    return processed_list

In [6]:
# Preprocessing the words: store the cleaned token list of every row's 'content'
# in a new 'processed_content' column (row order is preserved).
train['processed_content'] = preprocessing(train['content'])

100%|██████████| 16772/16772 [01:18<00:00, 212.78it/s]


**Training Regression Model**


In [7]:
# Extract the embeddings from the stored file
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

EMBEDDING_DIM = 100

# Maps word -> float vector of length EMBEDDING_DIM
embeddings_index = {}
embeddings_path = os.path.join('drive/MyDrive/CIS520 Project', 'word2vec_train2.txt')
# `with` guarantees the file is closed even if parsing a line raises
with open(embeddings_path, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and anything that is not "<word> <EMBEDDING_DIM numbers>",
        # e.g. a "<vocab_size> <dim>" header line if the file has one
        if len(values) != EMBEDDING_DIM + 1:
            continue
        # Parse the coefficients as floats instead of keeping an array of strings
        embeddings_index[values[0]] = np.asarray(values[1:], dtype = 'float32')

In [8]:
# Turn every tokenized document into a sequence of integer word ids
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(train['processed_content'])
sequences = tokenizer_obj.texts_to_sequences(train['processed_content'])

# Length (in tokens) of the longest article - 5587
max_length = max(map(len, train['processed_content']))
# Vocabulary size: number of distinct words known to the tokenizer, plus one
# (presumably because id 0 is reserved for padding - confirm against the Keras docs)
vocab_size = len(tokenizer_obj.word_index) + 1

# Pad all sequences to the same length -> 2D integer array (documents x max_length)
review_pad = pad_sequences(sequences, maxlen = max_length)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer_obj.word_index

In [9]:
num_words = len(word_index) + 1
words_not_found = []
# Create the embedding matrix: row i holds the pretrained vector of the word whose
# tokenizer id is i; rows without a pretrained vector stay all-zero
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Guard against ids that fall outside the matrix. Valid rows are 0 .. num_words - 1,
    # so the bound has to be `>=` (the former `>` would let i == num_words through).
    if i >= num_words:
        continue

    embedding_vector = embeddings_index.get(word)

    if embedding_vector is not None:
        # Assign the ith row of the embedding matrix to the embedding of that word
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

# Count the rows that are entirely zero. Comparing the row *sum* to 0 would also count
# non-zero vectors whose components happen to cancel out.
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

number of null word embeddings: 43


In [14]:
# Sanity check: expected shape is (num_words, EMBEDDING_DIM)
embedding_matrix.shape

(111813, 100)

**Training DL Model**


In [11]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, LSTM, GRU, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import SGD

In [18]:
def RNN_Regression_Model():
    """Build the (uncompiled) GRU regression network.

    Pipeline: integer token ids of length ``max_length`` -> frozen embedding
    initialised from ``embedding_matrix`` -> spatial dropout (0.25) -> GRU with
    32 units (dropout 0.2) -> a single dense output unit with no activation.

    Reads the notebook globals ``max_length``, ``num_words``, ``EMBEDDING_DIM``
    and ``embedding_matrix``.

    Returns
    -------
    Model
        The Keras functional model; compiling is left to the caller.
    """
    token_ids = Input(shape = (max_length,), name = 'text_sequence_input')

    # Pretrained vectors are loaded as the layer weights and kept fixed during training
    embedding_layer = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights = [embedding_matrix],
        trainable = False,
        name = 'embedding',
    )
    embedded = embedding_layer(token_ids)

    # Embedding dropout, then the recurrent encoder
    dropped = SpatialDropout1D(0.25, name='EMBEDDING_DROPOUT')(embedded)
    encoded = GRU(units = 32, dropout = 0.2)(dropped)

    # One output value per document
    prediction = Dense(1, name = 'output')(encoded)

    return Model(inputs = token_ids, outputs = prediction)

In [19]:
# Instantiate the network and print its layer / parameter overview
model = RNN_Regression_Model()
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_sequence_input (InputLa [(None, 5587)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 5587, 100)         11181300  
_________________________________________________________________
EMBEDDING_DROPOUT (SpatialDr (None, 5587, 100)         0         
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12864     
_________________________________________________________________
output (Dense)               (None, 1)                 33        
Total params: 11,194,197
Trainable params: 12,897
Non-trainable params: 11,181,300
_________________________________________________________________


In [20]:
# Quintile of each row's percentile - used only to stratify the train/validation split

# include_lowest=True keeps a percentile of exactly 0 in the first bin; without it the
# bins are (0, 0.2], (0.2, 0.4], ... so 0 becomes NaN and the astype(int) below fails
train['quintile'] = pd.cut(train['percentile'], [0, 0.2, 0.4, 0.6, 0.8, 1], labels = [1,2,3,4,5], include_lowest = True)
train['quintile'] = train['quintile'].astype(int)

# Split into train and validation set
VALIDATION_SPLIT = 0.2
dl_train, dl_val = train_test_split(train, test_size = VALIDATION_SPLIT, random_state = 42, stratify = train['quintile'])

# NOTE: the index labels are used below as row positions into review_pad, which relies on
# `train` still having its default 0..n-1 index
train_indices = dl_train.index.tolist()
val_indices = dl_val.index.tolist()

# Get the training and validation data
X_train = review_pad[train_indices]
X_val = review_pad[val_indices]

# NUMERICAL y-variable: the regression target is the raw percentile
y_train = dl_train['percentile'].to_numpy()
y_val = dl_val['percentile'].to_numpy()


print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_val: ', X_val.shape)
print('Shape of y_val: ', y_val.shape)

Shape of X_train:  (13417, 5587)
Shape of y_train:  (13417,)
Shape of X_val:  (3355, 5587)
Shape of y_val:  (3355,)


In [30]:
# Compile for regression: mean absolute error is both the loss and the reported metric
model.compile(loss = 'mean_absolute_error', optimizer = 'adam', metrics = ['mean_absolute_error'])

# Stop once the validation MAE has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor = 'val_mean_absolute_error',
    patience = 10,
    restore_best_weights = True,
)
# Keep only the best model (by validation MAE) on disk
model_checkpoint = ModelCheckpoint(
    'drive/MyDrive/CIS520 Project/word2vec_gru_content_reg.h5',
    monitor = 'val_mean_absolute_error',
    verbose = 0,
    save_best_only = True,
)

# Train the DL Model
model.fit(
    X_train,
    y_train,
    batch_size = 32,
    epochs = 50,
    validation_data = (X_val, y_val),
    verbose = 1,
    callbacks = [early_stopping, model_checkpoint],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50


<tensorflow.python.keras.callbacks.History at 0x7f6c482a3c50>

In [24]:
# Save model: writes the trained regression model to the directory below
# (separate from the best-epoch .h5 checkpoint written during training)
model.save('drive/MyDrive/CIS520 Project/word2vec_gru_content_reg1')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: drive/MyDrive/CIS520 Project/word2vec_gru_content_reg1/assets


In [None]:
# Load model: must be the same path the regression model was saved to
# (the previous path, 'word2vec_gru_content1', did not match the save location)
model = keras.models.load_model('drive/MyDrive/CIS520 Project/word2vec_gru_content_reg1')

**Evaluating the Model**

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predict on the validation data - the regression model returns a (3355, 1) matrix of predicted percentiles
val_probs = np.asarray(model.predict(X_val))
if val_probs.ndim != 2 or val_probs.shape[1] != 1:
    raise ValueError('Expected a single-output regression model, got predictions of shape %s' % (val_probs.shape,))
# Predicted percentiles as a flat vector (taking an argmax over one column would always give 0)
val_preds = val_probs.reshape(-1)

# y_val is already the 1-D vector of actual percentiles, so no argmax is needed
y_val_actual = y_val

# Regression error metrics on the validation set
val_mae = np.mean(np.abs(val_preds - y_val_actual))
val_mse = np.mean((val_preds - y_val_actual) ** 2)
print('Validation MAE: %.4f' % val_mae)
print('Validation MSE: %.4f' % val_mse)

**Model Interpretability using ELI5**

(The `eli5` package needs to be installed first)


In [25]:
import eli5
from eli5.lime import TextExplainer

In [28]:
# Define the custom predict function - input is list of strings (documents) and return matrix of shape (n_samples, 2) with [low, high] probability-like values


# Assumes you already fitted the tokenizer on the training data
def predict_complex(documents_list):
  """Score raw documents with `model` in the two-column form the explainer expects.

  Parameters
  ----------
  documents_list : list of str
      Documents as whitespace-joined tokens.

  Returns
  -------
  numpy.ndarray of shape (n_samples, 2)
      Column 0 is the "low" score, column 1 the "high" score.

  Notes
  -----
  * Single-output (regression) model: the predicted percentile, clipped to
    [0, 1], is used as the "high" score and its complement as the "low" score.
    This is a proxy, not a calibrated probability.
  * Multi-output (quintile classifier) model: "low" is the summed probability
    of the bottom 3 quintiles and "high" that of the top 2.
  """

  # Generate the sequence of tokens
  sequences = tokenizer_obj.texts_to_sequences(documents_list)

  # Pad the sequences to the length the model was built with
  X = pad_sequences(sequences, maxlen = max_length)

  # Predict
  y_preds = np.asarray(model.predict(X, batch_size = 32, verbose = 0), dtype = float)
  if y_preds.ndim == 1:
    y_preds = y_preds[:, None]

  if y_preds.shape[1] == 1:
    # Regression output: one predicted percentile per document
    y_high = np.clip(y_preds[:, 0], 0.0, 1.0)
    y_low = 1.0 - y_high
  else:
    # *** Convert this into a one-class classification of bottom 3 quintiles vs top 2 quintiles
    y_high = y_preds[:, 3:].sum(axis = 1)
    y_low = y_preds[:, 0:3].sum(axis = 1)

  y_out = np.vstack((y_low, y_high)).T
  return y_out


In [29]:
# LIME-based text explainer from eli5, seeded for repeatable results
te = TextExplainer(random_state = 42)

# Re-join the preprocessed tokens of the second validation row into one string,
# fit the explainer around that document using predict_complex, then show the explanation
doc = ' '.join(dl_val['processed_content'].iloc[1])
te.fit(doc, predict_complex)
te.explain_prediction()

ValueError: ignored

In [None]:
# Show the explainer's feature weights, labelling the two output columns 'low' and 'high'
te.explain_weights(target_names = ['low', 'high'])

Weight?,Feature
+0.465,back
+0.228,penn
+0.213,arts sciences
+0.204,tweet
+0.199,said
+0.194,received
+0.189,emergency
+0.180,university
+0.173,pm
+0.171,operations
