In [13]:
import numpy as np # linear algebra
import pandas as pd

from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, MaxPooling1D, Flatten, SpatialDropout1D, Conv1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.optimizers import SGD
import spacy
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
spacy.require_gpu()


True

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
processed_directory = '../processed_data/'
train_df = pd.read_json('../processed_data/train.json')
test_df = pd.read_json('../processed_data/test.json')

In [4]:
train_df

Unnamed: 0,id,text,rating,label
0,0,Bromwell High is a cartoon comedy. It ran at t...,9,+
1,10000,Homelessness (or Houselessness as George Carli...,8,+
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,+
3,10002,This is easily the most underrated film inn th...,7,+
4,10003,This is not the typical Mel Brooks film. It wa...,8,+
...,...,...,...,...
24995,9998,"Towards the end of the movie, I felt it was to...",4,-
24996,9999,This is the kind of movie that my enemies cont...,3,-
24997,999,I saw 'Descent' last night at the Stockholm Fi...,3,-
24998,99,Some films that you pick up for a pound turn o...,1,-


In [5]:
def clean_text(df):
    text_list = df['text'].to_list()
    x = []
    for doc in tqdm_notebook(nlp.pipe(text_list, disable=["parser", "lemmatizer", 'ner']),total=len(text_list) ):
        output = [token.text for token in doc if not (token.is_stop or token.pos_ == 'PUNCT')]
        x.append(' '.join(output))
    df['text'] = pd.Series(x)

def get_features(df, tokenizer):
    x = tokenizer.texts_to_sequences(df['text'].values)
    x = pad_sequences(x)
    y = df['label']
    y = pd.get_dummies(y)
    return x, y

In [6]:
clean_text(train_df)
clean_text(test_df)

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [7]:
train_df

Unnamed: 0,id,text,rating,label
0,0,Bromwell High cartoon comedy ran time programs...,9,+
1,10000,Homelessness Houselessness George Carlin state...,8,+
2,10001,Brilliant acting Lesley Ann Warren Best dramat...,10,+
3,10002,easily underrated film inn Brooks cannon Sure ...,7,+
4,10003,typical Mel Brooks film slapstick movies actua...,8,+
...,...,...,...,...
24995,9998,end movie felt technical felt like classroom w...,4,-
24996,9999,kind movie enemies content watch time bloody t...,3,-
24997,999,saw Descent night Stockholm Film Festival huge...,3,-
24998,99,films pick pound turn good 23rd Century films ...,1,-


In [8]:
max_features = 1000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_df['text'].values)

In [9]:
train_x, train_y = get_features(train_df, tokenizer)
test_x, test_y = get_features(test_df, tokenizer)

In [10]:
train_x.shape

(25000, 442)

In [19]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = train_x.shape[1]))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(2,activation='sigmoid'))
model.summary()

callbacks=[
    ModelCheckpoint(filepath='./models_lstm/checkpoint',
                   save_best_only=True, verbose=1)
]

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 442, 128)          128000    
                                                                 
 conv1d_2 (Conv1D)           (None, 435, 32)           32800     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 217, 32)          0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 6944)              0         
                                                                 
 dropout (Dropout)           (None, 6944)              0         
                                                                 
 dense_5 (Dense)             (None, 30)                208350    
                                                      

In [20]:
model.compile(loss=BinaryCrossentropy(), optimizer=SGD(momentum=0.02))
model.fit(x=train_x,
         y=train_y,
         batch_size=256,
         epochs=50,
         validation_split=0.3,
         callbacks=callbacks)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.77556, saving model to ./models_lstm\checkpoint
INFO:tensorflow:Assets written to: ./models_lstm\checkpoint\assets
Epoch 2/50
Epoch 2: val_loss did not improve from 0.77556
Epoch 3/50
Epoch 3: val_loss did not improve from 0.77556
Epoch 4/50
Epoch 4: val_loss did not improve from 0.77556
Epoch 5/50
Epoch 5: val_loss did not improve from 0.77556
Epoch 6/50
Epoch 6: val_loss did not improve from 0.77556
Epoch 7/50
Epoch 7: val_loss did not improve from 0.77556
Epoch 8/50
Epoch 8: val_loss did not improve from 0.77556
Epoch 9/50
Epoch 9: val_loss did not improve from 0.77556
Epoch 10/50
Epoch 10: val_loss did not improve from 0.77556
Epoch 11/50
Epoch 11: val_loss did not improve from 0.77556
Epoch 12/50
Epoch 12: val_loss did not improve from 0.77556
Epoch 13/50
Epoch 13: val_loss did not improve from 0.77556
Epoch 14/50
Epoch 14: val_loss did not improve from 0.77556
Epoch 15/50
Epoch 15: val_loss did not improve from 0.77556
Epoch 16/

Epoch 39: val_loss did not improve from 0.77556
Epoch 40/50
Epoch 40: val_loss did not improve from 0.77556
Epoch 41/50
Epoch 41: val_loss did not improve from 0.77556
Epoch 42/50
Epoch 42: val_loss did not improve from 0.77556
Epoch 43/50
Epoch 43: val_loss did not improve from 0.77556
Epoch 44/50
Epoch 44: val_loss did not improve from 0.77556
Epoch 45/50
Epoch 45: val_loss did not improve from 0.77556
Epoch 46/50
Epoch 46: val_loss did not improve from 0.77556
Epoch 47/50
Epoch 47: val_loss did not improve from 0.77556
Epoch 48/50
Epoch 48: val_loss did not improve from 0.77556
Epoch 49/50
Epoch 49: val_loss did not improve from 0.77556
Epoch 50/50
Epoch 50: val_loss did not improve from 0.77556


<keras.callbacks.History at 0x1b0a5971700>