# RNN text classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding

## Import Data

In [2]:
# Load the pre-cleaned movie table from the project's data directory.
movies_path = './data/movies_cleaned.csv'
movies = pd.read_csv(movies_path)

In [3]:
# Start with a simple binary classification task using the two most common genres.
is_target_genre = movies['Genre'].isin(['comedy', 'drama'])
# Take an independent (deep) copy so the subset can be used without touching `movies`.
data = movies.loc[is_target_genre].copy(deep=True)

In [4]:
# Check the row count, column dtypes and non-null counts of the filtered frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10343 entries, 1 to 20956
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      10343 non-null  int64 
 1   Title             10343 non-null  object
 2   Origin/Ethnicity  10343 non-null  object
 3   Director          10343 non-null  object
 4   Genre             10343 non-null  object
 5   Plot              10343 non-null  object
 6   plot_length       10343 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 646.4+ KB


## Baseline

In [5]:
# Class proportions of the two genres. The larger share is the accuracy a model
# has to beat to do better than always predicting the majority class.
data['Genre'].value_counts(normalize=True)

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

drama     0.576622
comedy    0.423378
Name: Genre, dtype: float64

In [6]:
# Hold out a test split of the plots and their genre labels.
# - random_state fixes the shuffle so the split (and every metric computed from it)
#   is reproducible after a kernel restart.
# - stratify keeps the drama/comedy ratio the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    data['Plot'],
    data['Genre'],
    random_state=42,
    stratify=data['Genre'],
)

## Prepare text

In [7]:
# Build a word-index vocabulary capped at 1000 entries, learned from the training plots only.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

In [8]:
# Encode the training plots as integer sequences using the tokenizer fitted on X_train.
X_train_token = tokenizer.texts_to_sequences(X_train)

In [9]:
# Encode the test plots with the same tokenizer (its vocabulary comes from X_train only).
X_test_token = tokenizer.texts_to_sequences(X_test)

In [10]:
# Largest value in the precomputed plot_length column
# (presumably a per-plot word count -- TODO confirm the unit).
data['plot_length'].max()

2958

In [21]:
# Force every training sequence to exactly 1000 tokens: shorter ones are
# zero-padded at the front, longer ones lose their leading tokens.
X_train_seq = pad_sequences(
    X_train_token,
    maxlen=1000,
    padding='pre',
    truncating='pre',
)

In [22]:
# Force every test sequence to exactly 1000 tokens: shorter ones are
# zero-padded at the front, longer ones lose their leading tokens.
X_test_seq = pad_sequences(
    X_test_token,
    maxlen=1000,
    padding='pre',
    truncating='pre',
)

In [13]:
# Binary-encode the training labels: 1 for 'drama', 0 for anything else.
y_train_token = np.where(y_train.eq('drama'), 1, 0)

In [14]:
# Binary-encode the test labels: 1 for 'drama', 0 for anything else.
y_test_token = np.where(y_test.eq('drama'), 1, 0)

In [15]:
# Inspect the encoded training labels (1 = 'drama', 0 = otherwise).
y_train_token

array([1, 1, 1, ..., 0, 0, 1])

## Model

### RNN

In [23]:
# SimpleRNN classifier: embedding -> recurrent encoder -> two dense layers -> sigmoid output.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    SimpleRNN(16),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [24]:
# Binary cross-entropy loss with the Adam optimizer; track accuracy during training.
model.compile(
    optimizer='adam',
    loss='bce',
    metrics=['acc'],
)

In [26]:
# Train the SimpleRNN model for 10 epochs, scoring on the held-out split after each epoch.
history = model.fit(
    X_train_seq,
    y_train_token,
    validation_data=(X_test_seq, y_test_token),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### LSTM

In [27]:
# LSTM classifier: same stack as the SimpleRNN model, with an LSTM as the recurrent encoder.
model2 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(16),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    optimizer='adam',
    loss='bce',
    metrics=['acc'],
)

In [28]:
# Train the LSTM model for 10 epochs, scoring on the held-out split after each epoch.
history2 = model2.fit(
    X_train_seq,
    y_train_token,
    validation_data=(X_test_seq, y_test_token),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### GRU

In [30]:
# GRU classifier: same stack as the SimpleRNN model, with a GRU as the recurrent encoder.
model3 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    GRU(16),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model3.compile(
    optimizer='adam',
    loss='bce',
    metrics=['acc'],
)

In [31]:
# Train the GRU model for 10 epochs, scoring on the held-out split after each epoch.
history3 = model3.fit(
    X_train_seq,
    y_train_token,
    validation_data=(X_test_seq, y_test_token),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
  6/243 [..............................] - ETA: 3:10 - loss: 0.6622 - acc: 0.6447

KeyboardInterrupt: 

## Optimizing the LSTM model

In [32]:
# Pad to the longest *tokenized* training sequence so no training plot is truncated.
# The length is taken from the token sequences themselves rather than from the
# `plot_length` column: that column covers the whole dataset (test rows included)
# and describes the raw text, not the sequences left after the tokenizer's
# vocabulary cut, so it does not measure what is actually being padded.
max_token_len = max(len(seq) for seq in X_train_token)
X_train_seq2 = pad_sequences(X_train_token, maxlen=max_token_len)
# Test sequences use the same length; any that are longer are truncated from the front
# (the pad_sequences default).
X_test_seq2 = pad_sequences(X_test_token, maxlen=max_token_len)

In [35]:
from tensorflow.keras.layers import Dropout

In [36]:
# LSTM classifier with 20% dropout after each hidden dense layer.
model4 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(16),
    Dense(200, activation='relu'),
    Dropout(0.2),
    Dense(200, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model4.compile(
    optimizer='adam',
    loss='bce',
    metrics=['acc'],
)

In [37]:
# Train the dropout LSTM on the longer-padded sequences for 10 epochs,
# scoring on the held-out split after each epoch.
history4 = model4.fit(
    X_train_seq2,
    y_train_token,
    validation_data=(X_test_seq2, y_test_token),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
