# GRU model

## Setup

In [5]:
# !pip install gensim

In [39]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Masking, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import gensim.downloader as api
from gensim.models import Word2Vec

In [55]:
# Read the pre-processed feature table and target table, in that order.
# Paths are relative to the directory the notebook is run from.
X, y = map(
    pd.read_csv,
    (
        "../processed_data/features_1000sample_400min_600cutoff.csv",
        "../processed_data/target_1000sample_400min_600cutoff.csv",
    ),
)

In [56]:
# One-hot encode the party labels.
# The fitted encoder is kept (instead of being thrown away) so predictions can
# later be mapped back to party names with `encoder.inverse_transform`.
party_labels = y["party"]
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(party_labels.values.reshape(-1, 1))

# Party name -> column index of the one-hot matrix. This has to be derived from
# the encoder's own category order: enumerating `unique()` gives the order of
# first appearance, which in general does not match the encoder's columns and
# would mislabel predictions when used to decode `argmax` outputs.
codes = {party: idx for idx, party in enumerate(encoder.categories_[0])}

# Integer-label alternative to the one-hot target:
# y = party_labels.map(codes)

# Keep only the raw text column as model input.
X = X["text"]

In [57]:
# Sanity check: X is the 1-D text column, y the 2-D one-hot party matrix;
# both must have the same number of rows.
X.shape, y.shape

((7000,), (7000, 7))

## Preprocessing

In [59]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Embed the training and test sentences

In [11]:
# Function to convert a sentence (raw text or list of words) into a matrix
# representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    """Embed one sentence as a stack of word vectors.

    Parameters
    ----------
    word2vec : mapping-like
        Any object supporting ``word in word2vec`` and ``word2vec[word]``
        (for example gensim ``KeyedVectors``).
    sentence : str or iterable of str
        Either raw text, which is split on whitespace, or an already
        tokenised sequence of words.

    Returns
    -------
    numpy.ndarray
        One row per in-vocabulary word, in sentence order. Words missing from
        ``word2vec`` are skipped. If no word is known the result is an empty
        array of shape ``(0,)``.
    """
    # Iterating a str yields single characters, so raw text would silently be
    # embedded character by character. Tokenise it into words first.
    if isinstance(sentence, str):
        sentence = sentence.split()

    return np.array([word2vec[word] for word in sentence if word in word2vec])

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed every sentence of ``sentences``.

    Each sentence is passed, together with ``word2vec``, to
    ``embed_sentence_with_TF``; the results are collected in a list that
    keeps the input order.
    """
    return [embed_sentence_with_TF(word2vec, text) for text in sentences]

In [12]:
# Load the pre-trained "glove-wiki-gigaword-50" vectors through gensim's downloader API.
# NOTE(review): despite the variable name these are GloVe vectors, not Word2Vec;
# the model name indicates 50 dimensions per word — confirm before sizing layers on it.
word2vec_model = api.load("glove-wiki-gigaword-50")



In [13]:
# Turn every text of the train and test splits into its matrix of word vectors.
X_train_embed, X_test_embed = (
    embedding(word2vec_model, texts) for texts in (X_train, X_test)
)

### Pad sequences to ensure uniform input size

In [22]:
maxlen = 600  # Maximum sequence length

# Both splits are padded with identical options: float vectors, padding appended
# at the end of each sequence, fixed length `maxlen`.
# NOTE(review): `truncating` is left at the Keras default, so over-long
# sequences are cut according to that default — confirm this is intended.
pad_options = {"dtype": "float32", "padding": "post", "maxlen": maxlen}
X_train_pad = pad_sequences(X_train_embed, **pad_options)
X_test_pad = pad_sequences(X_test_embed, **pad_options)

### Tokenize the text data

In [8]:
# tokenizer = Tokenizer(num_words=10000)  # Limit vocabulary size
# tokenizer.fit_on_texts(X_train)
# X_train_seq = tokenizer.texts_to_sequences(X_train)
# X_test_seq = tokenizer.texts_to_sequences(X_test)

## The model

In [24]:
# # Convert words to Word2Vec embeddings
# word_index = tokenizer.word_index
# embedding_matrix = np.zeros((len(word_index) + 1, 300))  # Assuming Word2Vec vectors are 300-dimensional

# for word, i in word_index.items():
#     if word in word2vec_model:
#         embedding_matrix[i] = word2vec_model[word]

# # Define the model
# embedding_layer = tf.keras.layers.Embedding(len(word_index) + 1,
#                                             300,  # Assuming Word2Vec vectors are 300-dimensional
#                                             weights=[embedding_matrix],
#                                             input_length=maxlen,
#                                             trainable=False)  # Freeze the embedding layer

In [67]:
# Define the model: masking layer -> single GRU -> dense head ending in a
# 7-way softmax (one output per one-hot target column).
model = Sequential([
    Masking(),
    GRU(200, activation="tanh", dropout=0.2, recurrent_dropout=0.2),
    # GRU(100, activation="tanh", dropout=0.2, recurrent_dropout=0.2),
    Dense(100, activation="relu"),
    Dense(50, activation="relu"),
    Dense(20, activation="relu"),
    Dense(7, activation="softmax"),
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Early stopping: stop once the monitored validation metric has not improved
# for 3 consecutive epochs. `restore_best_weights=True` rolls the model back to
# the best epoch when training is stopped; without it the model keeps the
# weights of the last epoch (3 epochs past the best) and the test evaluation
# would be run on those.
es = EarlyStopping(patience=3, restore_best_weights=True)

In [69]:
# Train the model; 20% of the training data is held out by Keras for validation
# and early stopping is driven by the `es` callback.
model.fit(
    np.array(X_train_pad),
    np.array(y_train),
    batch_size=32,
    epochs=10,
    shuffle=True,
    validation_split=0.2,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7f411f147220>

In [None]:
# Evaluate the model on the held-out test split and report accuracy as a percentage.
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')