In [None]:
#general 
import pandas as pd
import numpy as np
import os
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split

#warnings 
import warnings 
warnings.filterwarnings("ignore")

#for RNN
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import load_model


#for performance evaluation 
from sklearn.metrics import confusion_matrix
from pretty_confusion_matrix import pp_matrix
from sklearn.metrics import precision_recall_fscore_support

#for visualization 
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import pickle

**Versions of Packages**

In [None]:
# Years processed by this notebook run, as strings because they are spliced
# directly into file names.
# Previously used (now disabled) list: '2009', '2010', '2012', '2013', '2014', '2015'.
year_list = [str(year) for year in range(2016, 2023)]

# Input directory with the per-year pickles. The trailing slash matters: file
# names are appended to it by plain string concatenation.
open_path = "C:/Users/danie/Desktop/Masters Thesis/New Clean Data for Log Reg/"

# Output directory for the trained per-year models (same trailing-slash rule).
save_path = "C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/"

In [None]:
def absolute_count(male_col, female_col):
    """Turn a pair of gender counts into a binary label, or reject the row.

    Parameters
    ----------
    male_col : number
        Male count for the row.
    female_col : number
        Female count for the row.

    Returns
    -------
    int or None
        1 when the female count exceeds the male count and the male count is
        zero; 0 in the mirrored situation; None for every other combination.
    """
    female_only = female_col > male_col and male_col == 0
    if female_only:
        return 1

    male_only = male_col > female_col and female_col == 0
    if male_only:
        return 0

    # Neither side is the sole non-zero count: the caller drops such rows.
    return None

In [None]:
# Best-performing architecture so far. It is wrapped in a factory so that every
# year gets an untrained model wired to the encoder adapted on that year's
# training text. Building it once at module level needed `encoder` before the
# loop had created it, and re-fitting that single instance carried weights from
# one year into the next.
def build_rnn2(text_encoder):
    """Build a fresh, uncompiled bidirectional-LSTM text classifier.

    Parameters
    ----------
    text_encoder : tf.keras.layers.TextVectorization
        Vectorization layer that has already been adapted. Its vocabulary
        length sets the embedding's input dimension, so it must be adapted
        before this function is called.

    Returns
    -------
    tf.keras.Sequential
        Model mapping raw strings to two unnormalised scores (logits), one per
        class; pair it with a loss configured with ``from_logits=True``.
    """
    return tf.keras.Sequential([
        text_encoder,
        # mask_zero=True lets the recurrent layers skip positions with token id 0.
        tf.keras.layers.Embedding(len(text_encoder.get_vocabulary()), 256, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(2),
    ])


# Loop-invariant settings, hoisted out of the per-year loop.
split_seed = 42      # fixes the train/test partition so reruns are comparable
buffer_size = 50000  # shuffle buffer for the training pipeline
batch_size = 64
vocab_size = 60000   # upper bound on the vocabulary kept by TextVectorization

for year in year_list:

    # NOTE: unpickling can execute code stored in the file; only load pickles
    # produced by a trusted pipeline.
    df = pd.read_pickle(open_path + year + "_final_RNN.pickle")

    # Label each row (1 / 0) and drop the rows absolute_count rejects with None.
    df['col_type'] = df.apply(lambda row: absolute_count(row['male_count'], row['female_count']), axis=1)
    df = df[df["col_type"].notnull()]

    # One seeded, stratified split of the whole frame. Deriving every column
    # from the same partition keeps the pre-processed text, the RNN text and
    # the labels row-aligned; two independent unseeded splits did not.
    train_df, test_df = train_test_split(df,
                                         stratify=df["col_type"],
                                         random_state=split_seed)

    x_train, x_test = train_df["pre_processed_sent"], test_df["pre_processed_sent"]
    y_train, y_test = train_df["col_type"], test_df["col_type"]

    xTrain, xTest = train_df["string_rnn"], test_df["string_rnn"]
    yTrain, yTest = y_train, y_test

    # Both pipelines feed the `string_rnn` column to the network.
    train_dataset = tf.data.Dataset.from_tensor_slices((xTrain, yTrain))
    test_dataset = tf.data.Dataset.from_tensor_slices((xTest, yTest))

    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Learn the vocabulary from the training text only, so no test tokens leak in.
    encoder = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))

    # Index -> token lookup for this year's vocabulary.
    encoded_vocab = np.array(encoder.get_vocabulary())
    vocab_dict = dict(enumerate(encoded_vocab))

    # Fresh model per year, built only after the encoder has been adapted.
    rnn2 = build_rnn2(encoder)
    rnn2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer=tf.keras.optimizers.Adam(1e-4),
                 metrics=['accuracy'])

    # With 8 epochs and patience 5 this can only cut training short in the
    # final epochs.
    # NOTE(review): the held-out split doubles as the validation set that
    # drives early stopping - confirm this is intended.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    fitted_model = rnn2.fit(train_dataset, epochs=8,
                            validation_data=test_dataset,
                            callbacks=[early_stop])

    # NOTE(review): newer Keras releases require a `.keras` or `.h5` suffix in
    # `save`; confirm the installed version accepts this extension-less path.
    rnn2.save(save_path + year + "_RNN")