In [10]:
#general 
import pandas as pd
import numpy as np
import os
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split

#warnings 
import warnings 
warnings.filterwarnings("ignore")

#for RNN
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import load_model


#for performance evaluation 
from sklearn.metrics import confusion_matrix
from pretty_confusion_matrix import pp_matrix
from sklearn.metrics import precision_recall_fscore_support

#for visualization 
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import pickle

**Versions of Packages**

In [11]:
year_list = ['2009', '2010', '2012', '2013', '2014', '2015']
#year_list = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
#open_path = "C:/Users/DanielleDuncan/Desktop/THESIS/RNN pp data/"
open_path = "C:/Users/danie/Desktop/Masters Thesis/New Clean Data for Log Reg/"
#save_path = "C:/Users/DanielleDuncan/Desktop/THESIS/New RNN Models/"
save_path = "C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/"

In [12]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

In [13]:
for year in year_list:

    df = pd.read_pickle(open_path + year + "_final_RNN.pickle")

    #apply function to only get rows with an absolute count 
    df['col_type'] = df.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)

    #remove nulls 
    df = df[df["col_type"].notnull()]

    # Split the df by the preprocessed 
    x_train, x_test, y_train, y_test = train_test_split(df["pre_processed_sent"], 
                                                        df["col_type"], 
                                                        stratify = df["col_type"])

    xTrain, xTest, yTrain, yTest = train_test_split(df["string_rnn"],
                                                    df["col_type"], 
                                                    stratify = df["col_type"])

    train_dataset = tf.data.Dataset.from_tensor_slices((xTrain, yTrain)) #string_rnn here 
    test_dataset = tf.data.Dataset.from_tensor_slices((xTest, yTest)) #clean text 

    buffer_size = 50000 
    batch_size = 64 # best practice
    
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


    vocab_size = 60000 #this is 1/12 of all the words in the english language 
    encoder = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))

    encoded_vocab = np.array(encoder.get_vocabulary())
    vocab_dict = dict(enumerate(encoded_vocab))

    rnn2 = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 256, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2)])

    rnn2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                optimizer=tf.keras.optimizers.Adam(1e-4),
                metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    fitted_model = rnn2.fit(train_dataset, epochs=8,
                        validation_data=test_dataset, 
                        callbacks = [early_stop])

    rnn2.save(save_path + year + "_RNN")

    # Convert y_true and y_pred to numpy arrays
    y_true = np.concatenate([y.numpy() for _, y in test_dataset], axis=0)
    y_pred = np.concatenate([rnn2.predict(x).argmax(axis=-1) for x, _ in test_dataset], axis=0)

    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    #define the positive class as the female class 
    positive_class =1 

    #define counts for each element of confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels = [0, positive_class]).ravel()
    results_cm = {' ': ['True Negative', 'False Positive', 'False Negative', 'True Positive'],
            'Counts': [tn, fp, fn, tp]}

    df_cm = pd.DataFrame(results_cm)
    df_cm.set_index(' ', inplace=True)

    precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, pos_label=1)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', f1_score)
    print(df_cm)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8




INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2009_RNN\assets


INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2009_RNN\assets


Precision: [0.8996139  0.81167608]
Recall: [0.87484355 0.84675835]
F1 score: [0.88705584 0.82884615]
                Counts
                      
True Negative      699
False Positive     100
False Negative      78
True Positive      431
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8




INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2010_RNN\assets


INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2010_RNN\assets


Precision: [0.85816821 0.74775217]
Recall: [0.94204764 0.52457604]
F1 score: [0.89815378 0.6165907 ]
                Counts
                      
True Negative    83456
False Positive    5134
False Negative   13793
True Positive    15219
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8




INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2012_RNN\assets


INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2012_RNN\assets


Precision: [0.84021075 0.70082119]
Recall: [0.93082751 0.47789881]
F1 score: [0.88320088 0.56828019]
                Counts
                      
True Negative   151979
False Positive   11294
False Negative   28903
True Positive    26456
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8




INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2013_RNN\assets


INFO:tensorflow:Assets written to: C:/Users/danie/Desktop/Masters Thesis/RNN Models 2/2013_RNN\assets


Precision: [0.83670566 0.70228751]
Recall: [0.92618743 0.49064738]
F1 score: [0.87917557 0.57769377]
                Counts
                      
True Negative   144689
False Positive   11531
False Negative   28238
True Positive    27201
Epoch 1/8