# Recurrent Neural Networks (RNN)

In [2]:
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import plotly.express as px

# Train test split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Word2Vec Embedding
import gensim
from gensim.models import Word2Vec
from sklearn.utils import resample


from gensim.models import FastText
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ELMo Embedding
import tensorflow as tf
import tensorflow_hub as hub
import h5py

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator

#RNN
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier 
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Flatten

## Load cleaned dataset

In [3]:
# Read from CSV file
df = pd.read_csv('cleaned_combined_data.csv')
df.head()

Unnamed: 0,Categorization,Body,Label,Cleaned Text,Cleaned Text with N lemmatization,Cleaned Text with V lemmatization,Cleaned Text with A lemmatization
0,Envy to other is swallowing me,"Im from developingcountry, Indonesia , and for...",1,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...
1,Nothin outta the ordinary. Paradise. Job stres...,Um hello ....well many can relate im sure. Aft...,1,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...
2,Almost 49 and the chasm of emptiness has never...,I’ve been diagnosed severe bi polar where you ...,1,ive diagnosed severe bi polar longer even get ...,ive diagnosed severe bi polar longer even get ...,ive diagnose severe bi polar longer even get g...,ive diagnosed severe bi polar long even get go...
3,I’m happy again,"After my closest friend left me in April, I ha...",0,closest friend left april finally let go reali...,closest friend left april finally let go reali...,closest friend leave april finally let go real...,close friend left april finally let go realize...
4,Is it possible to recover from such a traumati...,"I am only 15, and yet I feel my life is alread...",1,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...


In [4]:
df.dropna(inplace=True)

## Hypothesis 1: How does different text lemmatization affect the model results?

In [5]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, make_scorer
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import RandomizedSearchCV

In [6]:
def create_rnn_model(embedding_dim, num_units, learning_rate, vocab_size, max_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(SimpleRNN(units=num_units, activation='tanh'))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    return model


def hypothesis1_test(column_name, perform_hyperparameter_tuning=False):
    # Initialize f1_score and recall
    f1 = 0.0
    recall = 0.0
    
    # Split the data into training and testing sets
    X = df[column_name]
    y = df['Label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenize the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # Define the vocabulary size
    vocab_size = len(tokenizer.word_index) + 1

    # Set the maximum sequence length (adjust as needed)
    max_sequence_length = 100  # You can choose an appropriate sequence length

    if perform_hyperparameter_tuning:
        # Define the hyperparameter distributions for RandomizedSearchCV
        param_dist = {
            'embedding_dim': np.arange(50, 151, 10),  # Range of values for embedding_dim
            'num_units': np.arange(32, 193, 32),      # Range of values for num_units
            'learning_rate': [1e-2, 1e-3, 1e-4]
        }

        # Create a custom scorer for F1-score using the f1_score function
        recall_scorer = make_scorer(recall_score)

        # Create the RandomizedSearchCV
        rnn_model = KerasClassifier(model=create_rnn_model, vocab_size=vocab_size, max_sequence_length=max_sequence_length, num_units=32, learning_rate=0.01, embedding_dim=50)
        random_search = RandomizedSearchCV(estimator=rnn_model, param_distributions=param_dist, scoring=recall_scorer, cv=5, n_iter=10, verbose=1)

        # Convert text to sequences and pad them to the specified length
        X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_sequence_length)

        # Fit the random search to the training data with numerical features
        random_search.fit(X_train_seq, y_train)

        best_hyperparameters = random_search.best_params_
        # Create the best model with the best hyperparameters
        best_model = create_rnn_model(embedding_dim=best_hyperparameters['embedding_dim'],
                                      num_units=best_hyperparameters['num_units'],
                                      learning_rate=best_hyperparameters['learning_rate'],
                                      vocab_size=vocab_size,
                                      max_sequence_length=max_sequence_length)

        # Compile the best model
        best_model.compile(optimizer=keras.optimizers.Adam(learning_rate=best_hyperparameters['learning_rate']),
                          loss='binary_crossentropy',
                          metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

        # Use the best model to make predictions on the test set
        X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_sequence_length)
        y_test_pred = (best_model.predict(X_test_seq) > 0.5).astype(int)

        # Calculate classification error
        accuracy = accuracy_score(y_test, y_test_pred)
        classification_error = 1 - accuracy

        # Calculate recall, specificity, F1 score, and AUC score
        tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        precision = precision_score(y_test, y_test_pred)
        f1 = f1_score(y_test, y_test_pred)

        auc_score = roc_auc_score(y_test, y_test_pred)

        print(f"Test accuracy: {accuracy * 100:.2f}%")
        print(f"Classification Error: {classification_error * 100:.2f}%")
        print(f"Recall: {recall * 100:.2f}%")
        print(f"Specificity: {specificity * 100:.2f}%")
        print(f"Precision: {precision * 100:.2f}%")
        print(f"F1 Score: {f1 * 100:.2f}%")
        print(f"AUC Score: {auc_score * 100:.2f}%")

        print("Best Hyperparameters:")
        print(f"Embedding Dim: {best_hyperparameters.get('embedding_dim')}")
        print(f"Num Units: {best_hyperparameters.get('num_units')}")
        print(f"Learning Rate: {best_hyperparameters.get('learning_rate')}")

    else:

        # Convert text to sequences and pad them to the specified length
        X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_sequence_length)
        X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_sequence_length)

        embedding_dim = 100  # Set your desired embedding dimension

        model = create_rnn_model(embedding_dim=embedding_dim, num_units=32, learning_rate=0.01, vocab_size=vocab_size, max_sequence_length=max_sequence_length)

        # Compile the model
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='binary_crossentropy',
                      metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

        batch_size = 32  # Set your desired batch size
        num_epochs = 10  # Set the number of training epochs

        model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs)

        result = model.evaluate(X_test, y_test)
        loss = result[0]
        accuracy = result[1]

        # Use the model to make predictions on the test set
        y_pred = model.predict(X_test)
        y_pred = (y_pred > 0.5).astype(int)  # Convert predicted probabilities to binary labels

        # Calculate classification error
        classification_error = 1 - accuracy

        # Calculate recall, specificity, F1 score, and AUC score
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        precision = precision_score(y_test, y_pred)
        f1 = 2 * (precision * recall) / (precision + recall)
        auc_score = roc_auc_score(y_test, y_pred)

        print(f"Test accuracy: {accuracy * 100:.2f}%")
        print(f"Classification Error: {classification_error * 100:.2f}%")
        print(f"Recall: {recall * 100:.2f}%")
        print(f"Specificity: {specificity * 100:.2f}%")
        print(f"Precision: {precision * 100:.2f}%")
        print(f"F1 Score: {f1 * 100:.2f}%")
        print(f"AUC Score: {auc_score * 100:.2f}%")

        print(classification_report(y_test, y_pred))  # This


In [7]:
hypothesis1_test('Cleaned Text', perform_hyperparameter_tuning=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 70.62%
Classification Error: 29.38%
Recall: 75.35%
Specificity: 64.17%
Precision: 74.18%
F1 Score: 74.76%
AUC Score: 69.76%
              precision    recall  f1-score   support

           0       0.66      0.64      0.65      1214
           1       0.74      0.75      0.75      1659

    accuracy                           0.71      2873
   macro avg       0.70      0.70      0.70      2873
weighted avg       0.71      0.71      0.71      2873



In [8]:
hypothesis1_test('Cleaned Text with N lemmatization', perform_hyperparameter_tuning=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 72.75%
Classification Error: 27.25%
Recall: 77.76%
Specificity: 65.90%
Precision: 75.70%
F1 Score: 76.72%
AUC Score: 71.83%
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      1214
           1       0.76      0.78      0.77      1659

    accuracy                           0.73      2873
   macro avg       0.72      0.72      0.72      2873
weighted avg       0.73      0.73      0.73      2873



In [9]:
hypothesis1_test('Cleaned Text with V lemmatization', perform_hyperparameter_tuning=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 71.98%
Classification Error: 28.02%
Recall: 79.08%
Specificity: 62.27%
Precision: 74.12%
F1 Score: 76.52%
AUC Score: 70.68%
              precision    recall  f1-score   support

           0       0.69      0.62      0.65      1214
           1       0.74      0.79      0.77      1659

    accuracy                           0.72      2873
   macro avg       0.71      0.71      0.71      2873
weighted avg       0.72      0.72      0.72      2873



In [10]:
hypothesis1_test('Cleaned Text with A lemmatization', perform_hyperparameter_tuning=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 71.39%
Classification Error: 28.61%
Recall: 79.57%
Specificity: 60.21%
Precision: 73.21%
F1 Score: 76.26%
AUC Score: 69.89%
              precision    recall  f1-score   support

           0       0.68      0.60      0.64      1214
           1       0.73      0.80      0.76      1659

    accuracy                           0.71      2873
   macro avg       0.71      0.70      0.70      2873
weighted avg       0.71      0.71      0.71      2873



In [11]:
hypothesis1_test('Cleaned Text with A lemmatization', perform_hyperparameter_tuning=True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Test accuracy: 52.70%
Classification Error: 47.30%
Recall: 69.20%
Specificity: 30.15%
Precision: 57.52%
F1 Score: 62.82%
AUC Score: 49.67%
Best Hyperparameters:
Embedding Dim: 110
Num Units: 192
Learning Rate: 0.0001
