# Long Short-Term Memory (LSTM)

In [1]:
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import plotly.express as px

# Train test split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Word2Vec Embedding
import gensim
from gensim.models import Word2Vec
from sklearn.utils import resample


from gensim.models import FastText
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import KeyedVectors

from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ELMo Embedding
import tensorflow as tf
import tensorflow_hub as hub
import h5py

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import random
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
# below here might have same instance
import numpy as np
import statistics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve
import plotly.express as px
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense







## Load Cleaned Dataset

In [2]:
# Read from CSV file
df = pd.read_csv('cleaned_combined_data.csv')
df.head()

Unnamed: 0,Categorization,Body,Label,Cleaned Text,Cleaned Text with N lemmatization,Cleaned Text with V lemmatization,Cleaned Text with A lemmatization
0,Envy to other is swallowing me,"Im from developingcountry, Indonesia , and for...",1,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...,im developingcountry indonesia temporary work ...
1,Nothin outta the ordinary. Paradise. Job stres...,Um hello ....well many can relate im sure. Aft...,1,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...,um hello well many relate im sure today im con...
2,Almost 49 and the chasm of emptiness has never...,I’ve been diagnosed severe bi polar where you ...,1,ive diagnosed severe bi polar longer even get ...,ive diagnosed severe bi polar longer even get ...,ive diagnose severe bi polar longer even get g...,ive diagnosed severe bi polar long even get go...
3,I’m happy again,"After my closest friend left me in April, I ha...",0,closest friend left april finally let go reali...,closest friend left april finally let go reali...,closest friend leave april finally let go real...,close friend left april finally let go realize...
4,Is it possible to recover from such a traumati...,"I am only 15, and yet I feel my life is alread...",1,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...,15 yet feel life already pit emptiness stomach...


In [3]:
rows_with_nan = df[df.isna().any(axis=1)]
print(rows_with_nan)

                               Categorization Body  Label Cleaned Text  \
796  I wish Somebody could understand my pain    (      1          NaN   

    Cleaned Text with N lemmatization Cleaned Text with V lemmatization  \
796                               NaN                               NaN   

    Cleaned Text with A lemmatization  
796                               NaN  


In [4]:
# Drop rows with NaN values in place
df.dropna(inplace=True)

## Hypothesis 1: How does different text lemmatization affect the model results?

There are different parts of speech (pos) to lemmatize by
1. Noun (pos = 'n'):
- "cats" -> "cat"
- "dogs" -> "dog"
- "books" -> "book" 
2. Verb (pos = 'v')
- "running" -> "run"
- "eating" -> "eat"
- "swimming" -> "swim"
3. Adjectives (pos = "a")
- better -> "good"
- "happier" -> 'happy'
- "brighter" -> "bright"

We have 4 sets of columns - "Cleaned Text", "Cleaned Text with N lemmatization", "Cleaned Text with V lemmatization", "Cleaned Text with A lemmatization"

We'll now how lemmatization affects model results

### Model Function

Use of function for ease of model reuse
 
**Column used: Function input**

<u>**Standardised variables**</u>
- **Model: LSTM**
- **Feature Extraction: CountVectorizer**
- **Feature Transformation: TfidfTransformer** 

<u>**Steps**</u>
1. Process data into numerical representation
2. Fit Model
3. Get results


In [6]:
def model_for_hypothesis_1(column_name, df):
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

    X = df[column_name]
    y = df['Label']

    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(X)

    tfidf_transformer = TfidfTransformer()
    X_tfidf = tfidf_transformer.fit_transform(X_counts)

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

    X_train = X_train.toarray().reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.toarray().reshape(X_test.shape[0], 1, X_test.shape[1])

    model = Sequential()
    model.add(LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

    y_pred = model.predict(X_test)
    y_pred = [1 if val > 0.5 else 0 for val in y_pred]

    confusion = confusion_matrix(y_test, y_pred)
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    TP = confusion[1, 1]

    accuracy = accuracy_score(y_test, y_pred)
    classification_error = 1 - accuracy
    sensitivity = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)
    auc_roc_score = roc_auc_score(y_test, y_pred)

    print('Accuracy: {:.2f}%'.format(accuracy * 100))
    print('Classification Error: {:.2f}%'.format(classification_error * 100))
    print('Sensitivity: {:.2f}%'.format(sensitivity * 100))
    print('Precision: {:.2f}%'.format(precision * 100))
    print('Specificity: {:.2f}%'.format(specificity * 100))
    print('F1 Score: {:.2f}%'.format(f1 * 100))
    print('AUC Score: {:.2f}%'.format(auc_roc_score * 100))

### Base model

- **Column used: "Cleaned Text"**
Description:
"Cleaned Text" is preprocessed data without text lemmatization

In [7]:
model_for_hypothesis_1("Cleaned Text", df)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 79.67%
Classification Error: 20.33%
Sensitivity: 83.97%
Precision: 81.41%
Specificity: 73.81%
F1 Score: 82.67%
AUC Score: 78.89%


### Model 1

- **Column used: "Cleaned Text with N lemmatization"**
Description:
"Cleaned Text with N lemmatization" is preprocessed data text lemmatization by Nouns

In [8]:
model_for_hypothesis_1("Cleaned Text with N lemmatization", df)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 79.67%
Classification Error: 20.33%
Sensitivity: 84.09%
Precision: 81.34%
Specificity: 73.64%
F1 Score: 82.69%
AUC Score: 78.86%


### Model 2

- **Column used: "Cleaned Text with V lemmatization"**
Description:
"Cleaned Text with V lemmatization" is preprocessed data with text lemmatization by verbs

In [9]:
model_for_hypothesis_1("Cleaned Text with V lemmatization", df)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 79.74%
Classification Error: 20.26%
Sensitivity: 84.27%
Precision: 81.33%
Specificity: 73.56%
F1 Score: 82.77%
AUC Score: 78.91%


### Model 3

- **Column used: "Cleaned Text with A lemmatization"**
Description:
"Cleaned Text with A lemmatization" is preprocessed data with text lemmatization by adjectives

In [10]:
model_for_hypothesis_1("Cleaned Text with A lemmatization", df)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 79.67%
Classification Error: 20.33%
Sensitivity: 84.15%
Precision: 81.30%
Specificity: 73.56%
F1 Score: 82.70%
AUC Score: 78.85%


#### Conclusion
Out of different lemmatization, we see that V Lemmantization has the best results in terms of sensitivity(recall), 84.27%, accuracy, 79.74%, and f1-score at 82.77% <br>
Thus we will be using V Lemmantization for the rest of this notebook.


## Hypothesis 2: How does different text extraction methods affect the results?

<u>**Different Text Extraction methods**</u>

1. **Bag of Words (BoW) with TF-IDF transformation**: It creates a vector of word counts for each document.TF-IDF is used to assign weights to words based on their frequency within a document and their importance in the entire corpus. This step converts the BoW features into TF-IDF features.

2. **Word Embeddings**: Represent words as dense vectors in a continuous vector space. Word embeddings can be averaged or combined to represent documents.

    - Examples of Word embedding models: Word2Vec, GloVe, and FastText 
    
3. **Doc2Vec**: An extension of Word2Vec that learns document-level embeddings, allowing you to represent entire documents as vectors.


### Model Function

Use of function for ease of model reuse
 

<u>**Steps**</u>
1. Use of Stratified K-fold to split data
2. Fit Model
3. Get results


In [5]:
def perform_kfold_cross_validation_with_lstm(word_embeddings, labels):
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

    num_of_folds = 10
    skf = StratifiedKFold(n_splits=num_of_folds, shuffle=True, random_state=460)

    # List of metrics for each fold
    k_fold_accuracy = []
    k_fold_classification_error = []
    k_fold_sensitivity = []
    k_fold_precision = []
    k_fold_specificity = []
    k_fold_f1_score = []
    auc_roc_scores = []
    fpr_values = []
    tpr_values = []

    for train_index, test_index in skf.split(word_embeddings, labels):
        X_train, X_test = word_embeddings[train_index], word_embeddings[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        vectorized_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
        vectorized_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

        model = Sequential()
        model.add(LSTM(100, input_shape=(vectorized_train.shape[1], vectorized_train.shape[2])))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        model.fit(vectorized_train, y_train, epochs=5, batch_size=64, verbose=1)
        y_pred = (model.predict(vectorized_test) > 0.5).astype("int32")

        # Calculating metrics for evaluation
        confusion = confusion_matrix(y_test, y_pred)
        TN, FP, FN, TP = confusion[0, 0], confusion[0, 1], confusion[1, 0], confusion[1, 1]
        accuracy = accuracy_score(y_test, y_pred)
        classification_error = 1 - accuracy
        sensitivity = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        specificity = TN / (TN + FP)
        f1_score = (2 * sensitivity * precision) / (sensitivity + precision)
        y_pred_proba = model.predict(vectorized_test)
        auc_roc_score = roc_auc_score(y_test, y_pred_proba)

        # Appending metrics to lists
        k_fold_accuracy.append(round(accuracy, 4))
        k_fold_classification_error.append(round(classification_error, 4))
        k_fold_sensitivity.append(round(sensitivity, 4))
        k_fold_precision.append(round(precision, 4))
        k_fold_specificity.append(round(specificity, 4))
        k_fold_f1_score.append(round(f1_score, 4))
        auc_roc_scores.append(auc_roc_score)

        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        fpr_values.append(fpr)
        tpr_values.append(tpr)

    print('The average accuracy is: {:.2f}%'.format(statistics.mean(k_fold_accuracy) * 100))
    print('The average classification error is: {:.2f}%'.format(statistics.mean(k_fold_classification_error) * 100))
    print('The average sensitivity is: {:.2f}%'.format(statistics.mean(k_fold_sensitivity) * 100))
    print('The average precision is: {:.2f}%'.format(statistics.mean(k_fold_precision) * 100))
    print('The average specificity is: {:.2f}%'.format(statistics.mean(k_fold_specificity) * 100))
    print('The average f1 score is: {:.2f}%'.format(statistics.mean(k_fold_f1_score) * 100))

    weights = [len(test_index) / len(labels) for _, test_index in skf.split(word_embeddings, labels)]
    avg_auc_roc = np.average(auc_roc_scores, weights=weights, axis=0)
    print('The average AUC Score is: {:.2f}%'.format(avg_auc_roc * 100))

    max_length_fpr = max(len(a) for a in fpr_values)
    max_length_tpr = max(len(a) for a in tpr_values)
    padded_fpr = np.array([np.pad(a, (0, max_length_fpr - len(a)), mode='constant') for a in fpr_values])
    padded_tpr = np.array([np.pad(a, (0, max_length_tpr - len(a)), mode='constant') for a in tpr_values])

    avg_fpr_value = np.average(padded_fpr, weights=weights, axis=0)
    avg_tpr_value = np.average(padded_tpr, weights=weights, axis=0)

    fig = px.area(
        x=avg_fpr_value, y=avg_tpr_value,
        title=f'ROC Curve (AUC={avg_auc_roc:.4f})',
        labels=dict(x='Average False Positive Rate', y='Average True Positive Rate'),
        width=700, height=700
    )

    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()

    x_axis = [i for i in range(1, num_of_folds + 1)]
    fig = px.scatter(x=x_axis, y=auc_roc_scores,
                     labels={"x": "K-Fold", "y": "AUC Score"},
                     trendline='ols',
                     title='AUC Values for the each K-Fold'
                     )
    fig.show()

### Base Model

- **Column used: "Cleaned Text with V lemmatization"**
- **Model: LSTM**
- **Feature Extraction: Bag of Words (BoW) with TF-IDF transformation**
Explanation:
CountVectorizer is a feature extraction step. It converts text data into a matrix of word counts, where each row represents a document, and each column represents a unique word or token in the corpus. This step extracts features from the text.
- **Feature Transformation: TfidfTransformer** 
Explanation:
TfidfTransformer is a feature transformation step. It takes the count matrix produced by CountVectorizer and applies the TF-IDF (Term Frequency-Inverse Document Frequency) transformation to it. This transformation adjusts the word counts to emphasize the importance of words in the text data. It doesn't extract new features but transforms the existing features.




In [14]:
X1 = df['Cleaned Text with V lemmatization'] # features
y = df['Label']

vectorizer = CountVectorizer()
X1_counts = vectorizer.fit_transform(X1)

# Creating a TfidfTransformer instance
transformer = TfidfTransformer()
X1_tfidf = transformer.fit_transform(X1_counts)

# Convert the result to a DataFrame
feature_names = vectorizer.get_feature_names_out()
X1_tfidf_df = pd.DataFrame(X1_tfidf.toarray(), columns=feature_names)

perform_kfold_cross_validation_with_lstm(X1_tfidf_df.to_numpy(), y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 81.00%
The average classification error is: 19.00%
The average sensitivity is: 85.08%
The average precision is: 83.28%
The average specificity is: 75.05%
The average f1 score is: 84.16%
The average AUC Score is: 88.46%


### Model 1: Use GloVe Embedding (Version 1)

- **Column used: "Cleaned Text with V lemmatization"**
- **Model: LSTM**
- **Feature Extraction: GloVe Embedding**
Description:

GloVe is designed to capture the semantic relationships and meaning of words based on their co-occurrence statistics in large text corpora. Its key advantage is its ability to capture word semantics and relationships through the distributional information in large text corpora. The resulting word vectors can be used as features in NLP 

**Approach: Average Word Embeddings with GloVe**

Explanation:
The approach calculates the average word embeddings for each text sequence. Each document is represented as an average of word vectors.

Pros:
Dimensionality reduction: The feature matrix has a lower dimensionality compared to direct document vectors.
Simplicity: Averages can capture the overall meaning of the text while reducing the feature space.

Cons:
Loss of word order: Averaging word embeddings doesn't capture word order information, which might be important in some cases.
May not capture all nuances: Averaging can simplify the representation and might not capture nuanced differences in text.



**Feature Transformation: nil** 

Explanation:
There isn't a for need additional feature transformation steps like TF-IDF, PCA, or LDA because the word embeddings themselves already capture a dense, distributed representation of words. GloVe word embeddings inherently contain semantic and contextual information.

In the case of GloVe embeddings, each word is represented as a high-dimensional vector, and these vectors capture relationships between words based on their co-occurrence statistics in the training corpus. This means that words with similar meanings or contexts will have similar vector representations.

Therefore, we can directly use GloVe word embeddings as features for your text classification model without the need for feature transformation steps

**Pre-Trained GloVe Embedding Model**

The pre-trained GLoVe embedding we used was trained on twitter datasets. We decided that this would be most appropriate as our project is involving social media posts and comments.

Link to GloVe Embeddings: https://nlp.stanford.edu/projects/glove/


#### Version 1
In this version, we experimented calculating the average word embeddings for each text sequence. Each document is represented as an average of word vectors. We decided to explore how well the individual words can help us to predict whether a post will be classified as stressed.


In [16]:
# Tokenize the text and pad sequences
texts = df['Cleaned Text with V lemmatization']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxLen = 150
sequences = pad_sequences(sequences, maxlen=maxLen)

# Create an embedding matrix
vocab_len = len(tokenizer.word_index) + 1
embed_vector_len = 100
embedding_matrix = np.zeros((vocab_len, embed_vector_len))

# Load GloVe embeddings
glove_path = 'glove.twitter.27B.100d.txt' 
glove_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_index[word] = vector

# Create a feature matrix with average word embeddings using GloVe
feature_matrix = np.array([
    np.mean([glove_index.get(word, np.zeros(embed_vector_len)) for word in tokenizer.sequences_to_texts([seq])[0].split()], axis=0)
    for seq in sequences
])

In [17]:
y = df['Label']
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 80.26%
The average classification error is: 19.74%
The average sensitivity is: 86.69%
The average precision is: 81.31%
The average specificity is: 70.88%
The average f1 score is: 83.91%
The average AUC Score is: 87.55%


#### Experimenting with different vector sizes
Since GloVe embeddings are pretrained with specific parameters (window,min_count,vector_size) that can't be changed, we ran our model with different vector sizes of 50, 100 and 200 to compare which vector size would be most appropriate.

##### Vector Size of 50

In [19]:
# Tokenize the text and pad sequences
texts = df['Cleaned Text with V lemmatization']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxLen = 150
sequences = pad_sequences(sequences, maxlen=maxLen)

# Create an embedding matrix
vocab_len = len(tokenizer.word_index) + 1
embed_vector_len = 50
embedding_matrix = np.zeros((vocab_len, embed_vector_len))

# Load GloVe embeddings
glove_path = 'glove.twitter.27B.50d.txt' 
glove_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_index[word] = vector

# Create a feature matrix with average word embeddings using GloVe
feature_matrix = np.array([
    np.mean([glove_index.get(word, np.zeros(embed_vector_len)) for word in tokenizer.sequences_to_texts([seq])[0].split()], axis=0)
    for seq in sequences
])

y = df['Label']
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 79.06%
The average classification error is: 20.94%
The average sensitivity is: 86.10%
The average precision is: 80.14%
The average specificity is: 68.78%
The average f1 score is: 82.99%
The average AUC Score is: 85.97%


##### Vector Size of 200

In [22]:
# Tokenize the text and pad sequences
texts = df['Cleaned Text with V lemmatization']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxLen = 150
sequences = pad_sequences(sequences, maxlen=maxLen)

# Create an embedding matrix
vocab_len = len(tokenizer.word_index) + 1
embed_vector_len = 200
embedding_matrix = np.zeros((vocab_len, embed_vector_len))

# Load GloVe embeddings
glove_path = 'glove.twitter.27B.200d.txt' 
glove_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_index[word] = vector

# Create a feature matrix with average word embeddings using GloVe
feature_matrix = np.array([
    np.mean([glove_index.get(word, np.zeros(embed_vector_len)) for word in tokenizer.sequences_to_texts([seq])[0].split()], axis=0)
    for seq in sequences
])

y = df['Label']
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 81.23%
The average classification error is: 18.77%
The average sensitivity is: 87.23%
The average precision is: 82.25%
The average specificity is: 72.48%
The average f1 score is: 84.65%
The average AUC Score is: 88.61%


##### Conclusion
As expected, we achieved better results in both accuracy and F1-score, when using the largest vector size of 200(glove.twitter.27B.200d.txt). However, we have decided to stick with the glove embedding file with **100 dimensions(glove.twitter.27B.100d.txt)**, and vector size of 100 for all models, although it has a lower accuracy(80.26%), sensitivity(86.69%), and F1-score(83.91%) than the vector size of 200(accuracy score:81.23%, sensitivity: 87.23%  F1-score:84.65%).

This is because our team agreed that we have started to observe diminishing returns with the increase in vector size, and the mere increase is not worth at the cost of using the larger file which is more computationally intensive, and will increase the complexity of model training.

### Model 2: Use GloVe Embedding (Version 2)

- **Column used: "Cleaned Text with V lemmatization"**
- **Model: LSTM**
- **Feature Extraction: GloVe Embedding**
Description:

GloVe is designed to capture the semantic relationships and meaning of words based on their co-occurrence statistics in large text corpora. Its key advantage is its ability to capture word semantics and relationships through the distributional information in large text corpora. The resulting word vectors can be used as features in NLP 
- **Approach: Direct GloVe Document Vectors**
Explanation:
The approach directly used GloVe embeddings for feature extraction. Each document or text sequence was represented as a single vector.

Pros:
Preserves semantic information: Using the entire vector representation of a document captures semantic relationships between words.
May capture nuances in the text that average word embeddings might miss.

Cons:
Larger feature space: Each document is represented as a high-dimensional vector, which can lead to a higher dimensionality.



- **Feature Transformation: nil** 
Explanation:
There isn't a for need additional feature transformation steps like TF-IDF, PCA, or LDA because the word embeddings themselves already capture a dense, distributed representation of words. GloVe word embeddings inherently contain semantic and contextual information.

In the case of GloVe embeddings, each word is represented as a high-dimensional vector, and these vectors capture relationships between words based on their co-occurrence statistics in the training corpus. This means that words with similar meanings or contexts will have similar vector representations.

Therefore, we can directly use GloVe word embeddings as features for your text classification model without the need for feature transformation steps





#### Version 2
In this version, we experimented with representing each document as a single vector by directly using the pre-trained Glove Embeddings. Our thought process is that this version should perform better because it retains the semantic information of the entire document.


In [23]:
# Tokenize the text and pad sequences
texts = df['Cleaned Text with V lemmatization']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
maxLen = 150
sequences = pad_sequences(sequences, maxlen=maxLen)

# Create an embedding matrix
vocab_len = len(tokenizer.word_index) + 1
embed_vector_len = 100
embedding_matrix = np.zeros((vocab_len, embed_vector_len))

# Load GloVe embeddings
glove_path = 'glove.twitter.27B.100d.txt'  # Replace with your GloVe file path
glove_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_index[word] = vector

# Create document vectors using GloVe embeddings
document_vectors = []
for text in df['Cleaned Text with V lemmatization']:
    words = text.split()
    vectors = [glove_index.get(word, np.zeros(embed_vector_len)) for word in words]
    doc_vector = np.mean(vectors, axis=0)
    document_vectors.append(doc_vector)

# Convert the list of document vectors to a NumPy array
feature_matrix = np.array(document_vectors)


In [24]:
y = df['Label']
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 80.23%
The average classification error is: 19.77%
The average sensitivity is: 86.74%
The average precision is: 81.23%
The average specificity is: 70.71%
The average f1 score is: 83.89%
The average AUC Score is: 87.54%


#### Interesting Observations

While the second version did only slightly better for sensitivity at 86.74% from 86.69%, we noted a drop in accuracy, from 80.26% to 80.23%, and f1 score, from 83.91%  to 83.89%. <br> We hypothesised that this could be due to the fact that averaging the word embeddings already capture most of the semantic information in the text, or that LSTM is better suited for the first version.

### Model 3: Word2Vec

- **Column used: "Cleaned Text with V lemmatization"**
- **Model: LSTM**
- **Feature Extraction: Word2Vec Embedding by Google**

Description:

Word2Vec is a powerful natural language processing technique for capturing semantic relationships and word context within text data. It generates word embeddings, which are dense vector representations of words, by considering the co-occurrence patterns of words in a large corpus of text. These word embeddings encode semantic information and can be leveraged as feature vectors for various text processing tasks. 

While we thought it would have been more interesting to train our own Word2Vec model, it was too computationally intensive for our CPU, and we decided to capitalise on Google's pre-trained vectors that have been trained on about 100 billion words from the Google News dataset. The model has 300-dimensional vectors for about 3 million words and phrases.

Link to Pre-trained Word2Vec Embeddings: https://code.google.com/archive/p/word2vec/

Pros:

Semantic understanding: Captures semantic relationships between words, and words with similar meanings are represented as vectors that are close in the vector space. This enhances the ability of Word2Vec to capture the meaning of words and phrases effectively.

Contextual information: Considers the context in which words appear, enabling it to distinguish between different uses of the same word based on the neighbouring words. This contextual understanding is important for tasks like word analogy and text similarity.

Smaller feature space: Unlike traditional bag-of-words models, Word2Vec generates relatively low-dimensional vectors, making it computationally efficient and well-suited for downstream machine learning models.

Cons:

Data dependency: The effectiveness is highly dependent on the quality and quantity of the training data. It requires a substantially large amount of text data to capture meaningful word relationships, which may not be available in certain cases.

Out-of-vocabulary words: Word2Vec may have difficulty with out-of-vocabulary words if they are not seen during training. Handling such words requires additional techniques or using subword embeddings like FastText.

Feature Transformation: None

Explanation:

Word2Vec embeddings provide a rich and semantically meaningful representation of words without the need for additional feature transformation. These embeddings are ready for use in various natural language processing tasks, including text classification, sentiment analysis, and document similarity, where capturing the meaning and context of words is crucial for accurate results.


In [26]:
# Load the pre-trained Word2Vec model by Google
pretrained_model_path = 'GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

# Tokenize and pad text
texts = df['Cleaned Text with V lemmatization']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
maxLen = 150
sequences = pad_sequences(sequences, maxlen=maxLen)

# Create an embedding matrix
vocab_len = len(tokenizer.word_index) + 1
embed_vector_len = 300  
embedding_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_vector = word2vec_model[word]
        embedding_matrix[index] = embedding_vector

# Create a feature matrix with average word embeddings
feature_matrix = np.array([
    np.mean([embedding_matrix[word_idx] for word_idx in seq if word_idx != 0], axis=0)
    for seq in sequences
])

y = df['Label']
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 80.83%
The average classification error is: 19.17%
The average sensitivity is: 87.12%
The average precision is: 81.81%
The average specificity is: 71.66%
The average f1 score is: 84.36%
The average AUC Score is: 88.19%


### Model 4: FastText

- **Column used: "Cleaned Text with N lemmatization"**
- **Model: Logistic Regression**
- **Feature Extraction: FastText Embedding**

Description:

FastText is an effective method for capturing the semantics and context of words through subword information. It excels in handling out-of-vocabulary words and languages with rich morphology. FastText generates word embeddings based on character-level n-grams and their co-occurrence statistics in text data. These embeddings are valuable features for natural language processing tasks.

Pros:
Subword information: FastText can handle out-of-vocabulary words and complex word forms, making it suitable for diverse languages.
Contextual understanding: FastText embeddings capture the semantics and context of words, offering a rich feature representation for text data.

Cons:
Larger feature space: Each document is represented as a high-dimensional vector, potentially leading to a higher dimensionality.



- **Feature Transformation: nil** 
Explanation:
No additional feature transformation steps are necessary. FastText word embeddings inherently capture subword information, semantics, and contextual understanding. These embeddings offer a dense, distributed representation of words, enabling direct use as features for text classification without the need for further feature transformation.





In [14]:
# Tokenize the text (assuming "cleaned_text" contains your preprocessed text data)
sentences = df["Cleaned Text with V lemmatization"].apply(lambda x: x.split())

# Train FastText model
fasttext_model = FastText(sentences, vector_size=300, window=10, min_count=1, sg=1)

# Create a feature matrix
features = np.array([np.mean([fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv] or [np.zeros(100)], axis=0) for sentence in sentences])

In [15]:
y = df['Label']
perform_kfold_cross_validation_with_lstm(features, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 80.63%
The average classification error is: 19.37%
The average sensitivity is: 87.37%
The average precision is: 81.48%
The average specificity is: 70.80%
The average f1 score is: 84.25%
The average AUC Score is: 88.44%


#### Tuning Hyperparameters of FastText embedding
Method: RandomsearchCV

In [26]:
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

vector_sizes = [100, 200, 300]
windows = [5, 10, 15]
min_counts = [1, 2, 3]
sgs = [0, 1]

best_score = 0
best_params = {}

# Loop over each combination of hyperparameters
for vector_size in vector_sizes:
    for window in windows:
        for min_count in min_counts:
            for sg in sgs:
                # Train FastText model
                model = FastText(sentences, vector_size=vector_size, window=window, min_count=min_count, sg=sg)

                # Create a feature matrix
                features = np.array([np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(vector_size)], axis=0) for sentence in sentences])

                # Reshape the features for LSTM
                features = features.reshape(features.shape[0], 1, features.shape[1])

                # Create LSTM model
                lstm_model = Sequential()
                lstm_model.add(LSTM(100, input_shape=(1, features.shape[2])))
                lstm_model.add(Dense(1, activation='sigmoid'))
                lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

                # Split the data
                X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

                # Fit the model
                lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)

                # Make predictions
                y_pred = (lstm_model.predict(X_test) > 0.5).astype("int32")

                # Calculate recall (sensitivity)
                score = recall_score(y_test, y_pred)

                # If the score is better than the current best score, update the best score and best parameters
                if score > best_score:
                    best_score = score
                    best_params = {'vector_size': vector_size, 'window': window, 'min_count': min_count, 'sg': sg}

print('Best recall score:', best_score)
print('Best parameters for recall:', best_params)

Best recall score: 0.9439421338155516
Best parameters for recall: {'vector_size': 300, 'window': 10, 'min_count': 1, 'sg': 1}


In [28]:
# Tokenize the text (assuming "cleaned_text" contains your preprocessed text data)
sentences = df["Cleaned Text with V lemmatization"].apply(lambda x: x.split())

# Train FastText model
fasttext_model = FastText(sentences, vector_size=300, window=10, min_count=1, sg=1)

# Create a feature matrix
features = np.array([np.mean([fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv] or [np.zeros(100)], axis=0) for sentence in sentences])

In [29]:
y = df['Label']
perform_kfold_cross_validation_with_lstm(features, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 80.81%
The average classification error is: 19.20%
The average sensitivity is: 87.72%
The average precision is: 81.47%
The average specificity is: 70.72%
The average f1 score is: 84.42%
The average AUC Score is: 88.43%


#### Results of Hyperparameter Tuning of Embedding
After tuning the FastText embedding, we saw that there was an improvement in all performance metrics except for precision, that dropped by 0.01%, specificity, that dropped by 0.08%, and f1 score that dropped by 
0.17%
<br> Sensitivity increased from 87.37% to 87.72%. As sensitivity is our main performance metrics, we will be using the tuned FastText embedding for the rest of the notebook.

### Model 5: Doc2Vec

In [19]:
# Create TaggedDocument objects for Doc2Vec
documents = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(df["Cleaned Text with V lemmatization"])]

# Train a Doc2Vec model
doc2vec_model = Doc2Vec(documents, vector_size=300, window=10, min_count=1, epochs=10)

# Create feature vectors for text data
features = np.array([doc2vec_model.dv[i] for i in range(len(documents))])

In [20]:
y = df['Label']
perform_kfold_cross_validation_with_lstm(features, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 73.03%
The average classification error is: 26.97%
The average sensitivity is: 82.88%
The average precision is: 74.58%
The average specificity is: 58.64%
The average f1 score is: 78.48%
The average AUC Score is: 79.34%


### Model 6: ELMo

Column used: "Cleaned Text with N lemmatization"
Model: Logistic Regression
Feature Extraction: ELMo Embedding
Description:

ELMo (Embeddings from Language Models) is a deep contextualized word representation model developed by researchers at the Allen Institute for Artificial Intelligence. Unlike traditional word embeddings, which assign a fixed vector to each word, ELMo generates contextualized word representations that capture various aspects of word meaning and usage based on the specific context in which the word appears.

Pros:
ELMo captures contextual information, allowing for a more nuanced understanding of word meanings based on their surrounding context, which can be beneficial for various downstream NLP tasks.

Cons:
ELMo models can be computationally expensive and may require significant computational resources, making them less feasible for resource-constrained environments. ELMo embeddings also heavily rely on large and diverse datasets for training, which might limit their effectiveness when working with domain-specific or small-scale datasets.

Feature Transformation: nil
Explanation:
Similarly to GloVe word embeddings, ELMo also does not require additional feature transformation steps like TF-IDF, PCA or LDA because the word embeddings are context-sensitive word representations by considering the entire input sentence. It uses a deep, bidirectional language model to compute word embeddings that capture both syntax and semantics, considering the surrounding context. 

Therefore, we can directly use ELMo word embeddings as features for your text classification model without the need for feature transformation steps.

In [23]:
# Creating ELMo Embeddings. [DEEPNOTE NOTEBOOK INSUFFICIENT MEMORY]

# Initialize the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Cleaned Text with V lemmatization'])

# Load the ELMo model outside the loop
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Preprocess all data and convert it to sequences
sequences = tokenizer.texts_to_sequences(df['Cleaned Text with V lemmatization'])
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Process the data in batches
batch_size = 25
num_batches = len(df) // batch_size + 1
all_embeddings = None

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch_sequences = padded_sequences[start_idx:end_idx]

    # Extract ELMo embeddings for the batch
    batch_texts = df['Cleaned Text with V lemmatization'][start_idx:end_idx].tolist()
    embedding = elmo.signatures["default"](tf.constant(batch_texts))["default"]
    embeddings_array = embedding.numpy()

    # Store the embeddings
    if all_embeddings is None:
        all_embeddings = embeddings_array
    else:
        all_embeddings = np.concatenate((all_embeddings, embeddings_array), axis=0)

# Save the embeddings to a file
with h5py.File("elmo_embeddingsV.hdf5", "w") as f:
    f.create_dataset("embeddings", data=all_embeddings)

print("ELMo embeddings saved to elmo_embeddings.hdf5.")

ValueError: Trying to load a model of incompatible/unknown type. 'C:\Users\sylve\AppData\Local\Temp\tfhub_modules\58051eb9ff2f7c649b7c541acc518dac54e786ca' contains neither 'saved_model.pb' nor 'saved_model.pbtxt'.

Load ELMo embeddings.

In [24]:
with h5py.File("elmo_embeddingsV.hdf5", "r") as f:
    all_embeddings = f["embeddings"][:]
    # Process the embeddings as needed
    # For example, you can print the shape of the embeddings
    print("Shape of the loaded embeddings:", all_embeddings.shape)

Shape of the loaded embeddings: (14364, 1024)


In [25]:
y = df['Label']
feature_matrix = all_embeddings
perform_kfold_cross_validation_with_lstm(feature_matrix, y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The average accuracy is: 81.88%
The average classification error is: 18.12%
The average sensitivity is: 86.28%
The average precision is: 83.72%
The average specificity is: 75.46%
The average f1 score is: 84.96%
The average AUC Score is: 89.69%


Conclusion

Tuned FastText embedding has the best results in terms of sensitivity(recall), 87.72%, and accuracy, 80.81%

#### Notes:
We did not carry out hyperparameter tuning for GloVe, Word2Vec,Doc2Vec and ELMo embeddings, as we noted that these are all pre-trained embeddings which are already optimised. The benefits from tuning the hyperparameters of the embedding might not be worth the computational complexity and intensity, and we decided to focus more on tuning the actual hyperparameters of the model.

Out of curiosity to explore the effects of the tuning, we trained our own FastText model, and gave a shot in running the hyperparameter tuning for FastText. We saw minor improvements to some perfrmance metrics and a drop in others.

## Hyper Parameter tuning - LSTM

Because we could not get a specific library to work, we manually iterated through different hyperparameters instead of using GridSearchCV or RandomSearchCV.

In [5]:
# Tokenize the text (assuming "cleaned_text" contains your preprocessed text data)
sentences = df["Cleaned Text with V lemmatization"].apply(lambda x: x.split())

# Train FastText model
fasttext_model = FastText(sentences, vector_size=300, window=10, min_count=1, sg=1)

# Create a feature matrix
features = np.array([np.mean([fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv] or [np.zeros(100)], axis=0) for sentence in sentences])
y = df['Label']

Due to memory issues, we exported the hyperparameter tuning results to a csv file and imported it back in to continue with the analysis.

In [8]:
import csv
import os

# Open a CSV file to write the results
file_name = 'hyperparameter_results.csv'

if os.path.isfile(file_name):
    existing_params = set()
    with open(file_name, 'r') as existing_file:
        reader = csv.reader(existing_file)
        next(reader)  # Skip header
        for row in reader:
            existing_params.add(tuple(row[:7]))  # Adjust indices based on your header

    # Find the last processed combination
    counter = len(existing_params)
else:
    counter = 0

with open(file_name, mode='a' if os.path.isfile(file_name) else 'w', newline='') as file:
    writer = csv.writer(file)

    if file.tell() == 0:
        writer.writerow(['Units', 'Activation LSTM', 'Activation Dense', 'Optimizer', 'Learning Rate', 'Epochs', 'Batch Size', 'Avg Recall', 'Avg Accuracy', 'Avg F1-Score'])

    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

    def create_model(units=50, activation_lstm='tanh', activation_dense='sigmoid', optimizer=tf.keras.optimizers.Adam(), learning_rate=0.001):
        model = Sequential()
        model.add(LSTM(units, activation=activation_lstm, input_shape=(vectorized_train.shape[1], vectorized_train.shape[2])))
        model.add(Dense(1, activation=activation_dense))
        model.compile(loss='binary_crossentropy', optimizer=optimizer(learning_rate=learning_rate), metrics=['accuracy'])
        return model

    # Define hyperparameter lists
    units_list = [50, 100, 150]
    activation_lstm_list = ['sigmoid', 'tanh', 'relu']
    activation_dense_list = ['sigmoid', 'tanh', 'relu']
    optimizer_list = [tf.keras.optimizers.Adam, tf.keras.optimizers.RMSprop, tf.keras.optimizers.Adagrad]
    learning_rate_list = [0.001, 0.01, 0.1]
    epochs_list = [5, 10]
    batch_size_list = [32, 64]

    # Define the number of folds
    k_folds = 10
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=460)
    counter = 0

    for units in units_list:
        for activation_lstm in activation_lstm_list:
            for activation_dense in activation_dense_list:
                for optimizer in optimizer_list:
                    for lr in learning_rate_list:
                        for epochs in epochs_list:
                            for batch_size in batch_size_list:
                                counter += 1
                                if counter > len(existing_params):
                                    print('\n Initiating new hyperparameter combination...' + str(counter))
                                    # print out the current combination in 1 line
                                    print(f"Units: {units}, Activation LSTM: {activation_lstm}, Activation Dense: {activation_dense}, Optimizer: {optimizer.__name__}, Learning Rate: {lr}, Epochs: {epochs}, Batch Size: {batch_size}")
                                    # Initialize lists to store fold-wise scores
                                    fold_recall_scores = []
                                    fold_accuracy_scores = []
                                    fold_f1_scores = []
    
                                    for train_index, test_index in kf.split(features, y):
                                        X_train, X_test = features[train_index], features[test_index]
                                        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
                                        # Reshape the data to match the LSTM input_shape
                                        vectorized_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
                                        vectorized_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    
                                        model = create_model(units=units, activation_lstm=activation_lstm,
                                                             activation_dense=activation_dense, optimizer=optimizer, learning_rate=lr)
                                        model.fit(vectorized_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
                                        y_pred = (model.predict(vectorized_test) > 0.5).astype("int32")
    
                                        # Calculate and store scores for each fold
                                        fold_recall_scores.append(recall_score(y_test, y_pred))
                                        fold_accuracy_scores.append(accuracy_score(y_test, y_pred))
                                        fold_f1_scores.append(f1_score(y_test, y_pred))
    
                                        # Clear session and delete model after each fold
                                        tf.keras.backend.clear_session()
                                        del model
    
                                    # Calculate average scores across all folds
                                    avg_recall = np.mean(fold_recall_scores)
                                    avg_accuracy = np.mean(fold_accuracy_scores)
                                    avg_f1 = np.mean(fold_f1_scores)
    
                                    print(f"Average recall: {avg_recall:.4f}")
                                    print(f"Average accuracy: {avg_accuracy:.4f}")
                                    print(f"Average F1-score: {avg_f1:.4f}")
    
                                    # Write results to CSV
                                    params = [units, activation_lstm, activation_dense, optimizer.__name__, lr, epochs, batch_size, avg_recall, avg_accuracy, avg_f1]
                                    writer.writerow(params)
                                    file.flush()

After tuning the hyperparameters and exporting them into a csv, we will then read the csv and sieve out the best parameters for the highest recall, accuracy and f1-score.

In [9]:
# Read the CSV file into a pandas DataFrame
results_df = pd.read_csv('hyperparameter_results.csv')

# Find the row with the highest recall
highest_recall_row = results_df.loc[results_df['Avg Recall'].idxmax()]

# Find the row with the highest accuracy
highest_accuracy_row = results_df.loc[results_df['Avg Accuracy'].idxmax()]

# Find the row with the highest F1-score
highest_f1_score_row = results_df.loc[results_df['Avg F1-Score'].idxmax()]

# Extract the parameters and scores for each metric
highest_recall_params = highest_recall_row[['Units', 'Activation LSTM', 'Activation Dense', 'Optimizer', 'Learning Rate', 'Epochs', 'Batch Size', 'Avg Recall', 'Avg Accuracy', 'Avg F1-Score']]
highest_accuracy_params = highest_accuracy_row[['Units', 'Activation LSTM', 'Activation Dense', 'Optimizer', 'Learning Rate', 'Epochs', 'Batch Size', 'Avg Recall', 'Avg Accuracy', 'Avg F1-Score']]
highest_f1_score_params = highest_f1_score_row[['Units', 'Activation LSTM', 'Activation Dense', 'Optimizer', 'Learning Rate', 'Epochs', 'Batch Size', 'Avg Recall', 'Avg Accuracy', 'Avg F1-Score']]

# Display the parameters and scores for each metric
print("Highest Recall Parameters and Scores:")
print(highest_recall_params)

print("\nHighest Accuracy Parameters and Scores:")
print(highest_accuracy_params)

print("\nHighest F1-Score Parameters and Scores:")
print(highest_f1_score_params)


Highest Recall Parameters and Scores:
Units                     50
Activation LSTM      sigmoid
Activation Dense     sigmoid
Optimizer            Adagrad
Learning Rate          0.001
Epochs                     5
Batch Size                64
Avg Recall               1.0
Avg Accuracy        0.593498
Avg F1-Score        0.744899
Name: 25, dtype: object

Highest Accuracy Parameters and Scores:
Units                    150
Activation LSTM         tanh
Activation Dense     sigmoid
Optimizer               Adam
Learning Rate           0.01
Epochs                    10
Batch Size                32
Avg Recall          0.869793
Avg Accuracy        0.815162
Avg F1-Score        0.848118
Name: 762, dtype: object

Highest F1-Score Parameters and Scores:
Units                    100
Activation LSTM         relu
Activation Dense     sigmoid
Optimizer               Adam
Learning Rate           0.01
Epochs                    10
Batch Size                64
Avg Recall          0.880834
Avg Accuracy       

#### Conclusion
Highest Recall: 100% <br> Although the recall was able to get a whopping 100%, the accuracy and f1-score took a huge hit and was only 59.34% and 74.45% respectively.<br><br>
Highest Accuracy: 81.15% <br> This combination of hyperparameters performed more consistently across the other performance metrics, getting 86.98% for recall and 84.48% for f1-score.<br><br>
Highest f1-score: 84.48% <br> This combination of hyperparameters also performed consistently across the other performance metrics, gettting 88.08% for recall and 81.32% for accuracy.

#### Fitting the parameters into the initial function we used for hypothesis 2.

In [14]:
def tuned_LSTM(word_embeddings, labels):
    np.random.seed(42)
    random.seed(42)
    tf.random.set_seed(42)

    num_of_folds = 10
    skf = StratifiedKFold(n_splits=num_of_folds, shuffle=True, random_state=460)

    # List of metrics for each fold
    k_fold_accuracy = []
    k_fold_classification_error = []
    k_fold_sensitivity = []
    k_fold_precision = []
    k_fold_specificity = []
    k_fold_f1_score = []
    auc_roc_scores = []
    fpr_values = []
    tpr_values = []

    for train_index, test_index in skf.split(word_embeddings, labels):
        X_train, X_test = word_embeddings[train_index], word_embeddings[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        vectorized_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
        vectorized_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

        model = Sequential()
        model.add(LSTM(100, activation = 'relu',  input_shape=(vectorized_train.shape[1], vectorized_train.shape[2])))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

        model.fit(vectorized_train, y_train, epochs=10, batch_size=64, verbose=1)
        y_pred = (model.predict(vectorized_test) > 0.5).astype("int32")

        # Calculating metrics for evaluation
        confusion = confusion_matrix(y_test, y_pred)
        TN, FP, FN, TP = confusion[0, 0], confusion[0, 1], confusion[1, 0], confusion[1, 1]
        accuracy = accuracy_score(y_test, y_pred)
        classification_error = 1 - accuracy
        sensitivity = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        specificity = TN / (TN + FP)
        f1_score = (2 * sensitivity * precision) / (sensitivity + precision)
        y_pred_proba = model.predict(vectorized_test)
        auc_roc_score = roc_auc_score(y_test, y_pred_proba)

        # Appending metrics to lists
        k_fold_accuracy.append(round(accuracy, 4))
        k_fold_classification_error.append(round(classification_error, 4))
        k_fold_sensitivity.append(round(sensitivity, 4))
        k_fold_precision.append(round(precision, 4))
        k_fold_specificity.append(round(specificity, 4))
        k_fold_f1_score.append(round(f1_score, 4))
        auc_roc_scores.append(auc_roc_score)

        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        fpr_values.append(fpr)
        tpr_values.append(tpr)

    print('The average accuracy is: {:.2f}%'.format(statistics.mean(k_fold_accuracy) * 100))
    print('The average classification error is: {:.2f}%'.format(statistics.mean(k_fold_classification_error) * 100))
    print('The average sensitivity is: {:.2f}%'.format(statistics.mean(k_fold_sensitivity) * 100))
    print('The average precision is: {:.2f}%'.format(statistics.mean(k_fold_precision) * 100))
    print('The average specificity is: {:.2f}%'.format(statistics.mean(k_fold_specificity) * 100))
    print('The average f1 score is: {:.2f}%'.format(statistics.mean(k_fold_f1_score) * 100))

    weights = [len(test_index) / len(labels) for _, test_index in skf.split(word_embeddings, labels)]
    avg_auc_roc = np.average(auc_roc_scores, weights=weights, axis=0)
    print('The average AUC Score is: {:.2f}%'.format(avg_auc_roc * 100))

    max_length_fpr = max(len(a) for a in fpr_values)
    max_length_tpr = max(len(a) for a in tpr_values)
    padded_fpr = np.array([np.pad(a, (0, max_length_fpr - len(a)), mode='constant') for a in fpr_values])
    padded_tpr = np.array([np.pad(a, (0, max_length_tpr - len(a)), mode='constant') for a in tpr_values])

    avg_fpr_value = np.average(padded_fpr, weights=weights, axis=0)
    avg_tpr_value = np.average(padded_tpr, weights=weights, axis=0)

    fig = px.area(
        x=avg_fpr_value, y=avg_tpr_value,
        title=f'ROC Curve (AUC={avg_auc_roc:.4f})',
        labels=dict(x='Average False Positive Rate', y='Average True Positive Rate'),
        width=700, height=700
    )

    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()

    x_axis = [i for i in range(1, num_of_folds + 1)]
    fig = px.scatter(x=x_axis, y=auc_roc_scores,
                     labels={"x": "K-Fold", "y": "AUC Score"},
                     trendline='ols',
                     title='AUC Values for the each K-Fold'
                     )
    fig.show()
tuned_LSTM(features, y)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

#### Conclusion

Comparing our results after hyperparameter tuning, we can see an increase in performance for accuracy from 80.81% to 81.08%, sensitivity from 87.72% to 88.31% and f1-score from 84.42% to 84.69%. <br> We can see that the hyperparameter tuning has helped to improve the performance of our model even though the increase is not very significant.

#### Interesting Observations
Before hypertuning, we expected the best activation function for the LSTM layer to be either sigmoid or Tanh as they help regulate information flow. However, we found out from the hypertuning that ReLU actually gave us the best score in terms of f1-score. We think this might be because ReLU was possibly able to capture more nuance in the data, and thus performed better than the other activation functions. <br>
It is also important to note that when hypertuning for the best accuracy, Tanh came out on top. The performance metrics for this was not too far behind that of ReLU, and we think that further exploration between these two activation functions would be interesting.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=86836953-3695-4d21-9d30-632fe800fb7d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>