<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/VITD/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting to drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
!pip install tensorflow



In [3]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m41.0/68.8 kB[0m [31m978.4 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199771 sha256=6a58f5e49484debf8a103136023849c15628a8522c4a83ddac9bc852b60148d8
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built f

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Input, Flatten, MaxPooling1D, SpatialDropout1D, Activation

from keras.callbacks import EarlyStopping

from numpy import array
from sklearn.metrics import classification_report

import gensim
from gensim import models
from gensim.models import Word2Vec
import fasttext.util
import pandas as pd
import numpy as np

# Loading Model

In [5]:
fasttext_model = fasttext.load_model("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/notebooks/Tariq/fasttext/model_bn_300.bin")



# Load Dataset

In [6]:
train_dataset = pd.read_csv("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/task datasets/original/train.csv")
val_dataset = pd.read_csv("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/task datasets/original/dev.csv")
test_dataset = pd.read_csv("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/task datasets/original/test.csv")

In [7]:
print(f'train: {train_dataset.shape}\nval: {val_dataset.shape}\ntest: {test_dataset.shape}')

train: (2700, 2)
val: (1330, 2)
test: (2016, 2)


# Oversampling

In [8]:
train_dataset['label'].value_counts()

0    1389
1     922
2     389
Name: label, dtype: int64

In [9]:
# Find the maximum class frequency
max_class_frequency = train_dataset['label'].value_counts().max()

# Group the dataset by labels
grouped = train_dataset.groupby('label')

resampled_data = []
for label, group in grouped:
    if len(group) < max_class_frequency:
        oversampled_group = group.sample(max_class_frequency, replace=True, random_state=42)
        resampled_data.append(oversampled_group)
    else:
        resampled_data.append(group)

# Concatenate the resampled groups to create the balanced dataset
balanced_dataset = pd.concat(resampled_data)

# Shuffle the dataset to ensure randomness
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
balanced_dataset.shape

(4167, 2)

In [11]:
balanced_dataset['label'].value_counts()

1    1389
0    1389
2    1389
Name: label, dtype: int64

In [12]:
train_dataset = balanced_dataset
train_dataset['label'].value_counts()

1    1389
0    1389
2    1389
Name: label, dtype: int64

In [13]:
train_x = train_dataset['text']
train_y = train_dataset['label']

val_x = val_dataset['text']
val_y = val_dataset['label']

test_x = test_dataset['text']
test_y = test_dataset['label']

# Embedding Setup

In [14]:
tokenizer=Tokenizer(oov_token = "<OOV>", split=' ') # Splitting text based on whitespace and adding "Out of vocabulary"
tokenizer.fit_on_texts(train_x) # Using the tokenizer on out train dataset to tokenize the train dataset
train_encoded=tokenizer.texts_to_sequences(train_x)
print(train_encoded)


[[65, 628, 601, 42, 1099, 135, 713, 200, 20, 1327, 26, 274, 1099, 1196, 1009, 482, 30, 148, 2, 3, 53, 43, 714, 3, 13, 1994, 21, 25, 803, 118, 80, 85, 31, 1995, 1738, 1996, 804, 1997, 343, 483, 4, 1998, 871, 1999], [577, 70], [77, 9, 64, 3012, 3013, 7, 49, 423, 197], [2380, 5400, 5401], [90, 925, 16, 10, 715, 86, 32, 275, 429, 159, 92, 602, 113, 715, 86, 2000, 805, 243, 32, 92, 602, 113, 654, 86, 394, 1197, 16, 10, 484, 424, 395, 5, 153, 328, 78, 179, 45, 231, 64, 485, 872, 117, 1517, 716, 2001], [135, 64, 255, 40, 93, 344, 78, 179, 717, 104, 135, 180, 1739, 181, 65, 2002, 135, 64, 182, 136, 93, 45, 76, 189, 5402, 5403, 1198, 5404, 21, 49, 1740, 31, 3900, 276, 82, 806, 3901, 40, 2003, 70, 79, 1518, 5405, 5406], [20, 2381, 2382, 629, 3902, 7, 22], [35, 194, 42, 243, 2, 873, 873, 50, 2383, 2004, 32, 2384, 119, 30, 65, 35], [35, 194, 42, 243, 2, 873, 873, 50, 2383, 2004, 32, 2384, 119, 30, 65, 35], [3014, 15, 1741, 126, 10, 108, 552, 126, 10], [114, 36, 131, 35], [72, 329, 807, 154, 2005, 

In [15]:
train_padded= pad_sequences(train_encoded, padding='post')
print(train_padded)

[[  65  628  601 ...    0    0    0]
 [ 577   70    0 ...    0    0    0]
 [  77    9   64 ...    0    0    0]
 ...
 [ 134  131   73 ...    0    0    0]
 [ 378   82  478 ...    0    0    0]
 [4188 9428  571 ...    0    0    0]]


In [16]:
# padding df_validation
test_encoded=tokenizer.texts_to_sequences(test_x)
test_padded= pad_sequences(test_encoded, padding='post', maxlen=train_padded.shape[1])

In [17]:
# function that takes word vector as input and returned an embedding layer
def embedding_creation(EMBEDDING_DIM, word_vectors):
  vocabulary_size=len(tokenizer.word_index)+1
  word_index=tokenizer.word_index
  embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

  for word, i in word_index.items():
    try:
      embedding_vector=word_vectors[word] # taking the word vector of all the words in the index
      embedding_matrix[i]=embedding_vector # inserting the vector of the word to the embeddings matrix,  index wise
    except KeyError:
      embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
      """
      The strategy of generating random vectors for missing words (KeyError)
      in the embedding matrix is useful because it provides a way to
      include out-of-vocabulary words in the representation,
        prevents loss of information, helps with stable training, and
        ensures a complete embedding matrix for neural network models.
      """
  embedding_layer=Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

  return embedding_layer

In [18]:
EMBEDDING_DIM = 300
wv = fasttext_model
IFT = embedding_creation(EMBEDDING_DIM, wv)
# gets the embedding layer from the word vectors using EMBEDDING_DIM as dim size

In [19]:
max_length = train_padded.shape[1]
vocabulary_size = len(tokenizer.word_index) + 1
# creating a randomly initialized embedding layer (RE)
RE = Embedding(vocabulary_size, EMBEDDING_DIM,input_length = max_length, trainable=True)

# Early Stopping

In [20]:
earlystop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=1,
    mode="min",
    restore_best_weights=True,
)

In [21]:
emb_X_name_collection = [ [IFT, 'IFT']]
"""
IFT = embedding_creation(EMBEDDING_DIM, wv)
# IFT has the embedding layer from the word vectors using EMBEDDING_DIM as dim size
"""

'\nIFT = embedding_creation(EMBEDDING_DIM, wv)\n# IFT has the embedding layer from the word vectors using EMBEDDING_DIM as dim size\n'

# Model Configuration Orignal

In [22]:
!pip install keras



In [23]:
from tensorflow.keras.layers import Layer, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dense
from tensorflow.keras.models import Sequential
import tensorflow.keras.backend as K


In [24]:
# Define the custom attention mechanism as a subclass of Layer
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W))  # Calculate alignment scores
        alpha = K.softmax(e, axis=1)   # Compute attention weights
        weighted_sum = x * alpha       # Apply attention to input
        return K.sum(weighted_sum, axis=1)


# With attention

In [25]:
num_classes = 3
# Loop through each embedding layer and create models with attention
for emb_X_name in emb_X_name_collection:
    model = Sequential([
        emb_X_name[0],
        LSTM(100, dropout=0.3, return_sequences=True),
        AttentionLayer(),  # Use the custom attention layer
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax'),  # Use softmax for multi-class classification
    ],
    name="Sentiment_Model")
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Without attention

In [26]:
# num_classes = 3
# for emb_X_name in emb_X_name_collection:
#     model = Sequential([
#         emb_X_name[0],
#         Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
#         GlobalMaxPool1D(),
#         Dense(16, activation='relu'),
#         Dense(num_classes, activation='softmax'),  # Use softmax for multi-class classification
#     ],
#     name="Sentiment_Model")
#     model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Use sparse_categorical_crossentropy

# Training

In [27]:
history = model.fit(train_padded, train_y, epochs=100, batch_size=64, validation_data=(test_padded, test_y), callbacks=[earlystop_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping


In [28]:
model.save("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/notebooks/Tariq/fasttext/fastext.h5")  # Save the model in an h5 format

In [29]:
prediction = model.predict(test_padded)

p = []
for i in range(len(prediction)):
    a = []
    for j in range(3):
        a.append(round(prediction[i][j]))
    p.append(a)




In [30]:
prediction

array([[0.27748552, 0.6425631 , 0.07995132],
       [0.73199844, 0.11609846, 0.15190317],
       [0.8659142 , 0.05872602, 0.07535968],
       ...,
       [0.03312075, 0.01820435, 0.94867486],
       [0.02671758, 0.01915994, 0.9541225 ],
       [0.806994  , 0.08840309, 0.10460278]], dtype=float32)

In [31]:
# Determine the maximum column index for each row
max_indices = np.argmax(p, axis=1)

# Create a DataFrame with the max_indices
pred_labels = pd.DataFrame({'Value': max_indices})

In [32]:
pred_labels

Unnamed: 0,Value
0,1
1,0
2,0
3,1
4,1
...,...
2011,0
2012,1
2013,2
2014,2


In [33]:
from sklearn.metrics import classification_report

target_names = ['neutral', 'passive', 'active']
r = classification_report(test_y, pred_labels, output_dict=True)

In [34]:
r

{'0': {'precision': 0.7747408105560791,
  'recall': 0.75,
  'f1-score': 0.7621696801112656,
  'support': 1096},
 '1': {'precision': 0.6653116531165312,
  'recall': 0.6828929068150209,
  'f1-score': 0.673987645847632,
  'support': 719},
 '2': {'precision': 0.5576036866359447,
  'recall': 0.6019900497512438,
  'f1-score': 0.5789473684210527,
  'support': 201},
 'accuracy': 0.7113095238095238,
 'macro avg': {'precision': 0.665885383436185,
  'recall': 0.6782943188554217,
  'f1-score': 0.6717015647933168,
  'support': 2016},
 'weighted avg': {'precision': 0.7140641607014254,
  'recall': 0.7113095238095238,
  'f1-score': 0.7124521368149932,
  'support': 2016}}

In [35]:
df = pd.DataFrame(r)

# Transpose the DataFrame
df = df.transpose()

print(df)

              precision    recall  f1-score     support
0              0.774741  0.750000  0.762170  1096.00000
1              0.665312  0.682893  0.673988   719.00000
2              0.557604  0.601990  0.578947   201.00000
accuracy       0.711310  0.711310  0.711310     0.71131
macro avg      0.665885  0.678294  0.671702  2016.00000
weighted avg   0.714064  0.711310  0.712452  2016.00000
