<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Paper(1_s2_0_S0957417422020255_main)_Bangla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas tensorflow keras gensim nltk scikit-learn openpyxl



In [2]:
# Step 2: Import Libraries
import numpy as np
import pandas as pd
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, Dense, Dropout, Input, Bidirectional, Attention, GlobalAveragePooling1D, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import os

# Tokenizer models and stop-word lists used by the preprocessing step.
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize instead
# of the pickled 'punkt' models, so both are fetched to work with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:


# Step 3: Load and Preprocess Data
# Ask the user to pick the .xlsx file in the browser. `uploaded` maps each
# uploaded file name to its contents.
from google.colab import files
uploaded = files.upload()  # Manually upload your .xlsx file

# Fail with a clear message instead of a bare StopIteration when the upload
# dialog was dismissed and nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the .xlsx dataset.")

# Read the first uploaded file by name (replace with 'filename.xlsx' if needed)
df = pd.read_excel(next(iter(uploaded)))


Saving Bangla2_translated.xlsx to Bangla2_translated.xlsx


In [19]:
# Names of the spreadsheet columns used downstream; replace both with the
# actual column names of your file.
text_column_name, label_column_name = (
    'tweets_english',  # column holding the text
    'labels',          # column holding the labels
)

In [20]:
# Preprocess text data
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return NLTK's stop-word list for `language` as a frozenset, built only once.

    `stopwords.words()` builds a new list on every call, so the result is
    memoised here (and turned into a set for O(1) membership tests) instead
    of being rebuilt and scanned for every single token.
    """
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_text(text):
    """Normalise one document before it is handed to the Keras tokenizer.

    The text is lower-cased, tokenised with NLTK's `word_tokenize`, and then
    filtered: tokens that are not purely alphabetic are dropped, as are
    English stop words.

    Args:
        text: The raw document. Non-string values are converted with `str()`;
            missing values (`None` or a float NaN, which pandas uses for
            empty cells) are treated as an empty document.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Without this guard str(nan) would inject the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    stop_words = _stopword_set('english')
    tokens = word_tokenize(str(text).lower())
    return " ".join(
        word for word in tokens if word.isalpha() and word not in stop_words
    )

# Overwrite the text column with its cleaned form (preprocess_text runs on every row)
df[text_column_name] = df[text_column_name].apply(preprocess_text)

# Step 4: Split Data into Train and Test Sets
# 80 % train / 20 % test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column_name],
    df[label_column_name],
    test_size=0.2,
    random_state=42,
)



In [7]:

# Mount Google Drive at /content/drive so the embedding files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Location of the FastText vectors inside the mounted Drive.
# NOTE(review): nothing in this cell reads `save_path`; presumably it was the
# target of an earlier `fasttext_model.save(...)` call — confirm.
save_path = '/content/drive/MyDrive/embeddings/cc.en.300.model'

Mounted at /content/drive


In [8]:
# Load the FastText vectors from Drive, reusing `save_path` instead of repeating the literal path.
# The file was presumably written once with `fasttext_model.save(save_path)` — TODO confirm.
# NOTE: gensim's load() unpickles the file, so only load files you created yourself.
fasttext_model = gensim.models.KeyedVectors.load(save_path)

In [10]:
# Load the GloVe vectors from Drive (the file name indicates the 6B-token, 300-dimensional set).
# NOTE(review): `KeyedVectors.load` expects a file written by gensim's own `save()`.
# The `.txt` name suggests raw GloVe text, which would instead need
# `load_word2vec_format(..., no_header=True)` — confirm how this file was produced.
glove_model = gensim.models.KeyedVectors.load('/content/drive/MyDrive/embeddings/glove.6B.300d.txt')


In [15]:
# Load the Word2Vec vectors from a word2vec-format file on Drive
# (binary=True selects the binary rather than the text variant of that format).
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/drive/MyDrive/embeddings/GoogleNews-vectors-negative300.bin', binary=True
)

In [21]:


def get_embedding_matrix(word_index, embedding_model, embedding_dim=300):
    """Build a lookup table of pretrained vectors indexed by token id.

    Args:
        word_index: Mapping from word to its integer id.
        embedding_model: Any object supporting `word in model` and
            `model[word]`, returning a vector of length `embedding_dim`.
        embedding_dim: Width of each embedding vector.

    Returns:
        A float array of shape (len(word_index) + 1, embedding_dim). Row `i`
        holds the vector of the word whose id is `i`; rows for words missing
        from `embedding_model` (and row 0, which no word is copied into
        unless its id is 0) stay all-zero.
    """
    row_count = len(word_index) + 1
    matrix = np.zeros((row_count, embedding_dim))
    # Lazily pair each known word with its row so lookups happen in word_index order
    known_rows = (
        (row, token)
        for token, row in word_index.items()
        if token in embedding_model
    )
    for row, token in known_rows:
        matrix[row] = embedding_model[token]
    return matrix

# Step 6: Prepare Input Data
# The vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Turn each split into lists of token ids ...
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# ... and bring every sequence to a length of exactly 100 ids
padded_train, padded_test = (
    pad_sequences(sequences, maxlen=100)
    for sequences in (train_sequences, test_sequences)
)

# Step 7: Build the Hybrid Model
embedding_dim = 300

# Input layer: 100 token ids per document
input_layer = Input(shape=(100,))

# The same token ids are looked up in three frozen pretrained tables,
# created in the order Word2Vec, FastText, GloVe.
embedding_layer_w2v, embedding_layer_ft, embedding_layer_glove = [
    Embedding(
        len(word_index) + 1,
        embedding_dim,
        weights=[get_embedding_matrix(word_index, pretrained_vectors)],
        trainable=False,
    )(input_layer)
    for pretrained_vectors in (word2vec_model, fasttext_model, glove_model)
]

# Concatenate embeddings
concatenated_embeddings = Concatenate()(
    [embedding_layer_w2v, embedding_layer_ft, embedding_layer_glove]
)

In [22]:
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Attention
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Builds the classifier head on top of `input_layer` and
# `concatenated_embeddings`, which must already exist from the embedding cell.

# Sequence encoders: a bidirectional LSTM followed by a second LSTM, both
# returning one output per position, then a width-3 convolution over them.
bilstm_out = Bidirectional(LSTM(100, return_sequences=True))(concatenated_embeddings)
lstm_out = LSTM(100, return_sequences=True)(bilstm_out)
cnn_out = Conv1D(filters=50, kernel_size=3, activation='relu')(lstm_out)

attention_out = Attention()([cnn_out, cnn_out])  # Self-attention: the convolution output is both query and value

# Collapse the sequence axis, then classify with a dropout-regularised dense head
max_pool_out = GlobalMaxPooling1D()(attention_out)  # Max over positions of the attended sequence
dropout_layer_1 = Dropout(0.5)(max_pool_out)
dense_layer_1 = Dense(250, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.5)(dense_layer_1)
output_layer = Dense(1, activation='sigmoid')(dropout_layer_2)  # Single sigmoid unit: one score in (0, 1) per document

# Define and compile model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Binary cross-entropy; assumes the labels are 0/1 — TODO confirm
model.summary()

# Step 8: Train the Model
# 10 epochs, mini-batches of 32, with 10 % of the training data held out for validation
model.fit(
    padded_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

# Step 9: Evaluate the Model
predictions = model.predict(padded_test)
# Scores strictly above 0.5 become class 1, everything else class 0
predicted_classes = np.greater(predictions, 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, predicted_classes))
# AUC is computed from the raw scores rather than the thresholded classes
print("AUC:", roc_auc_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predicted_classes))


Epoch 1/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.7235 - loss: 0.5911 - val_accuracy: 0.7611 - val_loss: 0.4773
Epoch 2/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.7430 - loss: 0.4985 - val_accuracy: 0.7580 - val_loss: 0.4719
Epoch 3/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.7716 - loss: 0.4690 - val_accuracy: 0.7739 - val_loss: 0.4226
Epoch 4/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.7960 - loss: 0.4092 - val_accuracy: 0.7739 - val_loss: 0.4431
Epoch 5/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.8220 - loss: 0.3596 - val_accuracy: 0.7930 - val_loss: 0.4266
Epoch 6/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8425 - loss: 0.3436 - val_accuracy: 0.7994 - val_loss: 0.3917
Epoch 7/10
[1m89/89[0m [32m━━━━