In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your datasets
bjp = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine the datasets
df = pd.concat([bjp, congress])

# Fill missing values with an empty string
df['commentText'] = df['commentText'].fillna('')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)

# Split the dataset into train, validation, and test sets
df_train, temp_df = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)

# Define preprocessing functions
def clean_html(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def remove_links(text):
    return re.sub(r'http\S+|www.\S+', '', text)

def non_ascii(text):
    return ''.join(i for i in text if ord(i)<128)

def lower(text):
    return text.lower()

def email_address(text):
    return re.sub(r'\S*@\S*\s?', '', text)

def removeStopWords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

def punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_(text):
    return re.sub('_', '', text)

# Apply preprocessing functions
df_train['commentText'] = df_train['commentText'].apply(clean_html).apply(remove_links).apply(non_ascii).apply(lower).apply(email_address).apply(removeStopWords).apply(punct).apply(remove_)
df_val['commentText'] = df_val['commentText'].apply(clean_html).apply(remove_links).apply(non_ascii).apply(lower).apply(email_address).apply(removeStopWords).apply(punct).apply(remove_)

print(df_train.shape, df_test.shape, df_val.shape)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(2370, 3) (790, 3) (790, 3)


In [2]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertModel 

# Assuming df_train and df_test are your training and testing dataframes
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

max_len = 70

X_train = tokenizer(
    text=df_train['commentText'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

X_test = tokenizer(
    text=df_test['commentText'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)


  from .autonotebook import tqdm as notebook_tqdm






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertModel

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

embeddings = bert(input_ids, attention_mask=input_mask)[0]
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

# Assuming the number of classes is 6, modify it according to your dataset
num_classes = 6
y = Dense(num_classes, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    clipnorm=1.0
)

loss = CategoricalCrossentropy(from_logits=False)  # Ensure from_logits is False for softmax
metric = CategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric]
)

# One-hot encode target data
y_train = to_categorical(df_train['Label'], num_classes=num_classes)
y_test = to_categorical(df_test['Label'], num_classes=num_classes)

# Learning rate decay callback example (you can customize it based on your needs)
def lr_schedule(epoch):
    return 5e-05 * 0.95**epoch

lr_scheduler = LearningRateScheduler(lr_schedule)

history = model.fit(
    x={'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']},
    y=y_train,
    validation_data=({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},
                     y_test),
    epochs=1,
    batch_size=32,
    callbacks=[lr_scheduler]
)





In [6]:
from sklearn.metrics import classification_report

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']})
y_predicted = np.argmax(predicted, axis=1)
print(classification_report(df_test['Label'], y_predicted))

              precision    recall  f1-score   support

           0       0.59      0.98      0.74       609
           1       0.92      0.29      0.44       576

    accuracy                           0.64      1185
   macro avg       0.75      0.63      0.59      1185
weighted avg       0.75      0.64      0.59      1185

