In [None]:
# Mount Google Drive inside the Colab runtime at /content/MyDrive/.
from google.colab import drive
drive.mount('/content/MyDrive/')

In [None]:
!pip install transformers
!pip install datasets

In [None]:
import transformers
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd
import json
import gc
from sklearn.model_selection import train_test_split
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot
from tqdm import tqdm

In [None]:
# Path of the dataset CSV on the mounted Google Drive.
DATA_PATH = "/content/MyDrive/MyDrive/dataset.csv"

In [None]:
# Load the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Report the shape (rows, columns) of the loaded data.
print("Train data size \n", df.shape )

In [None]:
# Keep only the rows whose Relevance is non-zero, printing the shape before
# and after to show how many rows were dropped. `.copy()` makes the result an
# independent frame, so the column assignments in later cells modify `df`
# itself rather than a slice of the original (avoids SettingWithCopyWarning).
print(df.shape)
df = df[df['Relevance'] != 0].copy()
print(df.shape)

In [None]:
# Reset the index to 0..n-1 after dropping the rows. `drop=True` discards the
# old index instead of inserting it as an extra 'index' column, which also
# keeps this cell safe to run more than once.
df.reset_index(drop=True, inplace=True)

In [None]:
# Convert Stance to a numeric dtype, downcasting to the smallest integer
# dtype when the values allow it.
df['Stance'] = pd.to_numeric(df['Stance'],downcast='integer')

In [None]:
# only for multiclass
# Relabel stance 3 as class 0. One boolean-mask `.loc` assignment replaces the
# row-by-row chained indexing (`df['Stance'][i] = 0`), which raises
# SettingWithCopyWarning and leaves `df` unchanged under pandas copy-on-write.
df.loc[df['Stance'] == 3, 'Stance'] = 0

In [None]:
# Shape of the frame after filtering and relabelling.
df.shape

In [None]:
# Display the cleaned frame.
df

In [None]:
# Number of whitespace-separated words in each cleaned tweet.
df['count'] = [len(tweet.split()) for tweet in df['tweet_cleaned']]

In [None]:
# Histogram of tweet lengths in words. `sns.displot` is a figure-level
# function that always creates its own figure, so the earlier
# `plt.figure(figsize=(8, 8))` only produced an extra empty figure and its size
# was ignored. Drawing with the axes-level `sns.histplot` on an explicit Axes
# yields a single figure of the requested size.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(df['count'], ax=ax)
ax.set_xlim(0, 60)
ax.set_xlabel('The num of words ', fontsize=16)
ax.set_title("The Number of Words Distribution", fontsize=18)
plt.show()

In [None]:
# Number of tweets per Stance value; `categories` holds the Stance values in
# the order returned by value_counts().
category_count = df['Stance'].value_counts()
categories = category_count.index
categories

In [None]:
# Bar chart of the number of tweets per stance class; every bar is annotated
# with its class label and its count.
fig, ax = plt.subplots(figsize=(12, 5))

# seaborn sorts numeric categories, whereas value_counts() orders them by
# frequency. Fixing `order` keeps the bars in the same sequence as the labels
# used for the annotations, so each bar is tagged with its own class.
bar_order = list(category_count.index)
sns.barplot(x=category_count.index, y=category_count, order=bar_order, ax=ax)
for label, bar in zip(bar_order, ax.patches):
    ax.annotate(
        f'{label}\n' + format(bar.get_height(), '.0f'),
        xy=(bar.get_x() + bar.get_width() / 2.0, bar.get_height()),
        xytext=(0, -25),  # place the text box just below the top of the bar
        size=13,
        color='white',
        ha='center',
        va='center',
        textcoords='offset points',
        bbox=dict(boxstyle='round', facecolor='none', edgecolor='white', alpha=0.5),
    )

ax.set_xlabel('Categories', size=15)
ax.set_ylabel('The Number of Tweets', size=15)
ax.tick_params(axis='x', labelsize=12)
ax.set_title("The number of Tweets by Categories", size=18)
plt.show()

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset and display its summary.
from datasets import Dataset, DatasetDict
ds = Dataset.from_pandas(df)
ds

In [None]:
# Split into 70 % train / 15 % validation / 15 % test: hold out 30 % first,
# then halve the held-out part. A fixed seed makes the shuffled splits, and
# therefore every metric computed from them, reproducible across runs.
SPLIT_SEED = 42
train_test_valid = ds.train_test_split(test_size=0.3, seed=SPLIT_SEED)
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
train_test_valid_dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
    })

# From here on `ds` is the DatasetDict holding the three splits.
ds = train_test_valid_dataset
ds

In [None]:
# Load the tokenizer for the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Print the tokenizer's vocabulary size, maximum sequence length and the names
# of the inputs it produces for the model.
print(f"Vocab size is : {tokenizer.vocab_size}")
print(f"Model max length is : {tokenizer.model_max_length}")
print(f"Model input names are: {tokenizer.model_input_names}")

In [None]:
# Cap sequences at 256 tokens. The tokenization cell uses padding='max_length'
# with truncation=True, so this is the length every example is padded or cut to.
tokenizer.model_max_length = 256
tokenizer.model_max_length

In [None]:
def tokenize_function(train_dataset):
    """Tokenize the ``tweet_cleaned`` texts of a batch of examples.

    Args:
        train_dataset: mapping with a ``'tweet_cleaned'`` entry holding the
            texts to encode (the batch passed in by ``Dataset.map``).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called on those
        texts with padding to the maximum length and truncation enabled.
    """
    tweets = train_dataset['tweet_cleaned']
    return tokenizer(tweets, padding='max_length', truncation=True)

# Tokenize every split in batches, then pull the three splits out by name.
tokenized_dataset = ds.map(tokenize_function, batched=True)
tokenized_dataset

train_dataset, eval_dataset, test_dataset = (
    tokenized_dataset[split_name] for split_name in ('train', 'valid', 'test')
)

In [None]:
# Summary of the tokenized training split.
train_dataset

In [None]:
# Columns dropped from every split before the data is handed to TensorFlow.
to_remove = [
    'Tweet',
    "tweet_cleaned",
    'Relevance',
    'Hate',
    'Direction',
    'Targets',
    'Humor',
    'count',
]

# Same treatment for train / validation / test: drop the columns above and
# switch the output format to TensorFlow.
train_set, tf_eval_dataset, tf_test_dataset = (
    split.remove_columns(to_remove).with_format('tensorflow')
    for split in (train_dataset, eval_dataset, test_dataset)
)

In [None]:
# Summary of the TensorFlow-formatted training split.
train_set

In [None]:
# Build the tf.data pipelines given to Keras. For each split, a dict of the
# tokenizer's model inputs is paired with the Stance labels and batched by 16.
# Only the training pipeline is shuffled (buffer size = whole training set).
input_names = tokenizer.model_input_names

train_features = {name: train_set[name] for name in input_names}
train_pairs = (train_features, train_set['Stance'])
train_set_for_final_model = (
    tf.data.Dataset.from_tensor_slices(train_pairs)
    .shuffle(len(train_set))
    .batch(16)
)

eval_features = {name: tf_eval_dataset[name] for name in input_names}
eval_pairs = (eval_features, tf_eval_dataset["Stance"])
val_set_for_final_model = tf.data.Dataset.from_tensor_slices(eval_pairs).batch(16)

test_features = {name: tf_test_dataset[name] for name in input_names}
test_pairs = (test_features, tf_test_dataset["Stance"])
test_set_for_final_model = tf.data.Dataset.from_tensor_slices(test_pairs).batch(16)

In [None]:
# Stance labels of the training split as returned by the TensorFlow-formatted dataset.
train_set['Stance']

In [None]:
# Stance labels of the training split as returned by the tokenized dataset.
train_dataset['Stance']

In [None]:
pip install livelossplot

In [None]:
import keras
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow as tf

In [None]:
# List of Keras callbacks used for training; it holds a single livelossplot
# PlotLossesKeras instance.
from livelossplot import PlotLossesKeras
callbacks = [PlotLossesKeras()]

In [None]:
# Load the pretrained bert-base-uncased checkpoint as a TensorFlow
# sequence-classification model configured for 3 labels.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=3)

In [None]:
# setting up class weights for imbalanced data

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Importing the function directly avoids rebinding the name of the imported
# `class_weight` module to the array it returns. The labels are materialised
# once and reused for both `classes` and `y`.
train_labels = np.asarray(train_dataset['Stance'])

# 'balanced' weights are inversely proportional to class frequency:
# n_samples / (n_classes * count(class)). The result is one weight per entry
# of `classes`, in the same (sorted) order.
class_weight = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

In [None]:
# Pair every distinct training label with its weight as a {label: weight}
# dict, which is what `model.fit(class_weight=...)` receives below.
class_weights = {
    label: weight
    for label, weight in zip(np.unique(train_dataset['Stance']), class_weight)
}
class_weights

In [None]:
# Training configuration: Adam with a small learning rate, sparse categorical
# cross-entropy computed on raw logits, and sparse categorical accuracy as the
# reported metric.
compile_kwargs = dict(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)
model.compile(**compile_kwargs)

# Print the layer / parameter overview of the compiled model.
model.summary()

In [None]:
# Fine-tune for 3 epochs, validating after each epoch and weighting the loss
# per class. `callbacks` is already a list, so it is passed as-is instead of
# being wrapped in a second list (`[callbacks]` produced a nested list).
history = model.fit(
    train_set_for_final_model,
    validation_data=val_set_for_final_model,
    epochs=3,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Evaluate on the test batches; the two returned values are unpacked as the
# loss and the accuracy metric set in `model.compile`.
test_loss, test_acc = model.evaluate(test_set_for_final_model,verbose=1)
print('\nTest accuracy:', test_acc)

In [None]:
# Run inference on the test batches; later cells read the scores from
# preds['logits'].
preds = model.predict(test_set_for_final_model,verbose=1)

In [None]:
import numpy as np
preds.logits

# Print the predicted class (index of the largest logit) of every test example.
for example_logits in preds['logits']:
    print(np.argmax(example_logits))

In [None]:
# Number of rows in the predicted logits.
len(preds['logits'])

In [None]:
# Ground-truth labels of the test split. They are read from `test_dataset`,
# the tokenized split the test batches (and so the predictions) were built
# from, rather than from the intermediate `test_valid` split object. Labels
# and predictions then stay aligned even if the splitting cell is re-run
# without re-tokenizing.
test_labels = test_dataset['Stance']

In [None]:
# Predicted class per test example: index of the largest logit in each row.
preds_labels = [np.argmax(example_logits) for example_logits in preds['logits']]

In [None]:
# Print the shapes of the predictions and of the ground-truth labels so they
# can be compared before the metrics are computed.
print(np.array(preds_labels).shape)
print(np.array(test_labels).shape)

In [None]:
# Predicted class of every test example.
preds_labels

In [None]:
# Ground-truth Stance label of every test example.
test_labels

In [None]:


from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report
from imblearn.metrics import macro_averaged_mean_absolute_error

# Evaluation report on the test split.
#
# Every metric below has the signature (y_true, y_pred). Passing the
# predictions first swaps precision with recall, transposes the confusion
# matrix, reports the support of the predicted instead of the true classes,
# and changes the macro-averaged MAE (which is averaged over the true
# classes). The ground truth therefore goes first.
print(" ")
print("Stance Bert")
print(" ")

print("Accuracy -> ",accuracy_score(test_labels, preds_labels))
print("F1 score -> ",f1_score(test_labels, preds_labels,average='macro'))
print("Macro MAE -> ",macro_averaged_mean_absolute_error(test_labels, preds_labels))
print("Recall -> ",recall_score(test_labels, preds_labels,average='macro'))
print("Precision -> ",precision_score(test_labels, preds_labels,average='macro'))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(test_labels, preds_labels))

print(classification_report(test_labels, preds_labels))