__Objective__: Classify tweets as ad hominem or non-ad hominem using a fine-tuned BERTweet model

__Runtime__: GPU

In [None]:
# Print the NVIDIA driver/GPU status for this runtime via the nvidia-smi shell command.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/gdrive; every model and data path below lives under this mount.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!pip install transformers

# needed to replace emojis with their text descriptions
!pip3 install nltk emoji

In [None]:
from transformers import pipeline, RobertaForSequenceClassification, AutoTokenizer
import torch

In [None]:
from tqdm import tqdm
import pickle
import pandas as pd 
import numpy as np 

In [None]:
# Load the fine-tuned classifier and its tokenizer from the same Drive directory.
# NOTE(review): normalization=True presumably enables the tokenizer's tweet normalization — confirm.
bertweet_dir = '/content/gdrive/MyDrive/DL/models/bertweet'
tokenizer = AutoTokenizer.from_pretrained(bertweet_dir, normalization=True)
model = RobertaForSequenceClassification.from_pretrained(bertweet_dir)

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Display the selected torch device as the cell output.
device

In [None]:
# Build a transformers text-classification pipeline for fast batched inference.
# The pipeline's integer `device` argument is a CUDA ordinal (0 = first GPU) or -1 for CPU.
# Select it from the actual CUDA availability instead of hard-coding GPU 0, which
# fails on a CPU-only runtime.
pipe = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
def classify(text, **kwargs):
    """Run the module-level `pipe` on `text` with truncation to 128 tokens.

    Args:
        text: Input forwarded unchanged as the first positional argument of `pipe`.
        **kwargs: Extra keyword arguments forwarded to `pipe` (e.g. `batch_size`).
            Passing `max_length` or `truncation` here raises a TypeError because
            both are already fixed by this wrapper.

    Returns:
        Whatever `pipe` returns for the given input.
    """
    fixed_options = {'max_length': 128, 'truncation': True}
    return pipe(text, **fixed_options, **kwargs)

In [None]:
# Read the processed "before" and "after" tweet files (JSON Lines: one record per line).
before_path = '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/processed/nytimes_before.json'
after_path = '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/processed/nytimes_after.json'
before_df, after_df = (pd.read_json(path, lines=True) for path in (before_path, after_path))

In [None]:
# Build the "augmented" text by concatenating the 'parent_tweet' and 'tweet' columns.
# NOTE(review): the two strings are joined with no separator, and a missing value in
# either column would propagate — confirm this matches how the model was fine-tuned.
for tweets_df in (before_df, after_df):
    tweets_df['augmented_tweet'] = tweets_df['parent_tweet'] + tweets_df['tweet']

In [None]:
# from tqdm._tqdm_notebook import tqdm_notebook
# tqdm_notebook.pandas()

In [None]:
# Plain Python lists of the texts to classify: suffix _1 is the tweet alone,
# suffix _2 is the augmented (parent + tweet) text.
text_columns = ('tweet', 'augmented_tweet')
tweets_before_1, tweets_before_2 = (list(before_df[col]) for col in text_columns)
tweets_after_1, tweets_after_2 = (list(after_df[col]) for col in text_columns)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Minimal map-style dataset over an indexable collection of texts.

    It only exposes length and item access, which lets the collection be
    handed to code that expects a `torch.utils.data.Dataset`.
    """

    def __init__(self, texts):
        """Keep a reference to `texts` (not copied) as `self.texts`."""
        self.texts = texts

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position `idx`."""
        return self.texts[idx]

In [None]:
# Wrap each text list in a CustomDataset (tb = before, ta = after; 1 = tweet only, 2 = augmented).
dataset_tb1, dataset_tb2, dataset_ta1, dataset_ta2 = (
    CustomDataset(texts)
    for texts in (tweets_before_1, tweets_before_2, tweets_after_1, tweets_after_2)
)

In [None]:
# Four independent, initially empty containers for the pipeline outputs.
labels_tb1, labels_tb2, labels_ta1, labels_ta2 = [], [], [], []

In [None]:
# Classify the tweet-only texts of the "before" set in batches of 128, with a progress bar.
# The list is rebuilt from scratch here (rather than appended to) so that re-running
# this cell cannot leave duplicated predictions in labels_tb1.
labels_tb1 = list(tqdm(classify(dataset_tb1, batch_size=128), total=len(dataset_tb1)))

In [None]:
# Persist the "before" tweet-only predictions to Drive so they can be reloaded later.
with open('/content/gdrive/MyDrive/Temp/39_labels_tb1.pkl', mode='wb') as pkl_file:
    pickle.dump(labels_tb1, pkl_file)

In [None]:
# Classify the augmented texts of the "before" set in batches of 128, with a progress bar.
# The list is rebuilt from scratch here (rather than appended to) so that re-running
# this cell cannot leave duplicated predictions in labels_tb2.
labels_tb2 = list(tqdm(classify(dataset_tb2, batch_size=128), total=len(dataset_tb2)))

In [None]:
# Persist the "before" augmented-text predictions to Drive so they can be reloaded later.
with open('/content/gdrive/MyDrive/Temp/39_labels_tb2.pkl', mode='wb') as pkl_file:
    pickle.dump(labels_tb2, pkl_file)

In [None]:
# Classify the tweet-only texts of the "after" set in batches of 64, with a progress bar.
# The list is rebuilt from scratch here (rather than appended to) so that re-running
# this cell cannot leave duplicated predictions in labels_ta1.
labels_ta1 = list(tqdm(classify(dataset_ta1, batch_size=64), total=len(dataset_ta1)))

In [None]:
# Persist the "after" tweet-only predictions to Drive so they can be reloaded later.
with open('/content/gdrive/MyDrive/Temp/39_labels_ta1.pkl', mode='wb') as pkl_file:
    pickle.dump(labels_ta1, pkl_file)

In [None]:
# Classify the augmented texts of the "after" set in batches of 64, with a progress bar.
# The list is rebuilt from scratch here (rather than appended to) so that re-running
# this cell cannot leave duplicated predictions in labels_ta2.
labels_ta2 = list(tqdm(classify(dataset_ta2, batch_size=64), total=len(dataset_ta2)))

In [None]:
# Persist the "after" augmented-text predictions to Drive so they can be reloaded later.
with open('/content/gdrive/MyDrive/Temp/39_labels_ta2.pkl', mode='wb') as pkl_file:
    pickle.dump(labels_ta2, pkl_file)

In [None]:
# Reload the four pickled prediction lists from Drive.

def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


labels_tb1 = _read_pickle('/content/gdrive/MyDrive/Temp/39_labels_tb1.pkl')
labels_tb2 = _read_pickle('/content/gdrive/MyDrive/Temp/39_labels_tb2.pkl')
labels_ta1 = _read_pickle('/content/gdrive/MyDrive/Temp/39_labels_ta1.pkl')
labels_ta2 = _read_pickle('/content/gdrive/MyDrive/Temp/39_labels_ta2.pkl')

In [None]:
# Show the first prediction to inspect its structure (the cells below read its 'label' and 'score' keys).
labels_tb1[0]

In [None]:
# Split each "before" tweet-only prediction into a readable label and its score:
# 'LABEL_1' becomes 'AH' (ad hominem), any other label becomes 'NONE'.
labels_tb1_labels = ['AH' if pred['label'] == 'LABEL_1' else 'NONE' for pred in labels_tb1]
labels_tb1_scores = [pred['score'] for pred in labels_tb1]

In [None]:
# Attach the tweet-only predictions to before_df as the 'label' and 'score' columns.
# Assigning a Series aligns on the index, so a length mismatch would silently
# produce NaN or drop values; fail loudly instead.
assert len(labels_tb1_labels) == len(labels_tb1_scores) == len(before_df), \
    "prediction count does not match the number of rows in before_df"

labels_tb1_labels_s = pd.Series(labels_tb1_labels)
labels_tb1_scores_s = pd.Series(labels_tb1_scores)

before_df['label'] = labels_tb1_labels_s
before_df['score'] = labels_tb1_scores_s

In [None]:
# Split each "before" augmented-text prediction into a readable label and its score:
# 'LABEL_1' becomes 'AH' (ad hominem), any other label becomes 'NONE'.
labels_tb2_labels = ['AH' if pred['label'] == 'LABEL_1' else 'NONE' for pred in labels_tb2]
labels_tb2_scores = [pred['score'] for pred in labels_tb2]

In [None]:
# Attach the augmented-text predictions to before_df as 'label_augmented' / 'score_augmented'.
# Assigning a Series aligns on the index, so a length mismatch would silently
# produce NaN or drop values; fail loudly instead.
assert len(labels_tb2_labels) == len(labels_tb2_scores) == len(before_df), \
    "prediction count does not match the number of rows in before_df"

labels_tb2_labels_s = pd.Series(labels_tb2_labels)
labels_tb2_scores_s = pd.Series(labels_tb2_scores)

before_df['label_augmented'] = labels_tb2_labels_s
before_df['score_augmented'] = labels_tb2_scores_s

In [None]:
# Split each "after" tweet-only prediction into a readable label and its score:
# 'LABEL_1' becomes 'AH' (ad hominem), any other label becomes 'NONE'.
labels_ta1_labels = ['AH' if pred['label'] == 'LABEL_1' else 'NONE' for pred in labels_ta1]
labels_ta1_scores = [pred['score'] for pred in labels_ta1]

# Assigning a Series aligns on the index, so a length mismatch would silently
# produce NaN or drop values; fail loudly instead.
assert len(labels_ta1_labels) == len(after_df), \
    "prediction count does not match the number of rows in after_df"

labels_ta1_labels_s = pd.Series(labels_ta1_labels)
labels_ta1_scores_s = pd.Series(labels_ta1_scores)

# Attach the predictions to after_df as the 'label' and 'score' columns.
after_df['label'] = labels_ta1_labels_s
after_df['score'] = labels_ta1_scores_s

In [None]:
# Split each "after" augmented-text prediction into a readable label and its score:
# 'LABEL_1' becomes 'AH' (ad hominem), any other label becomes 'NONE'.
labels_ta2_labels = ['AH' if pred['label'] == 'LABEL_1' else 'NONE' for pred in labels_ta2]
labels_ta2_scores = [pred['score'] for pred in labels_ta2]

# Assigning a Series aligns on the index, so a length mismatch would silently
# produce NaN or drop values; fail loudly instead.
assert len(labels_ta2_labels) == len(after_df), \
    "prediction count does not match the number of rows in after_df"

labels_ta2_labels_s = pd.Series(labels_ta2_labels)
labels_ta2_scores_s = pd.Series(labels_ta2_scores)

# Attach the predictions to after_df as 'label_augmented' / 'score_augmented'.
after_df['label_augmented'] = labels_ta2_labels_s
after_df['score_augmented'] = labels_ta2_scores_s

In [None]:
# Write both classified frames back to Drive as JSON Lines (one record per line).
classified_outputs = {
    '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/classified/nytimes_before.json': before_df,
    '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/classified/nytimes_after.json': after_df,
}
for out_path, classified_df in classified_outputs.items():
    classified_df.to_json(out_path, lines=True, orient="records")

In [None]:
# Display the column names of after_df, including the prediction columns added above.
after_df.columns

In [None]:
# Histogram of the tweet-only scores for the "before" set, using bin edges 0.00, 0.01, ..., 1.00.
score_bin_edges = [edge / 100 for edge in range(101)]
before_df['score'].hist(bins=score_bin_edges)