In [1]:
!pip install transformers==3.0.2



In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
# Setting up GPU

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

print (device)

cuda


In [4]:
df = pd.read_csv('Harassment_Cleaned_tweets.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Key Word,Username,User_ID,Datetime,Favorite_count,Geo,Coordinates,Label,Text,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,704,ass,DeborahParr,1.33e+18,11-11-2020 06:56,0,,,1,He’d have my phone wedged up his ass sideways.,,,
1,1915,boobies,MaxZorin85,1.33e+18,11-11-2020 07:35,4,,,0,Yep 100% agree and the same with severine in s...,,,
2,2856,eat pussy,PRISJ1_,1.33e+18,11-11-2020 10:36,0,,,1,Stop having sex with men that won’t eat your p...,,,
3,2163,Breast Man,Teresamckenzy1,1.33e+18,10-11-2020 20:52,0,,,1,"When you see a sad man, just give him breast t...",,,
4,2852,eat pussy,sj__vazquez,1.33e+18,11-11-2020 10:42,0,,,1,We can't be together if you don't eat pussy,,,


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,User_ID,Favorite_count,Label
count,3604.0,3604.0,3604.0,3604.0
mean,1801.632908,1.329972e+18,1.429245,0.547447
std,1040.729184,3991290000000000.0,10.752237,0.497813
min,0.0,1.32e+18,0.0,0.0
25%,900.75,1.33e+18,0.0,0.0
50%,1801.5,1.33e+18,0.0,1.0
75%,2702.25,1.33e+18,1.0,1.0
max,3604.0,1.35e+18,396.0,1.0


In [6]:
#Preparing Dataset and Dataloader

# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', truncation=True, do_lower_case=True)

In [7]:
class TweetData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Text
        self.targets = self.data.Label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
train_size = 0.8
train_data=df.sample(frac=train_size,random_state=200)
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = TweetData(train_data, tokenizer, MAX_LEN)
testing_set = TweetData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (3604, 13)
TRAIN Dataset: (2883, 13)
TEST Dataset: (721, 13)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
#Base ALBERT model
class ALBERTClass(torch.nn.Module):
    def __init__(self):
        super(ALBERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("albert-base-v2")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
model = ALBERTClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=47376696.0, style=ProgressStyle(descrip…




ALBERTClass(
  (l1): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
            

In [15]:
#Finetuning Albert model

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [17]:
# Defining the training function on the 80% of the dataset

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [18]:
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

1it [00:00,  3.77it/s]

Training Loss per 5000 steps: 1.4438475370407104
Training Accuracy per 5000 steps: 62.5


361it [01:24,  4.29it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 0: 71.2799167533819
Training Loss Epoch: 0.5888301920032237
Training Accuracy Epoch: 71.2799167533819
Training Loss per 5000 steps: 0.4191817343235016
Training Accuracy per 5000 steps: 75.0


361it [01:24,  4.29it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 1: 82.13666319805758
Training Loss Epoch: 0.4068267911858341
Training Accuracy Epoch: 82.13666319805758
Training Loss per 5000 steps: 0.5928189754486084
Training Accuracy per 5000 steps: 75.0


361it [01:24,  4.29it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 2: 88.4148456468956
Training Loss Epoch: 0.29671026669298184
Training Accuracy Epoch: 88.4148456468956
Training Loss per 5000 steps: 0.32403868436813354
Training Accuracy per 5000 steps: 87.5


361it [01:24,  4.29it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 3: 91.6406520985085
Training Loss Epoch: 0.21257639605644832
Training Accuracy Epoch: 91.6406520985085
Training Loss per 5000 steps: 0.2618822455406189
Training Accuracy per 5000 steps: 87.5


361it [01:24,  4.29it/s]

The Total Accuracy for Epoch 4: 95.03988900450919
Training Loss Epoch: 0.14850053946562422
Training Accuracy Epoch: 95.03988900450919





In [19]:
#Testing the trained model

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [20]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

5it [00:00, 19.90it/s]

Validation Loss per 100 steps: 0.14967863261699677
Validation Accuracy per 100 steps: 100.0


181it [00:08, 21.54it/s]

Validation Loss Epoch: 0.553522787617715
Validation Accuracy Epoch: 81.55339805825243
Accuracy on test data = 81.55%



