<a href="https://colab.research.google.com/github/txin-y/23springNeuralNetworkProject/blob/main/TweetTextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [2]:
!pip install torchdata==0.5.1 transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata==0.5.1
  Downloading torchdata-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting torch==1.13.1
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K

In [4]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
# df = pd.read_csv("twitter_sentiment_analysis.csv")
df = pd.read_csv("/content/train.csv")
df.to_csv("/content/train.csv", index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Twitter Sentiment Analysis**
Detecting hatred tweets, provided by Analytics Vidhya
https://www.kaggle.com/datasets/arkhoshghalb/twitter-sentiment-analysis-hatred-speech

## About Dataset

### Context

The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to classify racist or sexist tweets from other tweets.

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

### Content

Full tweet texts are provided with their labels for training data.
Mentioned users' username is replaced with @user.

### Acknowledgements

Dataset is provided by Analytics Vidhya

In [35]:
# Define the dataset class
class TwitterDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]["tweet"]
        sentiment = self.df.iloc[index]["label"]
        sequence = self.tokenizer.encode(text, max_length=self.max_length, padding="max_length", truncation=True)
        return {
            "input_ids": torch.tensor(sequence, dtype=torch.long),
            "labels": torch.tensor(sentiment, dtype=torch.float)
        }

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


# Tokenize the text
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [61]:
# Define the model
class LSTMModel(nn.Module):
    def __init__(self, embedding_size, hidden_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(len(tokenizer), embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, 2)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_output, _ = self.lstm(embedded)
        pooled = lstm_output[:, -1, :]
        dropped = self.dropout(pooled)
        logits = self.fc(dropped)
        return self.sigmoid(logits)
        # return self.fc(lstm_output[:, -1, :])

In [62]:
model = LSTMModel(embedding_size=64, hidden_size=16).to(device)

In [81]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()
    losses = []

    for idx, batch in enumerate(dataloader):
        input_ids = batch["input_ids"].to(device)
        # print(input_ids.size(0)) 16
        # labels = batch["labels"].to(device)
        labels = batch["labels"]
        labels = labels.type(torch.LongTensor).to(device)
        optimizer.zero_grad()
        outputs = model(input_ids)
        # predicted = torch.round(outputs)
        # print(outputs)
        # print(labels)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (outputs.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        losses.append(loss.item())
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f} | loss {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count, sum(losses)/ len(losses)))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader): 
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids)
            predicted = torch.round(outputs)
            total += labels.size(0)
            correct += (predicted.argmax(1) == labels).sum().item()
    return correct / total


In [83]:
from torch.utils.data.dataset import random_split

# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 16 # batch size for training
# criterion = nn.BCELoss()
criterion = nn.CrossEntropyLoss()

total_accu = None
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

# Define the datasets and dataloaders
max_length = 100
train_dataset = TwitterDataset(train_df, tokenizer, max_length)
test_dataset = TwitterDataset(test_df, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(split_valid_, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

print(len(train_dataloader)) #380 bz 64 1519 bz 16

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

1519
| epoch   1 |   500/ 1519 batches | accuracy    0.931 | loss    0.383
| epoch   1 |  1000/ 1519 batches | accuracy    0.931 | loss    0.382
| epoch   1 |  1500/ 1519 batches | accuracy    0.927 | loss    0.384
-----------------------------------------------------------
| end of epoch   1 | time: 27.87s | valid accuracy    0.929 
-----------------------------------------------------------
| epoch   2 |   500/ 1519 batches | accuracy    0.929 | loss    0.384
| epoch   2 |  1000/ 1519 batches | accuracy    0.927 | loss    0.385
| epoch   2 |  1500/ 1519 batches | accuracy    0.932 | loss    0.384
-----------------------------------------------------------
| end of epoch   2 | time: 28.47s | valid accuracy    0.929 
-----------------------------------------------------------
| epoch   3 |   500/ 1519 batches | accuracy    0.930 | loss    0.383
| epoch   3 |  1000/ 1519 batches | accuracy    0.930 | loss    0.383
| epoch   3 |  1500/ 1519 batches | accuracy    0.930 | loss    0.383
---