In [None]:
!gdown --id '1volop98hM4De7fv3ahca0ZIoMiEBHEPR' --output train.csv

Downloading...
From: https://drive.google.com/uc?id=1volop98hM4De7fv3ahca0ZIoMiEBHEPR
To: /content/train.csv
100% 178k/178k [00:00<00:00, 93.7MB/s]


## Load Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the labelled training set saved as train.csv by the download cell.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0


In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

row_id    0
TEXT      0
LABEL     0
dtype: int64

## Text preprocessing

In [None]:
import re
import string
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw text sample before tokenisation.

    Steps, in order: lowercase, replace ``<...>`` tag-like spans with a space,
    delete ASCII punctuation, collapse every run of whitespace (newlines
    included) into a single space, and strip both ends.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # `text != text` is true only for NaN; guard so missing cells do not crash
    # on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = text.translate(_PUNCT_TABLE)
    # `\s+` already matches newlines, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text).strip()

## Dataset

In [None]:
class TextDataset(Dataset):
    """Labelled text dataset that tokenises one sample per lookup.

    Every text is passed through ``clean_text`` once at construction time.
    Tokenisation happens in ``__getitem__``, padding/truncating each sample
    to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = list(map(clean_text, texts))
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading size-1 dimension (presumably the batch
        # axis added by return_tensors="pt") so the DataLoader can stack samples.
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label)
        return item

# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Plain Python lists, so the datasets can index samples positionally.
texts, labels = df['TEXT'].tolist(), df['LABEL'].tolist()

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
)

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Only the training batches are shuffled; validation keeps its order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

In [None]:
# Seed PyTorch's global RNG before the model is built so that a fresh
# Restart & Run All repeats the same run: the classification head is
# presumably initialised randomly (it is not part of the base checkpoint), and
# the shuffling DataLoader draws from the same generator. GPU kernels may
# still introduce small non-determinism.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Trailing semicolon stops the notebook from printing the full module repr.
model.to(device);

## Train

In [None]:
# Training hyper-parameters, named so they can be tuned in one place.
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
MODEL_DIR = 'bert-sentiment-model'

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Best validation accuracy seen so far; decides which epoch's weights are kept.
best_acc = float('-inf')

for epoch in range(NUM_EPOCHS):
    # --- one optimisation pass over the training batches ---
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # The batch carries 'labels', so the model output exposes the loss.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # --- accuracy on the held-out validation split ---
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    print(f'Epoch {epoch + 1}, Accuracy: {acc}')

    # Persist the weights only when validation accuracy improves. Previously the
    # last epoch was always saved, even if an earlier epoch had scored better,
    # so the accuracy computed above never influenced the model used later.
    if acc > best_acc:
        best_acc = acc
        model.save_pretrained(MODEL_DIR)

## Predict Test data

In [None]:
!gdown --id '1oc7-oV4tIlIEMVUNRAqgaDZ6FLfpZGuB' --output train_noANS.csv

Downloading...
From: https://drive.google.com/uc?id=1oc7-oV4tIlIEMVUNRAqgaDZ6FLfpZGuB
To: /content/train_noANS.csv
100% 862k/862k [00:00<00:00, 108MB/s]


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# Read the unlabelled rows that the fine-tuned model will score.
test_df = pd.read_csv(filepath_or_buffer='train_noANS.csv')

class TextDataset(Dataset):
    """Unlabelled text dataset used for prediction.

    NOTE: this definition shadows the labelled ``TextDataset`` defined earlier
    in the notebook; it takes no ``labels`` argument and its items carry no
    ``'labels'`` key.

    Texts are normalised with ``clean_text`` at construction, exactly as the
    training dataset does, so the model sees identically preprocessed input at
    training and prediction time.
    """

    def __init__(self, texts, tokenizer, max_len=512):
        # Feeding raw text here (as before) created a train/inference mismatch:
        # the model was fine-tuned on lowercased, punctuation-free text.
        self.texts = [clean_text(text) for text in texts]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for sample ``idx``."""
        text = self.texts[idx]
        inputs = self.tokenizer(text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        # squeeze(0) drops the leading size-1 dimension (presumably the batch
        # axis added by return_tensors="pt") so the DataLoader can stack samples.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

# Same base tokenizer that was used while fine-tuning.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
texts = test_df['TEXT'].tolist()

test_dataset = TextDataset(texts, tokenizer)
# No shuffling: predictions must stay aligned with the rows of `test_df`.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Reload the fine-tuned weights written by the training cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
model.to(device)
model.eval()

# Collect the arg-max class index for every row, batch by batch.
preds = []
with torch.no_grad():
    for batch in test_loader:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**on_device).logits
        preds.extend(logits.argmax(dim=1).cpu().numpy())


In [None]:
# Attach the predictions under the final column name directly. Adding a
# temporary 'PREDICTED_LABEL' column and renaming it in place made this cell
# non-idempotent: a second run (or an input that already has 'LABEL') ended up
# with two 'LABEL' columns in the frame and in the CSV.
test_df['LABEL'] = preds

test_df.to_csv('test_predictions_final.csv', index=False)

# Kept as the last expression of the cell so the preview is actually rendered.
test_df.head()