In [6]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split

In [29]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:0


In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

In [4]:
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [7]:
# Training CSV: the `review` column holds the text, the `rating` column the label.
data = pd.read_csv("train_data.csv")
reviews, ratings = data["review"].values, data["rating"].values

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
train_reviews, val_reviews, train_ratings, val_ratings = train_test_split(
    reviews,
    ratings,
    test_size=0.2,
    random_state=42,
)

# Test CSV: only the `review` column is read from it.
test_data = pd.read_csv("test_data.csv")
test_reviews = test_data["review"].values

In [14]:
from datasets import DatasetDict, Dataset

# Every split is built from a column dict: review text, rating label and a running row index.

# train split
train_data = dict(
    review=train_reviews,
    rating=train_ratings,
    idx=[*range(len(train_reviews))],
)
train_dataset = Dataset.from_dict(train_data)

# validation split
val_data = dict(
    review=val_reviews,
    rating=val_ratings,
    idx=[*range(len(val_reviews))],
)
val_dataset = Dataset.from_dict(val_data)

# test split: a constant 0 rating is filled in so that all splits share the same columns
test_data = dict(
    review=test_reviews,
    rating=[0] * len(test_reviews),
    idx=[*range(len(test_reviews))],
)
test_dataset = Dataset.from_dict(test_data)

# bundle the three splits under their conventional names
data_dict = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

print(data_dict)

DatasetDict({
    train: Dataset({
        features: ['review', 'rating', 'idx'],
        num_rows: 13113
    })
    validation: Dataset({
        features: ['review', 'rating', 'idx'],
        num_rows: 3279
    })
    test: Dataset({
        features: ['review', 'rating', 'idx'],
        num_rows: 4099
    })
})


In [None]:
# Checkpoint shared by the tokenizer and the models loaded from it.
model_name = "distilbert-base-uncased"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bare encoder without a task head.
model = AutoModel.from_pretrained(model_name)

In [None]:
# Same checkpoint with a sequence-classification head producing 5 logits (one per rating class).
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

In [None]:
# Display the module tree of the classifier as the cell output.
sentiment_model

In [19]:
def tokenize_function(examples):
    """Tokenize the ``review`` entries of a batch with the module-level ``tokenizer``.

    Sequences are padded with ``padding="max_length"`` and truncated, so every
    encoded example in the result has the same length.

    Args:
        examples: Mapping with a ``"review"`` key holding the texts to encode.

    Returns:
        The encoding produced by ``tokenizer`` for those texts.
    """
    review_texts = examples["review"]
    encoded = tokenizer(review_texts, padding="max_length", truncation=True)
    return encoded


In [36]:
# Tokenize every split in batches, then drop the columns that are not fed to the model
# (the raw text and the row index); `rating` is kept as the label column.
tokenized_datasets = (
    data_dict
    .map(tokenize_function, batched=True)
    .remove_columns(["review", "idx"])
)

# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")

train_dataset, eval_dataset, test_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "validation", "test")
)

Map:   0%|          | 0/13113 [00:00<?, ? examples/s]

Map:   0%|          | 0/3279 [00:00<?, ? examples/s]

Map:   0%|          | 0/4099 [00:00<?, ? examples/s]

In [37]:
# Sanity check: show the remaining columns and the row count of the tokenized train split.
print(train_dataset)

Dataset({
    features: ['rating', 'input_ids', 'attention_mask'],
    num_rows: 13113
})


In [38]:
# Mini-batches of 16 examples for every split; only the training batches are shuffled,
# so validation and test batches keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=16)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16)

In [None]:
# Adam over every parameter of the classifier, with a fine-tuning learning rate of 5e-5.
optimizer = Adam(params=sentiment_model.parameters(), lr=5e-5)

# Move the model to the selected device; as the last expression, the returned module is displayed.
sentiment_model.to(device)

In [28]:
# calculating weight for each class
class_counts = data['rating'].value_counts().sort_index().values
print(class_counts)
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float, device=device)
print(class_weights)

[1137 1434 1747 4831 7243]
tensor([0.0009, 0.0007, 0.0006, 0.0002, 0.0001], device='cuda:0')


In [48]:
# Restore previously saved weights. map_location remaps the stored tensors onto the
# currently selected device, so a checkpoint written on a GPU also loads on a CPU-only host.
sentiment_model.load_state_dict(torch.load("sentiment_model_dict.pt", map_location=device))

<All keys matched successfully>

In [49]:
num_epochs = 10
# Cross-entropy weighted by `class_weights`, so each class contributes according to its weight.
loss_fun = nn.CrossEntropyLoss(weight=class_weights)

for epoch in range(num_epochs):
    # Switch to training mode explicitly: if the evaluation cell ran before this one,
    # the model is still in eval mode and dropout would stay disabled while fine-tuning.
    sentiment_model.train()
    losses = []
    for batch in tqdm(train_dataloader):
        labels = batch["rating"].to(device)
        batch = {
            "attention_mask": batch["attention_mask"].to(device),
            "input_ids": batch["input_ids"].to(device),
        }

        # Clear gradients before the forward pass so that leftovers from an interrupted
        # run of this cell cannot leak into the first update.
        optimizer.zero_grad()
        outputs = sentiment_model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    # Mean training loss of the epoch.
    print(np.mean(losses))

# Persist the fine-tuned weights once all epochs are done.
torch.save(sentiment_model.state_dict(), "sentiment_model_dict.pt")

100%|██████████| 820/820 [09:57<00:00,  1.37it/s]


0.20906379648552434


In [50]:
# Accuracy of the classifier on the validation split.
metric = evaluate.load("accuracy")
sentiment_model.eval()

# No gradients are needed anywhere in the evaluation pass.
with torch.no_grad():
    for eval_batch in eval_dataloader:
        references = eval_batch["rating"].to(device)
        model_inputs = {
            "attention_mask": eval_batch["attention_mask"].to(device),
            "input_ids": eval_batch["input_ids"].to(device),
        }
        logits = sentiment_model(**model_inputs).logits
        # The predicted class is the index of the largest logit.
        metric.add_batch(predictions=logits.argmax(dim=-1), references=references)

metric.compute()

{'accuracy': 0.6239707227813358}

In [51]:
# generating predictions on test data

# Put the model into eval mode here so the predictions do not depend on the
# evaluation cell having been run beforehand (dropout must be off for inference).
sentiment_model.eval()

test_predicted = []
with torch.no_grad():
    for batch in test_dataloader:
        # The `rating` column of the test split is not read: only the encoded text is fed in.
        batch = {
            "attention_mask": batch["attention_mask"].to(device),
            "input_ids": batch["input_ids"].to(device),
        }
        logits = sentiment_model(**batch).logits
        predictions = torch.argmax(logits, dim=-1)
        test_predicted.extend(predictions.cpu().numpy())

# Open with "w" rather than "a": the file must hold exactly one prediction per test row,
# and appending would stack a duplicate set of predictions on every re-run of this cell.
with open("poniedzialek_grunwald_rozkosz.csv", "w") as results_file:
    results_file.writelines(str(prediction.item()) + "\n" for prediction in test_predicted)