In [1]:
import numpy as np
import pandas as pd
import time
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TextClassificationPipeline
import torch

In [2]:
%%time
# Read only the columns this notebook uses from the raw review dump.
columns_to_load = ['app_name', 'language', 'review','recommended']
df = pd.read_csv('steam_reviews.csv', usecols=columns_to_load)

# Keep English-language rows that actually contain review text.
is_english = df['language'] == 'english'
eng_df_og = df.loc[is_english].dropna(subset=['review'])

# Work on a reproducible 1% sample with a fresh 0..n-1 index, then recode
# `recommended` through the True/False -> 1/0 mapping.
eng_df = eng_df_og.sample(frac=0.01, random_state=42).reset_index(drop=True)
eng_df['recommended'] = eng_df['recommended'].map({True: 1, False: 0})
def preprocess_review(review):
    """Return the review text converted to lowercase.

    Args:
        review: The review to normalise. Non-string values (for example a
            purely numeric review that was parsed as a number) are converted
            with ``str`` first instead of raising ``AttributeError``.

    Returns:
        str: The lowercased text.
    """
    return str(review).lower()

# Store the normalised text in its own column, then discard the raw text column.
eng_df['processed_review'] = eng_df['review'].map(preprocess_review)
eng_df = eng_df.drop('review', axis=1)

CPU times: total: 1min 15s
Wall time: 1min 15s


In [4]:
# Hold out 10% of the sampled reviews for validation; the fixed seed keeps the split reproducible.
train_df, valid_df = train_test_split(
    eng_df,
    test_size=0.1,
    random_state=1,
)

## Fine-tune the RoBERTa model

In [5]:
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


class ReviewDataset(Dataset):
    """Map-style dataset that pairs review texts with labels and tokenizes on access.

    Each item is a dict with the keys ``review_text``, ``input_ids``,
    ``attention_mask`` and ``labels``.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        """Store the inputs and warn when reviews and labels are not aligned.

        Args:
            reviews: Indexable sequence of review texts.
            labels: Indexable sequence of class labels, one per review.
            tokenizer: Callable tokenizer that accepts the keyword arguments
                used in ``__getitem__`` and returns ``input_ids`` and
                ``attention_mask`` tensors.
            max_len: Length every encoded review is padded or truncated to.
        """
        # Imported here so the class works without touching the import cell.
        import warnings

        # A length mismatch almost always means texts and labels come from
        # different splits. Warn instead of raising so existing callers still
        # run; __len__ then exposes only the overlapping prefix.
        if len(reviews) != len(labels):
            warnings.warn(
                f"ReviewDataset: reviews and labels differ in length "
                f"({len(reviews)} vs {len(labels)}); only the first "
                f"{min(len(reviews), len(labels))} pairs will be used.",
                stacklevel=2,
            )

        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of usable pairs (the shorter of reviews and labels)."""
        return min(len(self.reviews), len(self.labels))

    def __getitem__(self, item):
        """Tokenize and return the review/label pair at position ``item``.

        Raises:
            IndexError: If ``item`` is at or beyond the end of either sequence.
        """
        if item >= len(self.reviews) or item >= len(self.labels):
            raise IndexError("Index out of bounds")
        review = str(self.reviews[item])
        label = self.labels[item]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`;
        # the keyword arguments and the returned fields are the same.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # Flatten the tokenizer's tensors to 1-D so the default collate can stack them.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Each split must take its reviews AND its labels from the same DataFrame so
# that every text stays paired with its own label.
train_dataset = ReviewDataset(train_df['processed_review'].to_numpy(), train_df['recommended'].to_numpy(), tokenizer, max_len=512)
val_dataset = ReviewDataset(valid_df['processed_review'].to_numpy(), valid_df['recommended'].to_numpy(), tokenizer, max_len=512)

# Create Data Loaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

Using device: cuda


In [6]:
from transformers import RobertaForSequenceClassification

# Pretrained RoBERTa encoder with a two-class sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)
# Left as the cell's final bare expression so the moved model is displayed.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [7]:
from torch.optim import AdamW
from tqdm import tqdm

# The AdamW shipped with `transformers` is deprecated in favour of PyTorch's
# implementation. `eps` and `weight_decay` are set explicitly to the values the
# transformers version used by default (1e-6 and 0.0), because torch's own
# defaults differ (1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Function for training a single epoch
def train_epoch(model, data_loader, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.logits`` and ``.loss``.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64
        tensor: correct predictions divided by the number of samples actually
        processed. ``mean_loss`` is the mean of the per-batch losses. Both are
        NaN when ``data_loader`` yields no batches.
    """
    model.train()
    batch_losses = []
    # Tensor accumulator, so the final `.double()` also works when no batch was seen.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count processed samples rather than trusting len(dataset), which
    # over-counts when the loader drops or subsamples data.
    samples_seen = 0

    for batch in tqdm(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients BEFORE the backward pass so that anything left over
        # from an interrupted earlier run cannot leak into the first update.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        samples_seen += labels.size(0)
        batch_losses.append(loss.item())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(batch_losses) if batch_losses else np.float64("nan")
    return correct_predictions.double() / samples_seen, mean_loss



In [8]:
# A checkpoint is written after each of these epochs; training runs through the largest one.
save_epochs = [3, 4, 5,10]
final_epoch = max(save_epochs)

for epoch in range(1, final_epoch + 1):
    print(f'Epoch {epoch}/{max(save_epochs)}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss {train_loss}, accuracy {train_acc}')

    # Nothing more to do unless this epoch is a checkpoint epoch.
    if epoch not in save_epochs:
        continue

    # Persist this epoch's weights, then release cached CUDA memory.
    model.save_pretrained(f'./Fine_tuned_epo{epoch}')
    torch.cuda.empty_cache()

Epoch 1/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:36<00:00,  5.21it/s]


Train loss 0.16235721497728484, accuracy 0.9396081693004343
Epoch 2/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:32<00:00,  5.22it/s]


Train loss 0.12100478720554594, accuracy 0.956958691433324
Epoch 3/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:44<00:00,  5.19it/s]


Train loss 0.09927873677938932, accuracy 0.9653567137972461
Epoch 4/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:25<00:00,  5.24it/s]


Train loss 0.08311219590055033, accuracy 0.9711094168745956
Epoch 5/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:26<00:00,  5.24it/s]


Train loss 0.07089159330419197, accuracy 0.9760304038443767
Epoch 6/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:30<00:00,  5.23it/s]


Train loss 0.06548078860300377, accuracy 0.9784446908788467
Epoch 7/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:32<00:00,  5.22it/s]


Train loss 0.05977226452170267, accuracy 0.9809051843637372
Epoch 8/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:31<00:00,  5.22it/s]


Train loss 0.055988363729257404, accuracy 0.9816098327326495
Epoch 9/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [35:00<00:00,  5.15it/s]


Train loss 0.05272494979844823, accuracy 0.9828458552813972
Epoch 10/10


100%|████████████████████████████████████████████████████████████████████████████| 10821/10821 [34:55<00:00,  5.16it/s]


Train loss 0.04995393273856019, accuracy 0.9843822197578782
