In [1]:
# Mount Google Drive into the Colab runtime so files under /content/drive are readable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install transformers

In [3]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel

### **Data Preparation**

In [4]:
# Path to the labelled reviews text file on the mounted Drive.
data_path = "/content/drive/MyDrive/datasets/test.ft.txt"

In [5]:
# Label legend for the raw file:
#   __label__1 -> 1-2 star reviews
#   __label__2 -> 4-5 star reviews

# Read with an explicit encoding so the result does not depend on the runtime's
# locale default; `data` holds one raw line per review.
with open(data_path, encoding="utf-8") as f:
  data = f.readlines()

In [6]:
# Only the first 10,000 lines are used. Each line starts with the 9-character
# prefix "__label__", so character 9 is the label digit and everything from
# character 10 onwards is kept as the review text.
subset = data[:10000]
sentiments = [line[9] for line in subset]
reviews = [line[10:] for line in subset]

In [None]:
# Sanity check: the two parallel lists should have the same length.
len(sentiments), len(reviews)

(10000, 10000)

In [7]:
# Assemble the parallel lists into a two-column frame and preview the first rows.
data_dict = dict(reviews=reviews, sentiment=sentiments)

df = pd.DataFrame(data=data_dict)
df.head()

Unnamed: 0,reviews,sentiment
0,Great CD: My lovely Pat has one of the GREAT ...,2
1,One of the best game music soundtracks - for ...,2
2,Batteries died within a year ...: I bought th...,1
3,"works fine, but Maha Energy is better: Check ...",2
4,Great for the non-audiophile: Reviewed quite ...,2


In [None]:
# Count missing values per column.
df.isna().sum()

reviews      0
sentiment    0
dtype: int64

In [8]:
# Map the raw label digits to class indices: '1' (1-2 stars) -> 0, '2' (4-5 stars) -> 1.
mapping = {
    '1': 0,
    '2': 1
}

# Derive the column from the raw `sentiments` list rather than from df['sentiment']
# itself, so re-running this cell is idempotent (mapping the already-mapped ints
# through string keys would turn every value into NaN).
df['sentiment'] = pd.Series(sentiments, index=df.index).map(mapping)

# Fail loudly on labels missing from `mapping` instead of carrying NaN into training.
assert df['sentiment'].notna().all(), "unexpected label found in raw data"

In [None]:
# Inspect the class balance after mapping.
df['sentiment'].value_counts()

1    5125
0    4875
Name: sentiment, dtype: int64

### **Create DistilBert Tokenizer**

In [9]:
# Load the pretrained uncased DistilBERT tokenizer (files are downloaded on first use).
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Example of Tokenizer usage

In [None]:
# Encode the first ten reviews as one batch of PyTorch tensors, each sequence
# padded/truncated to 256 tokens. The texts are passed as a plain list of strings
# (the documented input type) and the tokenizer is called directly, which is the
# recommended replacement for batch_encode_plus.
encodings = tokenizer(
    df['reviews'].iloc[:10].tolist(),
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

In [None]:
# Token ids of the first encoded review.
encodings.input_ids[0]

tensor([  101,  2307,  3729,  1024,  2026,  8403,  6986,  2038,  2028,  1997,
         1996,  2307,  5755,  1997,  2014,  4245,  1012,  1045,  2031,  7791,
         2000,  2023,  3729,  2005,  2086,  1998,  1045,  2145,  2293,  2009,
         1012,  2043,  1045,  1005,  1049,  1999,  1037,  2204,  6888,  2009,
         3084,  2033,  2514,  2488,  1012,  1037,  2919,  6888,  2074,  9345,
        17822,  8520,  2066,  5699,  1999,  1996,  4542,  1012,  2023,  3729,
         2074,  1051, 18153,  2229,  2166,  1012,  2955,  2024, 18414, 16846,
        24646,  4609,  5582,  1998,  4581,  2074,  3102,  1012,  2028,  1997,
         2166,  1005,  1055,  5023, 20296,  1012,  2023,  2003,  1037,  5532,
         8842,  3729,  1999,  2026,  2338,  1012,  2339,  2016,  2196,  2081,
         2009,  2502,  2003,  2074,  3458,  2033,  1012,  2296,  7292,  1045,
         2377,  2023,  1010,  2053,  3043,  2304,  1010,  2317,  1010,  2402,
         1010,  2214,  1010,  3287,  1010,  2931,  7955,  2758, 

### **Create Pytorch Dataset and Dataloader**

In [10]:
class AmazonReviewsDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    df: DataFrame with a 'reviews' column (text) and a 'sentiment' column
      (values convertible to int).
    maxlen: Length every encoded sequence is padded/truncated to.
    tokenizer: Optional tokenizer to reuse. When omitted, the
      "distilbert-base-uncased" tokenizer is loaded, as before.
  """

  def __init__(self, df, maxlen=256, tokenizer=None):
    self.df = df
    self.maxlen = maxlen
    # Letting callers pass a tokenizer avoids re-loading it for every dataset instance.
    if tokenizer is None:
      tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    self.tokenizer = tokenizer

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return a dict with 'input_ids', 'attention_mask' and 'labels' for one row.

    `index` is positional (used with .iloc), so it is independent of the
    frame's index labels.
    """
    # Collapse every run of whitespace (including the trailing newline) to one space.
    review = ' '.join(self.df['reviews'].iloc[index].split())
    sentiment = int(self.df['sentiment'].iloc[index])

    encodings = self.tokenizer(
        review,
        add_special_tokens=True,
        max_length=self.maxlen,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # return_tensors='pt' adds a leading batch dimension of 1; flatten it away so a
    # DataLoader can stack items into (batch, maxlen) tensors.
    return {
        'input_ids': encodings.input_ids.flatten(),
        'attention_mask': encodings.attention_mask.flatten(),
        'labels': torch.tensor(sentiment, dtype=torch.long)
    }

In [11]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)
train_df.shape[0], test_df.shape[0]

(7000, 3000)

In [12]:
# Wrap each split in the tokenizing dataset; the held-out split serves as validation data.
train_dataset, valid_dataset = (
    AmazonReviewsDataset(split) for split in (train_df, test_df)
)

In [13]:
# Both loaders use batches of 32; only the training loader shuffles.
loader_kwargs = dict(batch_size=32)

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)

valid_loader = torch.utils.data.DataLoader(valid_dataset, **loader_kwargs)

In [None]:
# Pull a single batch and print the shape of each tensor the model will receive.
batch = next(iter(train_loader))
for key in ('input_ids', 'attention_mask', 'labels'):
  print(batch[key].shape)

torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])


### **Model Creation**

In [14]:
class SentimentClassifier(nn.Module):
  """DistilBERT encoder followed by a two-layer classification head.

  The head consumes the first-token ([CLS]) vectors of the encoder's last four
  hidden layers, concatenated (4 x 768 = 3072 features), and returns raw,
  unnormalised logits for the two sentiment classes.
  """

  def __init__(self):
    super(SentimentClassifier, self).__init__()
    self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.drop0 = nn.Dropout(0.25)
    self.linear1 = nn.Linear(3072, 512)
    self.relu1 = nn.ReLU()
    self.drop1 = nn.Dropout(0.25)
    self.linear2 = nn.Linear(512, 2)

  def forward(self, input_ids, attention_mask):
    """Return logits of shape (batch, 2) for the given token ids and attention mask."""
    outputs = self.distilbert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True,
        return_dict=True
    )
    # hidden_states is a tuple with the embedding output followed by one tensor per
    # transformer layer. Take position 0 ([CLS]) from each of the last four layers.
    # Indexing the *sequence* axis with -4..-1 (as before) reads the final token
    # positions, which are padding whenever inputs are padded to max_length.
    hidden_states = outputs.hidden_states
    pooled_output = torch.cat([layer[:, 0] for layer in hidden_states[-4:]], dim=-1)
    x = self.drop0(pooled_output)
    x = self.relu1(self.linear1(x))
    x = self.drop1(x)
    # No activation on the output: CrossEntropyLoss expects unbounded logits, and a
    # ReLU here would clamp them to >= 0 and zero the gradient of negative logits.
    return self.linear2(x)

In [None]:
# Prefer the GPU when one is visible; the trailing expression echoes the model summary.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentClassifier()
model.to(device)

### **Train Loop**

In [19]:
# Loss over the two-class logits produced by the model.
criterion = nn.CrossEntropyLoss().to(device)
# The optimizer updates every parameter, including the pretrained encoder.
# A step size of 1e-3 is far above the range normally used when fine-tuning
# pretrained transformer weights (roughly 2e-5 to 5e-5) and tends to destroy
# the pretrained representation, so a conventional fine-tuning rate is used.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
epochs = 5

In [21]:
# One pass over the training data followed by one pass over the validation data
# per epoch. Progress bars show the current batch loss while training and the
# running mean loss / accuracy while validating.
for epoch in range(epochs):
  # TRAIN
  model.train()
  train_loop = tqdm(train_loader, desc=f"Training Epoch: {epoch}")
  for batch in train_loop:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    output = model(input_ids, attention_mask)
    loss = criterion(output, labels)
    loss.backward()
    # clip_grad_norm_ is the in-place replacement for the deprecated clip_grad_norm.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    train_loop.set_postfix(loss=loss.item())

  # VALIDATION
  model.eval()
  valid_loss_sum = 0.0
  valid_correct = 0
  valid_seen = 0
  valid_loop = tqdm(valid_loader, desc=f"Validation Epoch: {epoch}")
  # Evaluation needs no gradients; disabling autograd saves memory and time.
  with torch.no_grad():
    for batch in valid_loop:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      output = model(input_ids, attention_mask)
      loss = criterion(output, labels)

      # Weight each batch by its size so the final (possibly smaller) batch does
      # not skew the epoch-level mean.
      batch_size = labels.size(0)
      valid_loss_sum += loss.item() * batch_size
      valid_correct += (output.argmax(dim=-1) == labels).sum().item()
      valid_seen += batch_size

      valid_loop.set_postfix(
          loss=valid_loss_sum / valid_seen,
          acc=valid_correct / valid_seen
      )

  0%|          | 0/219 [00:00<?, ?it/s]

  del sys.path[0]


  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

In [25]:
# Classify one held-out review and print it next to its ground-truth class index.
test_sample = test_df['reviews'].iloc[100]
original_label = test_df['sentiment'].iloc[100]

print(test_sample)
print(original_label)

encodings = tokenizer(
    test_sample,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Select inference behaviour (dropout off) explicitly rather than relying on the
# mode left behind by an earlier cell.
model.eval()
# Send the inputs to wherever the model lives instead of moving the model to the
# CPU, which would break a later re-run of the training cell on GPU.
model_device = next(model.parameters()).device

with torch.no_grad():
  preds = model(
      encodings['input_ids'].to(model_device),
      encodings['attention_mask'].to(model_device)
  )
  preds = preds.argmax(dim=-1)
  output = preds.item()
  # Printed in the same 0/1 encoding as original_label so the two lines are
  # directly comparable (adding 1 would report the prediction as 1/2 instead).
  print(output)