In [None]:
# !pip install transformers

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

In [2]:
# Relative path to the Amazon Fine Food Reviews CSV.
data_path = "../input/amazon-fine-food-reviews/Reviews.csv"

In [3]:
# Read the whole CSV into a DataFrame.
df = pd.read_csv(data_path)

In [4]:
# Summary of columns, dtypes and non-null counts.
df.info()

In [5]:
# Distinct Score values and how often each occurs.
np.unique(df.Score,return_counts=True)

In [6]:
# Class sizes under the labelling applied in the next cell (Score == 3 falls in neither group).
print("Negative review count:",sum(df.Score <=2))
print("Positive review count:",sum(df.Score >3))

In [7]:
# Write a binary label into 'Sentiment': 0 where Score <= 2, 1 where Score > 3.
# Rows with Score == 3 are not assigned here; they are dropped two cells below.
df.loc[df['Score'] <=2, 'Sentiment'] = int(0)
df.loc[df['Score'] > 3, 'Sentiment'] = int(1)

In [8]:
df.head(2)

In [9]:
# Remove the Score == 3 rows from `df` in place.
df.drop(df[df['Score']==3].index,inplace=True)

In [10]:
df.shape

In [11]:
# Re-check the Score distribution after the drop.
np.unique(df.Score,return_counts=True)

In [12]:
# Class sizes of the rows that remain.
print("Negative review count:",sum(df.Score <=2))
print("Positive review count:",sum(df.Score >3))

In [13]:
# Keep only the model inputs. `.copy()` makes this an independent frame, so the
# column assignments performed on it in later cells do not raise pandas'
# SettingWithCopyWarning for writing into a selection of `df`.
df_sentiment = df[['Text','Sentiment']].copy()

In [14]:
df_sentiment.head()

In [15]:
# Distinct labels and their counts in the reduced frame.
np.unique(df_sentiment.Sentiment, return_counts=True)

In [16]:
# Horizontal bar chart of the label counts, smallest class first.
df_sentiment.Sentiment.value_counts().sort_values().plot(kind = 'barh')

In [17]:
from bs4 import BeautifulSoup
import re,string,unicodedata

In [18]:
def strip_html(text):
    """Parse `text` with the "html.parser" backend and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) deleted."""
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URLs
def remove_urls(text):
    """Return `text` with every run starting at 'http' up to the next whitespace deleted."""
    return re.sub(r'http\S+', '', text)


# Removing the noisy text
def denoise_text(text):
    """Clean one raw review: strip HTML, then bracketed spans, then URLs.

    The URL remover used to be defined under the name
    `remove_between_square_brackets` as well, which silently replaced the
    bracket remover; the two steps now have distinct names and both run.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    return text
# Apply denoise_text to every review in the 'Text' column
df_sentiment['Text']=df_sentiment['Text'].apply(denoise_text)

In [19]:
# Count missing values per column after cleaning.
df_sentiment.isnull().sum()

In [20]:
df_sentiment.head()

In [21]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [22]:
import nltk

# Fetch the NLTK stop-word corpus that `stopwords.words` reads from.
nltk.download('stopwords')
# Tokens to discard later: English stop words plus every single character of
# `string.punctuation`.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [23]:
stemmer = PorterStemmer()


def stem_text(text):
    """Drop stop-word / punctuation tokens from `text` and Porter-stem the rest.

    `text` is split on whitespace; a token is discarded when its lower-cased
    form is in the module-level `stop` set, otherwise it is stemmed with the
    module-level `stemmer`. The surviving stems are joined with single spaces.

    NOTE(review): the result is later fed to a pretrained DistilBERT tokenizer;
    confirm that stemmed, stop-word-free input is actually what is wanted there.
    """
    tokens = (raw.strip() for raw in text.split())
    return " ".join(
        stemmer.stem(token) for token in tokens if token.lower() not in stop
    )

In [24]:
# Replace each review with its stop-word-filtered, stemmed form.
df_sentiment.Text = df_sentiment.Text.apply(stem_text)

In [25]:
df_sentiment.head()

In [26]:
# Module-level tokenizer; the single-sample prediction cell further down uses it.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [27]:
class AmazonFineFoodReviews(torch.utils.data.Dataset):
    """Map-style dataset yielding one tokenised review and its label per index.

    Args:
        df: DataFrame with a 'Text' column (review strings) and a 'Sentiment'
            column (values convertible with `int`).
        maxlen: Length every encoded review is padded / truncated to.
            Defaults to 256, the value that used to be hard-coded.
        tokenizer: Callable tokenizer used to encode reviews. When omitted,
            the pretrained "distilbert-base-uncased" tokenizer is loaded, as
            before. Passing one in lets several datasets share a single
            instance instead of each loading its own.
    """

    def __init__(self, df, maxlen=256, tokenizer=None):
        self.df = df
        self.maxlen = maxlen
        if tokenizer is None:
            tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of reviews in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Encode the review at positional `index`.

        Returns:
            dict with 1-D tensors 'input_ids' and 'attention_mask' (flattened
            tokenizer output) and a scalar long tensor 'labels'.
        """
        # Collapse any run of whitespace (including newlines) to one space.
        review = ' '.join(self.df['Text'].iloc[index].split())
        sentiment = int(self.df['Sentiment'].iloc[index])

        # Calling the tokenizer directly replaces the deprecated `encode_plus`;
        # for a single string the two are equivalent.
        encodings = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten removes it.
            'input_ids': encodings['input_ids'].flatten(),
            'attention_mask': encodings['attention_mask'].flatten(),
            'labels': torch.tensor(sentiment, dtype=torch.long)
        }

In [28]:
# 70 / 30 split with a fixed seed; `test_df` serves as the validation set below.
train_df, test_df = train_test_split(df_sentiment, test_size=0.3, random_state=42)
len(train_df), len(test_df)

In [29]:
# Wrap each split in the tokenising dataset.
train_dataset = AmazonFineFoodReviews(train_df)
valid_dataset = AmazonFineFoodReviews(test_df)


In [30]:
# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

# No shuffling here, so validation outputs stay in the row order of `test_df`.
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=32
)

In [31]:
# Distinct labels present in the training split.
np.unique(train_loader.dataset.df['Sentiment'])

In [32]:
# Sanity check: print the tensor shapes of the first training batch only.
for batch in train_loader:
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['labels'].shape)
    break

In [33]:
class SentimentClassifier(nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    The head input is the first-token ([CLS] position) vector taken from each
    of the last four encoder hidden layers, concatenated along the feature
    axis; this is what `linear1`'s 3072 input features correspond to for a
    768-wide encoder.

    `forward` returns raw, unbounded logits of shape (batch, 2), which is what
    `nn.CrossEntropyLoss` expects.
    """

    # Number of final hidden layers whose first-token vectors are concatenated.
    N_POOLED_LAYERS = 4

    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.drop0 = nn.Dropout(0.25)
        self.linear1 = nn.Linear(3072, 512)
        self.relu1 = nn.ReLU()
        self.drop1 = nn.Dropout(0.25)
        self.linear2 = nn.Linear(512, 2)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of encoded reviews.

        Args:
            input_ids: token-id tensor, one row per review.
            attention_mask: tensor of the same shape marking real tokens.

        Returns:
            Tensor of shape (batch, 2) with unnormalised class scores.
        """
        outputs = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        # `hidden_states` holds one (batch, seq, hidden) tensor per layer.
        # Indexing the last hidden state with [:, -4..-1], as was done before,
        # selects the last four *token positions* - padding for any review
        # shorter than the padded length - rather than the last four layers.
        cls_per_layer = [
            layer[:, 0] for layer in outputs.hidden_states[-self.N_POOLED_LAYERS:]
        ]
        pooled_output = torch.cat(cls_per_layer, dim=-1)
        x = self.drop0(pooled_output)
        x = self.relu1(self.linear1(x))
        x = self.drop1(x)
        # No activation on the output: a ReLU here would clamp negative logits
        # to zero and block their gradient before the cross-entropy loss.
        return self.linear2(x)

In [34]:
# Build the classifier and move it to the GPU when one is available.
model = SentimentClassifier()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [38]:
np.array(train_loader.dataset.df['Sentiment'])

In [39]:
# Per-class loss weights computed from the training labels with scikit-learn's
# 'balanced' mode, to counter the class imbalance.
class_weights=class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = np.unique(train_loader.dataset.df['Sentiment']),
    y = np.array(train_loader.dataset.df['Sentiment'])
)

In [40]:
# Convert to a float tensor so it can be passed as the loss `weight`.
class_weights=torch.tensor(class_weights,dtype=torch.float)

In [41]:
print(class_weights)

In [43]:
# Class-weighted cross-entropy; `.to(device)` moves the weight tensor next to the model.
criterion = nn.CrossEntropyLoss(weight=class_weights,reduction='mean').to(device)
# Every parameter of the pretrained encoder is being updated, so use a learning
# rate in the range customary for BERT-style fine-tuning (2e-5 to 5e-5). The
# previous 1e-3 is large enough to wipe out the pretrained weights.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
epochs = 2

In [45]:
for epoch in range(epochs):

    # TRAIN
    model.train()
    # The description does not change within an epoch, so set it once here.
    train_loop = tqdm(train_loader, desc=f"Training Epoch: {epoch}")
    for batch in train_loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        output = model(input_ids, attention_mask)
        loss = criterion(output, labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update. The trailing
        # underscore variant replaces the deprecated `clip_grad_norm`.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loop.set_postfix(loss=loss.item())

In [None]:
# VALIDATION
val_accuracy = []
val_loss = []
all_output = []
model.eval()
valid_loop = tqdm(valid_loader)
# Inference only. Without no_grad, each stored output would keep its whole
# autograd graph alive for the duration of the pass.
with torch.no_grad():
    for batch in valid_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        output = model(input_ids, attention_mask)
        # Collect logits on the CPU so the accumulated outputs do not occupy GPU memory.
        all_output.append(output.cpu())

        loss = criterion(output, labels)
        val_loss.append(loss.item())

        preds = torch.argmax(output, dim=1).flatten()

        # Per-batch accuracy in percent.
        accuracy = (preds == labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

        # `epoch` is whatever value the training loop left behind.
        valid_loop.set_description(f"Validation Epoch: {epoch}")
        valid_loop.set_postfix(loss=loss.item())
val_loss = np.mean(val_loss)
val_accuracy = np.mean(val_accuracy)
all_output = torch.cat(all_output, dim=0)
# The notebook's import cell does not bind `F`, so `F.softmax` raised a
# NameError here; `torch.softmax` computes the same thing.
probs = torch.softmax(all_output, dim=1).numpy()

In [47]:
# Pick one held-out review and its true label for a spot check.
test_sample = test_df['Text'].iloc[100]
original_label = test_df['Sentiment'].iloc[100]

print(test_sample)
print(original_label)

# Same encoding settings as the dataset class. Calling the tokenizer directly
# replaces the deprecated `encode_plus`.
encodings = tokenizer(
    test_sample,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Make sure dropout is off even if this cell is run right after training.
model.eval()
with torch.no_grad():
    # Move the inputs to the model's device instead of moving the model to the CPU.
    preds = model(encodings['input_ids'].to(device), encodings['attention_mask'].to(device))
    preds = torch.argmax(preds, dim=1)
    output = preds.item()
    # Printed in the same 0 / 1 encoding as `original_label` above (it was
    # previously shifted by +1, which matched no label in this notebook).
    print(output)

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def evaluate_roc(probs, y_true):
    """Report AUC and accuracy for binary predictions and draw the ROC curve.

    Args:
        probs (np.array): predicted probabilities, shape (len(y_true), 2);
            column 1 is used as the positive-class score.
        y_true (np.array): true labels, shape (len(y_true),).

    Prints the AUC and the accuracy at a 0.5 threshold, then shows the plot.
    Returns nothing.
    """
    positive_scores = probs[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')

    # Hard labels: positive when the positive-class score reaches 0.5.
    y_pred = (positive_scores >= 0.5).astype(int)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')

    # ROC curve in blue, chance diagonal in dashed red.
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.show()

In [None]:
# Score the validation probabilities against the held-out labels. The rows line
# up because `valid_loader` iterates `test_df` without shuffling.
evaluate_roc(probs, test_df['Sentiment'])