In [33]:
import torch

In [34]:
# Copy the gzip-compressed Camera reviews TSV from the amazon-reviews-pds S3 bucket into the current directory.
# Needs the AWS CLI on PATH; the output recorded below shows it was not installed when this cell last ran.
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz .

/bin/bash: aws: command not found


In [243]:
import csv

import pandas as pd

# Load every column as a string from the gzip-compressed, tab-separated dump.
#
# quoting=csv.QUOTE_NONE: the Amazon reviews TSVs are published as tab-separated
# text without quote or escape characters. With pandas' default quoting a stray
# '"' inside a review is treated as the start of a quoted field and swallows the
# following tabs/newlines, which is the likely cause of the
# "expected 15 fields, saw 22" rows that were being skipped (and silently lost).
#
# error_bad_lines=False is kept so any genuinely malformed row is still skipped
# rather than aborting the load. NOTE(review): on pandas >= 1.3 this keyword is
# spelled on_bad_lines="skip"; switch when the environment is upgraded.
df = pd.read_csv(
    "amazon_reviews_us_Camera_v1_00.tsv.gz",
    sep='\t',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
    error_bad_lines=False,
    dtype='str',
)

b'Skipping line 85458: expected 15 fields, saw 22\nSkipping line 91161: expected 15 fields, saw 22\n'
b'Skipping line 166123: expected 15 fields, saw 22\n'
b'Skipping line 225458: expected 15 fields, saw 22\nSkipping line 229936: expected 15 fields, saw 22\nSkipping line 259297: expected 15 fields, saw 22\n'
b'Skipping line 284728: expected 15 fields, saw 22\nSkipping line 286334: expected 15 fields, saw 22\nSkipping line 293400: expected 15 fields, saw 22\nSkipping line 294415: expected 15 fields, saw 22\nSkipping line 308150: expected 15 fields, saw 22\nSkipping line 315022: expected 15 fields, saw 22\nSkipping line 315730: expected 15 fields, saw 22\nSkipping line 316071: expected 15 fields, saw 22\nSkipping line 326729: expected 15 fields, saw 22\n'
b'Skipping line 329101: expected 15 fields, saw 22\nSkipping line 333077: expected 15 fields, saw 22\nSkipping line 377031: expected 15 fields, saw 22\nSkipping line 389496: expected 15 fields, saw 22\nSkipping line 390486: expected 15 

In [247]:
# org_df = df
# df = org_df

In [248]:
# Reduce the raw review table to a binary sentiment dataset:
#   * drop rows with a missing value in any column,
#   * keep reviews whose body is longer than 10 characters,
#   * keep only 1-star and 5-star reviews (ratings are still strings here),
#   * keep just the text and the label, with 5 stars -> 1 and 1 star -> 0.
#
# The filtering is done with a single boolean mask and .loc/.assign so the
# result is a fresh frame; assigning a column onto a filtered slice (as the
# previous version did) triggers pandas' SettingWithCopyWarning.
#
# NOTE: this cell expects the freshly loaded frame (string ratings). Re-running
# it on its own output would match no rows, because the ratings are ints by then.
df = df.dropna()
has_enough_text = df["review_body"].astype(str).str.len() > 10
is_polar_rating = df["star_rating"].isin(["1", "5"])
df = df.loc[has_enough_text & is_polar_rating, ["review_body", "star_rating"]].assign(
    star_rating=lambda frame: frame["star_rating"].eq("5").astype(int)
)
# df = df[:100_000]  # uncomment to work on the first 100,000 rows only
df.head()

Unnamed: 0,review_body,star_rating
1,"Perfect, even sturdier than the original!",5
3,Exactly what I wanted and expected. Perfect fo...,5
4,I will look past the fact that they tricked me...,5
7,Great camera for the price.,5
8,Product is very good and satisfactory.<br /><b...,5


In [256]:
# Class balance of the binary label: 1 = five-star review, 0 = one-star review.
df["star_rating"].value_counts()

1    1020166
0     168733
Name: star_rating, dtype: int64

In [257]:
#BERT
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Tokenizer loaded for the 'bert-base-uncased' checkpoint; do_lower_case=True asks it to lower-case text before tokenizing.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [258]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Sequence of review texts supporting ``len()`` and positional
            ``.iloc[...]`` access (e.g. a pandas Series).
        targets: Class labels aligned positionally with ``reviews``; must have
            the same length.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        max_length: Length every encoded sequence is padded to. Defaults to
            250, the value that used to be hard-coded.

    Raises:
        ValueError: If ``reviews`` and ``targets`` differ in length.
    """

    def __init__(self, reviews, targets, tokenizer, max_length=250):
        if len(reviews) != len(targets):
            raise ValueError(
                "reviews and targets must have the same length, got "
                f"{len(reviews)} and {len(targets)}"
            )
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, flattened token ids / attention mask and label for position ``item``."""
        # Positional access: the Series keep the original DataFrame index after
        # filtering and splitting, so label-based lookup would not line up with
        # the 0..len-1 indices a DataLoader asks for.
        review = self.reviews.iloc[item]
        target = self.targets.iloc[item]

        # NOTE(review): pad_to_max_length is the older spelling of
        # padding='max_length' (usually paired with truncation=True) in newer
        # transformers releases; kept as is to match the installed version.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'review_text': review,
            # return_tensors='pt' yields a leading batch axis of size 1; flatten
            # so the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [259]:
# Hold out 20% of the reviews for evaluation. The split is stratified on the
# label so both parts keep the same positive/negative ratio, and seeded so it
# is reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["star_rating"].values
)

train_ds = ReviewDataset(reviews = df_train["review_body"], targets=df_train["star_rating"], tokenizer=tokenizer)
# shuffle=True: without it every epoch walks the training rows in exactly the
# same order, so each epoch sees identical batches.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=4)

test_ds = ReviewDataset(reviews = df_test["review_body"], targets=df_test["star_rating"], tokenizer=tokenizer)
# Evaluation order does not affect the averaged metrics, so no shuffling here.
test_dl = DataLoader(test_ds, batch_size=16, num_workers=4)


In [260]:

# Training hyper-parameters: number of passes over the training set and the
# optimizer's learning rate.
EPOCHS = 2
LEARNING_RATE = 1e-5

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match their labels.

    The predicted class of each row of ``preds`` is the index of its largest
    value along axis 1; ``labels`` is flattened before the comparison.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def ret_model():
    """Return a new BertForSequenceClassification loaded from 'bert-base-uncased'.

    The model is configured with two labels and with attention and
    hidden-state outputs switched off.
    """
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )

# Fine-tune the classifier for EPOCHS passes over train_dl, reporting the mean
# training loss and the validation accuracy / loss on test_dl after each pass.

# One scheduler step is taken per batch, so the linear decay spans all epochs.
total_steps = len(train_dl) * EPOCHS

# Build the model exactly once and move it to the target device BEFORE creating
# the optimizer. Re-creating the model inside the epoch loop (as was done
# before) left the optimizer holding the parameters of the first, discarded
# model: the model being evaluated was never updated and every epoch restarted
# from the pretrained weights.
model = ret_model()
model.to(device)

# Named `optimizer` rather than `optim` so the `torch.optim` module imported at
# the top of the notebook is not rebound.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(EPOCHS):
    print("running epoch ", epoch)

    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_dl:
        input_ids = batch["input_ids"].to(device)
        input_mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        model.zero_grad()
        outputs = model(input_ids,
                        token_type_ids=None,
                        attention_mask=input_mask,
                        labels=labels)
        # Index instead of tuple-unpacking: this works both when the model
        # returns a plain tuple and when it returns a ModelOutput object.
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dl)
    print("avg_train_loss", avg_train_loss)

    # ---- validation pass ----
    model.eval()
    total_eval_acc = 0
    total_eval_loss = 0

    with torch.no_grad():
        for batch in test_dl:
            # .to(device) rather than .cuda() so the loop also runs on CPU-only machines.
            input_ids = batch["input_ids"].to(device)
            input_mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask,
                            labels=labels)
            loss, logits = outputs[0], outputs[1]

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            total_eval_acc += flat_accuracy(logits, label_ids)

    # Both metrics are means over batches (not over individual samples).
    avg_val_acc = total_eval_acc / len(test_dl)
    print("Val acc", avg_val_acc)
    avg_val_loss = total_eval_loss / len(test_dl)
    print("val loss", avg_val_loss)


running epoch  0
Process Process-162:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/process.py", line 261, in _bootstrap
    util._exit_function()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/util.py", line 322, in _exit_function
    _run_finalizers()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/util.py", line 262, in _run_finalizers
    finalizer()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/queues.py", line 191, in _finalize_join
    thread.join()
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/threading.py", line 1056, in join
    self._wait_for_tstate_lock()
  File "/home/ubuntu/anaconda3/envs/pytorch_

KeyboardInterrupt: 

In [127]:
# Save only the model's state_dict (not the whole module object) to model.pth in the working directory.
torch.save(model.state_dict(), "model.pth")


In [175]:
# Rebuild the architecture and restore the weights saved in model.pth.
model = ret_model()
# map_location="cpu": the checkpoint may have been written from a model living
# on the GPU; remapping lets it load on a CPU-only machine as well.
# load_state_dict then copies the values into this model's own parameters.
model.load_state_dict(torch.load("model.pth", map_location="cpu"))
# Switch to evaluation mode; the trailing semicolon hides the long module repr
# that model.eval() would otherwise display as the cell output.
model.eval();




In [203]:


# Classify a single review with the current `model`.

# Run on whatever device the model's parameters live on, so this cell works
# both right after training (model possibly on the GPU) and after reloading
# the checkpoint onto the CPU.
model_device = next(model.parameters()).device

inputs = tokenizer.encode_plus(
            'Perfect, even sturdier than the original',
            add_special_tokens=True,
            max_length=250,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
ids = inputs["input_ids"].to(model_device)
mask = inputs["attention_mask"].to(model_device)
token_type_ids = inputs["token_type_ids"].to(model_device)

model.eval()
# No gradients are needed for inference; skipping them avoids building the autograd graph.
with torch.no_grad():
    outputs = model(ids, token_type_ids=token_type_ids, attention_mask=mask)

# No labels were passed, so outputs[0] holds the logits. The head has two
# mutually exclusive classes, so softmax (not sigmoid) gives their
# probabilities; the arg-max is the same either way.
preds = torch.softmax(outputs[0], dim=1).cpu().numpy()
# Predicted class per input: 1 = five-star, 0 = one-star.
np.argmax(preds, axis=1)

array([0])

NameError: name 'DEVICE' is not defined