In [1]:
!pip install -qq transformers

[K     |████████████████████████████████| 2.6 MB 9.4 MB/s 
[K     |████████████████████████████████| 636 kB 50.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 65.4 MB/s 
[K     |████████████████████████████████| 895 kB 53.3 MB/s 
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification
import torch

import numpy as np
import pandas as pd
from tqdm import tqdm 
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from tqdm.auto import tqdm

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

%matplotlib inline

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
path = '/content/drive/MyDrive/aidea_sentiment_analysis'
imdb_data = pd.read_csv(f'{path}/data/processed_train.csv')
imdb_test_data = pd.read_csv(f'{path}/data/processed_test.csv')

materials = pd.read_csv(f'{path}/materials/processed_IMDB_Dataset.csv')

In [5]:
imdb_data

Unnamed: 0,ID,review,sentiment,processed_review
0,41411,I watched this film because I'm a big fan of R...,0,watch film m big fan river phoenix joaquin pho...
1,37586,It does not seem that this movie managed to pl...,1,movie manage lot people see place bump acciden...
2,6017,"Enough is not a bad movie , just mediocre .",0,bad movie mediocre
3,44656,my friend and i rented this one a few nights a...,0,friend rent night ago single good movie see me...
4,38711,"Just about everything in this movie is wrong, ...",0,movie wrong wrong wrong mike myers example s r...
...,...,...,...,...
29336,8019,It 's one of the most honest films ever made a...,1,s honest film hollywood
29337,453,An absorbing and unsettling psychological drama .,1,absorb unsettling psychological drama
29338,13097,"Soylent Green IS...a really good movie, actual...",1,soylent green good movie actually ve think don...
29339,26896,There just isn't enough here. There a few funn...,0,isn funny spot disappoint love stupid movie ex...


In [6]:
imdb_test_data

Unnamed: 0,ID,review,processed_review
0,22622,Robert Lansing plays a scientist experimenting...,robert lansing play scientist experiment pass ...
1,10162,"Well I've enjoy this movie, even though someti...",ve enjoy movie turn stereotypical situation nt...
2,17468,First things first - though I believe Joel Sch...,thing believe joel schumacher well mediocre di...
3,42579,I watched this movie on the grounds that Amber...,watch movie ground amber benson rock nick stah...
4,701,A certain sexiness underlines even the dullest...,certain sexiness underline dull tangent
...,...,...,...
29336,30370,It is difficult to rate a writer/director's fi...,difficult rate writer director s effort movie ...
29337,18654,"After watching this movie once, it quickly bec...",watch movie quickly favorite different event h...
29338,47985,"Even though i sat and watched the whole thing,...",sit watch thing good place big chunk informati...
29339,9866,Warning Spoilers following. Superb recreation ...,warn spoiler follow superb recreation base ant...


In [7]:
materials

Unnamed: 0,review,sentiment,processed_review
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production. the filming tec...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"petter mattei's ""love in the time of money"" is..."
...,...,...,...
49995,I thought this movie did a down right good job...,1,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,0,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,i'm going to have to disagree with the previou...


### Some preprocessing

In [8]:
# filtered weird sentiment data
imdb_data = imdb_data[~imdb_data['sentiment'].str.contains(r'[^0-9]')]

In [9]:
imdb_data['sentiment'] = imdb_data['sentiment'].astype(np.int8)
imdb_data['processed_review'] = imdb_data['processed_review'].astype(str)

imdb_test_data['processed_review'] = imdb_test_data['processed_review'].astype(str)

materials['sentiment'] = materials['sentiment'].astype(np.int8)
materials['processed_review'] = materials['processed_review'].astype(str)

imdb_data.reset_index(drop=True, inplace=True)

materials.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Load fine-tuned model

In [10]:
MAX_LEN = 512

num_labels = len(materials['sentiment'].unique())

class_names = [0, 1]

In [13]:
# config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
# bert_classifier = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                                 config=config)
# # for param in bert_classifier.base_model.parameters():
# #   param.requires_grad = False

# bert_classifier.to(device)
from transformers import BertConfig, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(n_classes, epochs):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    # bert_classifier = BertClassifier(n_classes, freeze_bert=True)
    
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
    bert_classifier = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                                    config=config)
    for param in bert_classifier.base_model.parameters():
      param.requires_grad = False

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    
                      eps=1e-5,
                      correct_bias=False    
                      )

    loss_fn = nn.CrossEntropyLoss().to(device)

    bert_classifier = bert_classifier.to(device)
    loss_fn = loss_fn.to(device)
    
    return bert_classifier, optimizer, loss_fn

In [14]:
bert_classifier, optimizer, loss_fn = initialize_model(num_labels, epochs=4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
# get predictions for test data
# path = 'saved_weights.pt'
p = path + '/model/best_model_state.bin'

# bert_classifier.load_state_dict(torch.load(p, map_location=device))
bert_classifier.load_state_dict(torch.load(p))

RuntimeError: ignored

In [None]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Inference by model

In [None]:
# example
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)
output = bert_classifier(input_ids, attention_mask)

class_names = [0, 1]

prediction = torch.max(output.logits, dim=-1)

print(f'Prediction : {prediction}')
print(f'Review text: {s}')
print(f'Sentiment  : {class_names[prediction.indices]}')

In [None]:
def model_inference(bert_classifier, text_lst, max_len, class_names):

  result = []

  for txt in tqdm(text_lst):

    # encode text
    encoded_sent = tokenizer.encode_plus(
        text = txt,  # Preprocess sentence
        add_special_tokens = True,        # Add `[CLS]` and `[SEP]`
        truncation = True,               # Truncate string
        max_length = max_len,      # Max length to truncate/pad
        padding = 'max_length',         # Pad sentence to max length
        return_tensors = 'pt',           # Return PyTorch tensor
        return_attention_mask = True      # Return attention mask
        )
    
    input_ids = encoded_sent['input_ids'].to(device)
    attention_mask = encoded_sent['attention_mask'].to(device)

    output = bert_classifier(input_ids, attention_mask)
    
    prediction = torch.max(output.logits, dim=-1)

    result.append(class_names[prediction.indices])

  return result

In [None]:
%%time
train_text = imdb_data['processed_review'].tolist()

imdb_data['pred_sentiment'] = model_inference(bert_classifier=bert_classifier, text_lst=train_text, max_len=MAX_LEN, class_names=class_names)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

imdb_data['pred_sentiment'] = imdb_data['pred_sentiment'].astype(np.int8)

print(accuracy_score(imdb_data['sentiment'], imdb_data['pred_sentiment']))

In [None]:
print(classification_report(imdb_data['sentiment'], imdb_data['pred_sentiment']))

In [None]:
%%time
test_text = imdb_test_data['processed_review'].tolist()

imdb_test_data['pred_sentiment'] = model_inference(bert_classifier=bert_classifier, text_lst=test_text, max_len=MAX_LEN, class_names=class_names)

In [None]:
submission = imdb_test_data[['ID', 'pred_sentiment']]
submission.rename({'pred_sentiment':'sentiment'}, axis=1,inplace=True)
submission.to_csv(f'{path}/data/raw_baseline_submission.csv', index=0)

In [None]:
submission