# Install and import Hugging Face Transformers

In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 2.3MB 24.9MB/s 
[K     |████████████████████████████████| 3.3MB 41.2MB/s 
[K     |████████████████████████████████| 901kB 45.4MB/s 
[?25h

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from tqdm.auto import tqdm

In [None]:
from torch import cuda

# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

# Prepare data

## Load preprocessed text data

In [None]:
# Path of the preprocessed review CSV that the rest of the notebook works from.
opspam_csv = '/content/drive/Shareddrives/TCSS_555_Spring/opspam.csv'
df = pd.read_csv(opspam_csv)

In [None]:
# Show the loaded frame as this cell's output (a quick visual check of the columns).
df

Unnamed: 0,text,is_truthful,polarity,hotel_name,source,fold,filename,target_labels
0,"A recent stay at the James Hotel-Chicago, reve...",0,negative,james,MTurk,1,d_james_16.txt,"[0, 0]"
1,James Chicago; the luxurious nice hotel as it ...,0,negative,james,MTurk,1,d_james_14.txt,"[0, 0]"
2,I stayed this hotel for 2 nights. I had high h...,0,negative,sofitel,MTurk,1,d_sofitel_19.txt,"[0, 0]"
3,I stayed at the Hilton Chicago back in July. F...,0,negative,hilton,MTurk,1,d_hilton_3.txt,"[0, 0]"
4,"While visiting the Chicago area, we chose the ...",0,negative,monaco,MTurk,1,d_monaco_19.txt,"[0, 0]"
...,...,...,...,...,...,...,...,...
1595,We arrived for a weekend stay and really enjoy...,1,positive,amalfi,TripAdvisor,5,t_amalfi_13.txt,"[1, 1]"
1596,We stayed here through Hotwire and got an amaz...,1,positive,intercontinental,TripAdvisor,5,t_intercontinental_9.txt,"[1, 1]"
1597,Stayed here for two days while attending Lollo...,1,positive,intercontinental,TripAdvisor,5,t_intercontinental_6.txt,"[1, 1]"
1598,We have stayed here several times and have alw...,1,positive,allegro,TripAdvisor,5,t_allegro_19.txt,"[1, 1]"


## Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Review bodies are the inputs; the `is_truthful` column supplies the labels.
texts = df['text'].tolist()
labels = df['is_truthful'].tolist()

# Hold out 20% of the reviews for validation, with a fixed seed for the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Example counts per split; the training loop uses them as accuracy denominators.
train_size, val_size = len(train_texts), len(val_texts)

## Use the pretrained DistilBERT tokenizer to tokenize the plain review text

In [None]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Both splits are tokenized with the same options: truncation and padding enabled.
tokenize_kwargs = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
# test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Establishing Model and DataLoader

## define some hyperparameters

In [None]:
# Whitespace-separated word count of every review in the full dataset.
num_words = []
for review_text in df['text']:
    num_words.append(len(review_text.split()))

# Summary statistics of the review lengths (in words, not tokens).
print('mean words', np.mean(num_words))
print('95% percentile', np.percentile(num_words, 95))

mean words 148.775
95% percentile 312.0


In [None]:
# Hyperparameters for the BERT-based setup.
# NOTE(review): the DistilBERT training cell further down hard-codes its own
# batch size, learning rate and epoch count instead of reading these. Confirm
# whether they are still needed.
MAX_LEN = 300
EPOCHS = 1
LEARNING_RATE = 1e-5
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

# NOTE(review): this rebinds `tokenizer`, which was previously the DistilBERT
# tokenizer, to the 'bert-base-uncased' tokenizer. Any cell run after this one
# that calls `tokenizer` gets the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Customized Dataset

In [None]:
import torch

class OPDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to an
            indexable collection holding one entry per example.
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field, plus the example's
        label under the key ``'labels'``.
        """
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Pair each split's encodings with its labels so a DataLoader can index them.
train_dataset = OPDataset(encodings=train_encodings, labels=train_labels)
val_dataset = OPDataset(encodings=val_encodings, labels=val_labels)
# test_dataset = OPDataset(test_encodings, test_labels)

## Design the BERT classifier model
Adds two linear layers and a dropout layer on top of BERT to produce the model's final output (the dropout layer is defined but currently commented out in `forward`).

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder with a small feed-forward classification head.

    The head feeds the encoder's ``pooler_output`` through ``Linear(768, 256)``,
    a ReLU, and ``Linear(256, 2)``. A ``Dropout(0.5)`` module is registered as
    ``l2``, but ``forward`` does not apply it.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l1 = torch.nn.Linear(768, 256)
        self.l2 = torch.nn.Dropout(0.5)
        self.l3 = torch.nn.Linear(256, 2)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and the head, returning the output of ``l3``.

        Args:
            ids: Passed to the encoder as its first positional argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.
        """
        encoded = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        hidden = self.l1(encoded.pooler_output)
        hidden = torch.nn.functional.relu(hidden)
        # self.l2 (dropout) is not applied in this forward pass.
        return self.l3(hidden)

# Build the classifier and move it to the selected device.
model = BERTClass().to(device)

# Freeze every parameter of the BERT encoder, leaving only the layers added
# on top of it (l1, l3) trainable.
for param in model.bert.parameters():
    param.requires_grad_(False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Loss function and optimizer

In [None]:
# Cross-entropy loss for the classifier's logits.
loss_fn = torch.nn.CrossEntropyLoss()

# Adam over all of the model's parameters, at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training, fine tuning and validation

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

# Rebind `device` as a torch.device: the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebind `model` to a DistilBERT sequence classifier loaded from the
# 'distilbert-base-uncased' checkpoint. This replaces the BERTClass instance
# created earlier in the notebook.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Batch the tokenized datasets. Training batches are reshuffled every epoch.
# Validation metrics do not depend on batch order, so that loader is not shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)


optim = AdamW(model.parameters(), lr=5e-5)

for epoch in tqdm(range(15)):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    correct = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing `labels` makes the model return its loss as the first output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

        running_loss += loss.item()
        predictions = outputs.logits.argmax(-1)
        correct += (predictions == labels).float().sum()

    print(f"epoch:{epoch}")
    # Average the accumulated per-batch losses over the number of batches.
    # The original divided by the size of the last batch, which is unrelated
    # to how many loss values were summed.
    print("Loss:", running_loss / len(train_loader))
    accuracy = 100 * correct / train_size
    print("Training accuracy:", accuracy.item())

    # ---- Validation pass ----
    model.eval()

    # Start the accumulators from zero so the validation loss does not
    # include the training loss accumulated above.
    running_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]

            running_loss += loss.item()
            predictions = outputs.logits.argmax(-1)
            correct += (predictions == labels).float().sum()

    print("Loss:", running_loss / len(validation_dataloader))
    accuracy = 100 * correct / val_size
    print("validation accuracy:", accuracy.item())
    
    

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

Loss: 2.537247434258461
Training accuracy: 73.984375
Loss: 2.8886683946475387
validation accuracy: 87.8125
Loss: 1.2810057518072426
Training accuracy: 90.3125
Loss: 1.6491889224853367
validation accuracy: 88.4375
Loss: 0.5988691329839639
Training accuracy: 95.9375
Loss: 1.0961511474451981
validation accuracy: 85.3125
Loss: 0.32771822542417794
Training accuracy: 97.8125
Loss: 0.7478442791907582
validation accuracy: 90.3125
Loss: 0.2737857249449007
Training accuracy: 97.65625
Loss: 1.1428481279290281
validation accuracy: 81.5625
Loss: 0.10916148671822157
Training accuracy: 99.140625
Loss: 0.635468909444171
validation accuracy: 88.4375
Loss: 0.047823757835431024
Training accuracy: 99.609375
Loss: 0.595845603824273
validation accuracy: 91.25
Loss: 0.02403394487555488
Training accuracy: 99.765625
Loss: 0.6291664650307212
validation accuracy: 91.5625
Loss: 0.008856312922944198
Training accuracy: 100.0
Loss: 1.1425521569890407
validation accuracy: 84.6875
Loss: 0.005612382355138834
Training a