## Load Data

In [None]:
# Colab-only: mount Google Drive at /content/drive so the paths under
# "/content/drive/My Drive/" used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pandas as pd

In [None]:
from pandas.io.json import json_normalize

# Read the training JSON; the context manager closes the file handle even if
# parsing fails.
with open('/content/drive/My Drive/friends_train.json') as f:
    data = json.load(f)

# Each element of `data` becomes its own DataFrame. ignore_index=True gives the
# concatenated frame one unique 0..N-1 index instead of repeating each element's
# own 0-based row labels, so label-based lookups return a single row.
train_set = pd.concat([pd.DataFrame(x) for x in data], ignore_index=True)

# Keep only the two columns the classifier needs, under shorter names.
train_df = pd.DataFrame({'text': train_set.utterance, 'label': train_set.emotion})
train_df.shape

(10561, 2)

## Text Pre-processing

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 20.5MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 53.0MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 58.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |█

In [None]:
import torch
from transformers import BertTokenizer

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

We will use the GPU: Tesla T4


In [None]:
# Map every distinct emotion string to an integer id, numbered by its position
# in the array returned by unique().
new_labels = train_df.label.unique()
label_dict = {label: index for index, label in enumerate(new_labels)}
label_dict

{'anger': 6,
 'disgust': 7,
 'fear': 2,
 'joy': 4,
 'neutral': 0,
 'non-neutral': 3,
 'sadness': 5,
 'surprise': 1}

In [None]:
# map() looks each emotion string up in label_dict and yields an integer column
# directly. replace() only reaches an integer dtype through implicit downcasting
# of the object column, which newer pandas releases deprecate.
train_df['newlabel'] = train_df['label'].map(label_dict)
train_df.head(20)

Unnamed: 0,text,label,newlabel
0,also I was the point person on my companys tr...,neutral,0
1,You mustve had your hands full.,neutral,0
2,That I did. That I did.,neutral,0
3,So lets talk a little bit about your duties.,neutral,0
4,My duties? All right.,surprise,1
5,"Now youll be heading a whole division, so you...",neutral,0
6,I see.,neutral,0
7,But therell be perhaps 30 people under you so...,neutral,0
8,Good to know.,neutral,0
9,We can go into detail,neutral,0


In [None]:
# NumPy arrays of the utterance strings and of their integer-encoded emotions.
sentences = train_df['text'].values
labels = train_df['newlabel'].values

## BERT Tokenizer

In [None]:
# Load the pretrained tokenizer for the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Use positional access: train_df's index is not unique, so the label-based
# train_df.text[1] returns every row labelled 1 (a whole Series) instead of the
# single second utterance.
print("Original: ", train_df.text.iloc[1])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Original:  1                     You mustve had your hands full.
1    Hey-hey-hey. You wanna hear something that sucks.
1                                                  Hi!
1                   You liked it? You really liked it?
1    What?! What is with everybody? Its Thanksgivi...
                           ...                        
1                  Bing! You got those numbers for me?
1               But you found the keys to his clothes?
1                                                 Hey!
1        Umm, slight change of plans. We've shut down.
1                           Yeah, I forget which ones.
Name: text, Length: 720, dtype: object


In [None]:
# Show how the first utterance is split into tokens.
example_tokens = tokenizer.tokenize(sentences[0])
print('Tokenized: ', example_tokens)

Tokenized:  ['also', 'i', 'was', 'the', 'point', 'person', 'on', 'my', 'company', '##s', 'transition', 'from', 'the', 'k', '##l', '-', '5', 'to', 'gr', '-', '6', 'system', '.']


In [None]:
# Same utterance, with each token converted to its vocabulary id.
example_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0]))
print('Token IDs: ', example_token_ids)

Token IDs:  [2036, 1045, 2001, 1996, 2391, 2711, 2006, 2026, 2194, 2015, 6653, 2013, 1996, 1047, 2140, 1011, 1019, 2000, 24665, 1011, 1020, 2291, 1012]


In [None]:
# Length of the longest encoded utterance, special tokens included. This is
# used to sanity-check the max_length passed to the tokenizer.
# A generator keeps the per-sentence ids local, so this cell no longer
# overwrites the notebook-level `input_ids` (which holds the encoded tensor)
# with a plain Python list when it is re-run.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in sentences),
    default=0,  # same result as the original `max_len = 0` start for empty input
)
print('Max sentence length:', max_len)

Max sentence length: 95


In [None]:
# Encode every utterance in one call and return PyTorch tensors.
# - truncation=True makes the max_length cap explicit instead of relying on the
#   tokenizer's fallback behaviour (and its warning) when only max_length is given.
# - padding=True pads to the longest sequence in this call, not to max_length.
# - .tolist() hands the tokenizer a plain list of strings rather than an ndarray.
encoded_dict = tokenizer.batch_encode_plus(
    train_df.text.values.tolist(),
    add_special_tokens=True,
    max_length=120,
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt',
)
    

In [None]:
# Pull the model inputs out of the tokenizer output and turn the integer labels
# into a tensor.
input_ids, attention_masks = encoded_dict['input_ids'], encoded_dict['attention_mask']
labels = torch.tensor(train_df['newlabel'].values)

In [None]:
# Number of encoded examples (first dimension of the id tensor).
input_ids.shape[0]

10561

## Training and Validation Split

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle ids, masks and labels so that indexing the dataset returns the three
# tensors for the same example together.
dataset = TensorDataset(
    input_ids,
    attention_masks,
    labels,
)

In [None]:
# 90% of the examples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split with its own generator. The global seeds are only set later in
# the training cell, so without this the train/validation membership changes on
# every run and validation scores are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

9,504 training samples
1,057 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 

In [None]:
batch_size = 16

# Training batches are drawn through a random sampler.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read through a sequential sampler.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

## BERT for Sequence Classification

In [None]:
from transformers import BertForSequenceClassification,AdamW, BertConfig

# Pretrained BERT encoder with a classification head on top.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    # One output per emotion class; derived from the label mapping so the head
    # cannot drift out of sync with the data.
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the device selected earlier. Unlike model.cuda(), this also
# works when the notebook falls back to the CPU.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Optimizer and Learning Rate

In [None]:
# AdamW over all model parameters, with explicit learning rate and epsilon.
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 3

# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) x (number of epochs).
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## Model Training

In [None]:
import numpy as np
from sklearn.metrics import f1_score

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: array of per-class scores; the predicted class is the argmax
            over axis 1.
        labels: array of integer class ids, flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / len(expected)

def f1_score_func(preds, labels):
    """Return the weighted F1 score of argmax predictions against the labels.

    Args:
        preds: array of per-class scores; the predicted class is the argmax
            over axis 1.
        labels: array of integer class ids, flattened before scoring.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    return f1_score(expected, predicted, average='weighted')

In [None]:
def evaluate(validation_dataloader):
    """Run the notebook-level `model` over a dataloader without computing gradients.

    Relies on the notebook-level `model` and `device`.

    Args:
        validation_dataloader: sized iterable of batches; each batch is indexed
            as (input ids, attention mask, labels) tensors.

    Returns:
        A tuple (avg_val_loss, predictions, true_labels):
        the mean of the per-batch losses, the logits of every batch concatenated
        along axis 0 as a NumPy array, and the labels concatenated the same way.
        Accuracy or F1 can be derived from the last two with flat_accuracy /
        f1_score_func.
    """
    model.eval()

    total_eval_loss = 0
    predictions, true_labels = [], []

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # With labels supplied, the first two outputs are the loss and the logits.
        loss, logits = outputs[0], outputs[1]
        total_eval_loss += loss.item()

        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

    # The previous version also accumulated an accuracy figure here, but did so
    # after the loop (so it only covered the last batch) and never returned it.
    # It has been dropped; callers compute metrics from the returned arrays.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    return avg_val_loss, predictions, true_labels

In [None]:
import random

seed_val = 42

# Seed every RNG the training loop touches.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# One dict per epoch with the training loss and the validation metrics.
training_stats = []

for epoch_i in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_train_loss = 0
    model.train()

    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Index the output instead of tuple-unpacking it, as evaluate() does.
        # Indexing works both for plain tuples and for the dict-like output
        # objects returned by newer transformers releases.
        loss = outputs[0]

        total_train_loss += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # Validate after every epoch, then report and record the result. Previously
    # these values were computed and then discarded.
    val_loss, predictions, true_labels = evaluate(validation_dataloader)
    val_f1 = f1_score_func(predictions, true_labels)

    print("Validation loss: {0:.2f}".format(val_loss))
    print("Validation F1 (weighted): {0:.2f}".format(val_f1))

    training_stats.append({
        'epoch': epoch_i + 1,
        'train_loss': avg_train_loss,
        'val_loss': val_loss,
        'val_f1': val_f1,
    })


Training...
Average training loss: 1.26

Training...
Average training loss: 0.99

Training...
Average training loss: 0.81


## Model Evaluation

In [None]:
# Score the fine-tuned model on the validation split; the loss is not needed here.
_, predictions, true_labels = evaluate(validation_dataloader)
val_accuracy = flat_accuracy(predictions, true_labels)
val_accuracy

0.6064333017975402

In [None]:
# List the parameter and buffer names that a state-dict checkpoint will contain.
model_state = model.state_dict()
model_state.keys()

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh

## Save Model

In [None]:
# Save only the weights (state dict). Build the path from model_save_name so
# the file name is defined in exactly one place.
model_save_name = 'tone-analyzer.pt'
path = f"/content/drive/My Drive/{model_save_name}"
torch.save(model.state_dict(), path)


In [None]:
# Save the whole model object. Build the path from model_save_name so the file
# name is defined in exactly one place.
# NOTE(review): torch.save on a full module pickles it, so loading needs the same
# class definitions (library version) to be importable and should only be done
# with trusted files. The state-dict checkpoint is the more portable one.
model_save_name = 'entire-tone-analyzer.pt'
path = f"/content/drive/My Drive/{model_save_name}"
torch.save(model, path)

In [None]:
# model_save_name = 'tone-analyzer.pt'
# path = F"/content/drive/My Drive/tone-analyzer.pt" 
# model.load_state_dict(torch.load(path))