In [1]:
# Mount Google Drive at /content/drive. Later cells read the dialogue data, the saved
# model weights and the helper module from /content/drive/MyDrive/DA_project.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# list the GPU(s) attached to this runtime (device name and UUID)
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-228c4fa0-cd06-65aa-7e44-36be7871bc68)


In [3]:
# Install Hugging Face transformers into the Colab runtime.
# NOTE(review): the version is not pinned; the install log of this cell shows 4.3.3 was
# resolved when the notebook was run — consider pinning to keep results reproducible.
!pip install transformers
import nltk
# Download the NLTK 'punkt' resource.
# NOTE(review): no visible cell calls nltk directly — presumably required by the helper
# module imported from Drive; confirm before removing.
nltk.download('punkt')
from transformers import DistilBertForSequenceClassification,DistilBertTokenizer
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, TensorDataset,DataLoader
from sklearn.model_selection import train_test_split
import torch
from drive.MyDrive.DA_project.helper_functions_colab import get_paths_for_en_episodes
from drive.MyDrive.DA_project.helper_functions_colab import dialogue_json_to_pandas
from statistics import mode
from tqdm import tqdm
import pickle
import time
from os import listdir

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 12.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 61.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 59.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=63a6e5c352

In [4]:
# Choose where tensors and the model live: the GPU when CUDA is available, else the CPU.
# Running this notebook on a GPU is highly recommended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# Build DistilBERT for sequence classification with a single output (num_labels=1), move it
# to `device`, and load the matching tokenizer; both start from 'distilbert-base-uncased'.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=1).to(device=device)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Overwrite the freshly initialised weights with the saved best checkpoint from Drive.
# map_location=device remaps the stored tensors onto the device selected earlier, so a
# checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load('/content/drive/MyDrive/DA_project/DistilBERT_best_model.pt', map_location=device))
# Switch to evaluation mode (dropout off) for inference. As the last expression of the
# cell, this call also displays the model architecture.
model.eval()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Collect the file names found in the dialogue data folder and order them alphabetically.
# These are bare names, not full paths; the folder prefix is added again when reading.
dialogue_paths = listdir('/content/drive/MyDrive/DA_project/podcast_data_no_audio/data')
dialogue_paths.sort()
len(dialogue_paths)

In [None]:
# Build one (TensorDataset, DataLoader) pair per dialogue file.
#
# For every file in `dialogue_paths` the tab-separated table is read, its `text` column is
# tokenized, padded to the longest utterance of that dialogue, and wrapped in a DataLoader.
# Tensors are kept on the CPU; the inference loop moves each batch to `device`, so GPU
# memory only ever holds one batch instead of the whole corpus.
DATA_DIR = '/content/drive/MyDrive/DA_project/podcast_data_no_audio/data/'
MIN_UTTERANCES = 5   # shorter dialogues are usually monologues and are skipped
MAX_TOKENS = 512     # the model has 512 position embeddings, longer inputs cannot be encoded
BATCH_SIZE = 16

pad_id = tokenizer.pad_token_id

inference_sets = []
inference_paths = []    # file name behind each entry of inference_sets, in the same order
unreadable_paths = []   # (file name, error) for files that could not be read

for dialogue_path in tqdm(dialogue_paths):
  try:
    df = pd.read_csv(DATA_DIR + dialogue_path, sep='\t')
  except Exception as err:
    # Best effort: an unreadable file is skipped, but it is recorded instead of vanishing
    # silently, and Ctrl-C (KeyboardInterrupt) is no longer swallowed.
    unreadable_paths.append((dialogue_path, repr(err)))
    continue

  # skip short dialogues (which are usually monologues)
  if len(df) < MIN_UTTERANCES:
    continue

  # Missing utterances become empty strings so that row i of the predictions still
  # corresponds to row i of the file.
  utterances = df.text.fillna('').astype(str)

  # Let the tokenizer truncate: unlike slicing the id list afterwards, this keeps the
  # closing [SEP] token on utterances longer than MAX_TOKENS.
  tokenized_utterances = [
      tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=MAX_TOKENS)
      for text in utterances
  ]

  # Pad every utterance to the longest one in this dialogue. The attention mask is derived
  # from the true lengths rather than from "id != pad_id".
  max_len = max(map(len, tokenized_utterances))
  padded_utterances = np.full((len(tokenized_utterances), max_len), pad_id, dtype=np.int64)
  attention_masked_utterances = np.zeros_like(padded_utterances)
  for row, token_ids in enumerate(tokenized_utterances):
    padded_utterances[row, :len(token_ids)] = token_ids
    attention_masked_utterances[row, :len(token_ids)] = 1

  # Convert dialogue into tensors and dataset (order preserved: shuffle=False)
  X_inference = torch.tensor(padded_utterances)
  X_inference_attention = torch.tensor(attention_masked_utterances)
  inference_dataset = TensorDataset(X_inference, X_inference_attention)
  inference_loader = DataLoader(inference_dataset, batch_size=BATCH_SIZE, shuffle=False)

  inference_sets.append((inference_dataset, inference_loader))
  inference_paths.append(dialogue_path)

print('\nTotal dialogues preprocessed:', len(inference_sets))
if unreadable_paths:
  print('Files skipped because they could not be read:', len(unreadable_paths))

In [None]:
def sigmoid(x):
  """Return the element-wise logistic sigmoid 1 / (1 + exp(-x)) of the output logits.

  Computed as exp(-log(1 + exp(-x))) through np.logaddexp, so very negative logits
  give 0.0 without an overflow warning from exp().

  Args:
    x: a number or NumPy array of logits.

  Returns:
    Values in [0, 1] with the same shape as ``x``.
  """
  return np.exp(-np.logaddexp(0.0, -x))

In [None]:
# Classify each utterance in dialogues.
# For every dialogue the model is run batch by batch, the single output logit per utterance
# is turned into a probability with sigmoid(), and probabilities above 0.5 become label 1
# (otherwise 0). The result is one list of int labels per dialogue.
dialogue_sentiments = []
for dataset, loader in tqdm(inference_sets):
  preds = np.zeros([len(dataset), 1])
  filled = 0  # number of rows of `preds` written so far

  # Inference only: no_grad() avoids building an autograd graph for every batch.
  with torch.no_grad():
    for x_batch, x_mask in loader:
      outputs = model(x_batch.to(device), attention_mask=x_mask.to(device))

      y_pred = sigmoid(outputs[0].cpu().numpy())

      # Advance by the actual batch length instead of a hard-coded batch size, so this
      # stays correct if the DataLoader batch size changes and for the final partial batch.
      preds[filled:filled + len(y_pred), :] = y_pred
      filled += len(y_pred)

  # Vectorised threshold; .tolist() yields plain Python ints (1 or 0).
  pred_labels = (preds[:, 0] > 0.5).astype(int).tolist()

  dialogue_sentiments.append(pred_labels)


# Save results with pickle. NOTE: despite the .txt extension the file is binary pickle
# data and must be read back with pickle.load(open(..., "rb")).
with open("/content/drive/MyDrive/DA_project/results.txt", "wb") as fp:   #Pickling
  pickle.dump(dialogue_sentiments, fp)