__Objective__: Calculate error bands for estimates inferred from BERT on the CreateDebate dataset

__Runtime__: GPU

In [None]:
!nvidia-smi

In [None]:
# Mount Google Drive under /content/gdrive; force_remount=True is passed so
# the drive is mounted again even when a mount already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!pip install transformers 
!pip install datasets

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm 
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib import colors

import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from datasets import load_metric

# Loading CMV dataset

In [None]:
def read_dataset(dir):
    """Read (text, label) pairs from a two-column ``label,text`` file.

    The first line is treated as a header and skipped. Every other non-blank
    line is split on its FIRST comma only, so the text part may itself
    contain commas. The label ``'AH'`` is mapped to ``1``; any other label is
    mapped to ``0``.

    Args:
        dir: Path of the file to read (opened as UTF-8 text).

    Returns:
        A list of ``(text, label)`` tuples in file order, with ``label`` being
        ``0`` or ``1``.

    Raises:
        ValueError: If a non-blank data line contains no comma at all.
    """
    texts_labels = []
    with open(dir, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line (no-op for an empty file)
        for line_no, raw_line in enumerate(f, start=2):
            raw_line = raw_line.strip()
            if not raw_line:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            label, sep, text = raw_line.partition(",")
            if not sep:
                raise ValueError(
                    f"{dir}:{line_no}: expected 'label,text' but found no comma"
                )
            texts_labels.append((text.strip(), 1 if label.strip() == 'AH' else 0))
    return texts_labels

In [None]:
# pool the train and test CSV files into a single list of (text, label) pairs
dataset = []
for csv_path in ('/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv',
                 '/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv'):
    dataset.extend(read_dataset(csv_path))

# Training models

In [None]:
def split_and_merge(dataset):
    """Interleave positive and negative samples of ``dataset``.

    Samples are split by the truthiness of their label. The result alternates
    positive, negative, positive, ... for as long as both groups have samples
    left, then appends whatever remains of the longer group. The relative
    order inside each group is preserved.
    """
    positives = []
    negatives = []
    for text, label in dataset:
        bucket = positives if label else negatives
        bucket.append((text, label))

    merged = []
    for pos_sample, neg_sample in zip(positives, negatives):
        merged.append(pos_sample)
        merged.append(neg_sample)

    # at most one of these two tails is non-empty
    n_paired = min(len(positives), len(negatives))
    merged.extend(positives[n_paired:])
    merged.extend(negatives[n_paired:])
    return merged

In [None]:
# interleaving positive and negative samples (deterministic, no random shuffle)
dataset = split_and_merge(dataset)

# creating folds: n_folds contiguous slices of equal length; the trailing
# len(dataset) % n_folds samples are not assigned to any fold
n_folds = 10
fold_length = len(dataset) // n_folds
folds = []
for fold_id in range(n_folds):
    start = fold_id * fold_length
    folds.append(dataset[start: start + fold_length])

In [None]:
# token budget per text, passed as max_length whenever the tokenizer is called
max_seq_length = 64
# tokenizer matching the 'bert-base-uncased' checkpoint, used to build encodings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# split every fold of (text, label) pairs into two parallel lists
folds_text = [[sample_text for sample_text, _ in fold] for fold in folds]
folds_label = [[sample_label for _, sample_label in fold] for fold in folds]

In [None]:
# generating encodings
folds_encoding = [] 
for i in range(n_folds):
    folds_encoding.append(tokenizer(folds_text[i], truncation=True, max_length=max_seq_length, padding="max_length"))

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing per-sample encodings with their labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample values (indexable by sample)
        # labels: per-sample labels, aligned with the encodings
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
train_dataset = []
test_dataset = []

# one (train, test) pair per fold: fold `held_out` is the test split and all
# remaining folds, in order, are concatenated into the training split
for held_out in tqdm(range(n_folds)):
    train_folds = [k for k in range(n_folds) if k != held_out]
    cur_text = [text for k in train_folds for text in folds_text[k]]
    cur_label = [label for k in train_folds for label in folds_label[k]]
    cur_encoding = tokenizer(cur_text, truncation=True, max_length=max_seq_length, padding="max_length")
    train_dataset.append(CustomDataset(cur_encoding, cur_label))
    test_dataset.append(CustomDataset(folds_encoding[held_out], folds_label[held_out]))

In [None]:
# use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def get_model(fold_id, epochs=3, batch_size=64, lr=5e-5):
    """Fine-tune a fresh 'bert-base-uncased' classifier on one training split.

    Args:
        fold_id: Index into ``train_dataset``; that entry holds every fold
            except ``fold_id``, which is the held-out one.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size of the training loader.
        lr: Learning rate handed to the optimizer.

    Returns:
        The fine-tuned model, placed on ``device`` and switched to evaluation
        mode so that dropout is disabled when it is used for inference.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset[fold_id], batch_size=batch_size, shuffle=True)
    # NOTE(review): transformers.AdamW is deprecated in recent transformers
    # releases in favour of torch.optim.AdamW - confirm against the installed
    # version before upgrading.
    optim = AdamW(model.parameters(), lr=lr)

    for _ in range(epochs):
        for batch in tqdm(train_loader):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]  # first output is the loss because labels were passed
            loss.backward()
            optim.step()

    # Leave training mode before handing the model out: a model object that is
    # still in train mode keeps dropout active during inference.
    model.eval()
    return model

# Loading CreateDebate dataset

In [None]:
!git clone https://github.com/utkarsh512/CreateDebateScraper.git

In [None]:
%cd CreateDebateScraper/src/nested/

In [None]:
from thread import Thread, Comment # from CreateDebateScraper
import pickle
from copy import deepcopy

In [None]:
# topical forums on CreateDebate
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# topical forums we're interested in!
categories_selected = ['politics2', 'religion', 'world', 'science', 'law', 'technology']
categories_labels = ['politics', 'religion', 'world', 'science', 'law', 'technology']

# one (initially empty) list of comments per selected forum
comments = {forum: [] for forum in categories_selected}

In [None]:
# loading dataset from drive for interesting topical forums
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this cell at log files that come from a trusted source.
for cat in tqdm(categories_selected):
    cat_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # threads.log is a stream of pickled thread objects: keep loading until
    # the end of the file is hit. `with` closes the handle on every exit path.
    threads = list()
    with open(cat_dir + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass  # end of the pickle stream reached
    #print(f'{cat} - {len(threads)}')

    # group all comments of this forum by their author
    authors = dict()
    for thread in threads:
        for v in thread.comments.values():
            authors.setdefault(v.author, []).append(v)

    # cws / ctr are only needed by the score attachment that is commented out
    # below; they are still loaded/maintained as in the original cell.
    with open(cat_dir + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            # copy the attribute dict so tagging does not touch the original object
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            #foo['score'] = cws[ctr][0]
            #foo['validation'] = cws[ctr][1][0]
            comments[cat].append(foo)
            ctr += 1

In [None]:
# peek at the fields stored for a comment (first entry of the 'politics2' forum)
comments['politics2'][0].keys()

In [None]:
# flatten the comment bodies of all selected forums, in forum order
texts = [
    entry['body']
    for forum in categories_selected
    for entry in comments[forum]
]

# Inferring class labels

In [None]:
class InferDataset(torch.utils.data.Dataset):
    """Thin dataset wrapper exposing a sequence of raw texts by index."""

    def __init__(self, texts):
        # kept as-is; items are returned without any processing
        self.texts = texts

    def __len__(self):
        """Number of wrapped texts."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        text = self.texts[idx]
        return text

In [None]:
# wrap the collected comment bodies in a dataset object for batched inference
in_data = InferDataset(texts)

In [None]:
# fine-tune a model on the training split with index 9 (i.e. fold 9 is held out)
model = get_model(fold_id=9)

In [None]:
# text-classification pipeline around the fine-tuned model. The device index
# follows the same availability check used for `device`: GPU 0 when CUDA is
# available, otherwise -1 (CPU), so this cell also runs on a CPU-only runtime.
pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer,
                device=0 if torch.cuda.is_available() else -1)

In [None]:
def classify(text, **kwargs):
    """Run the text-classification pipeline ``pipe`` on ``text``.

    By default inputs are truncated to ``max_seq_length`` tokens, the same
    limit used when the encodings were built. Every keyword argument is
    forwarded to ``pipe``; passing ``max_length`` or ``truncation``
    explicitly overrides these defaults instead of raising a
    duplicate-keyword ``TypeError``.

    Args:
        text: Input accepted by ``pipe`` (the notebook passes a dataset of texts).
        **kwargs: Extra pipeline call arguments, e.g. ``batch_size``.

    Returns:
        Whatever ``pipe`` returns for the given input.
    """
    kwargs.setdefault('max_length', max_seq_length)
    kwargs.setdefault('truncation', True)
    return pipe(text, **kwargs)

In [None]:
# run batched inference over every comment, collecting outputs in input order
lbl = []
predictions = tqdm(classify(in_data, batch_size=128), total=len(in_data))
lbl.extend(predictions)

# persist the predictions of this run to drive
with open('/content/gdrive/MyDrive/DL/CreateDebate/errorband/label9.log', 'wb') as out_file:
    pickle.dump(lbl, out_file)

In [None]:
# number of predictions collected
len(lbl)