In [1]:
import pandas as pd
import numpy as np
import nltk
import torch.nn as nn
import torch
import torch.functional as F
import torch.optim as optim

In [24]:
# Build the training frame for subtask B from the raw OLID TSV.
# Resulting columns keep the original order, with `tweet` -> `label`,
# `subtask_a` -> `alpha` (constant 'a') and `subtask_c` -> `text`.
olid_raw = pd.read_csv("./OLID/olid-training-v1.0.tsv", sep="\t")

# Encode the labels with a vectorised map rather than chained indexing
# (`df[col][mask] = value`), which triggers SettingWithCopyWarning and does
# nothing at all under pandas copy-on-write. Values other than 'UNT'/'TIN'
# (including missing ones) become NaN and are filtered out below.
label_codes = olid_raw['subtask_b'].map({'UNT': 0, 'TIN': 1})
is_labelled = label_codes.notna()

df = olid_raw.loc[is_labelled].copy()
# Move the tweet text into `subtask_c` before `tweet` is overwritten with the label.
df['subtask_c'] = df['tweet']
df['tweet'] = label_codes[is_labelled].astype(int)
df = df.drop(columns=['subtask_b'])
df = df.rename(columns={"tweet": "label", "subtask_a": "alpha", "subtask_c": "text"})
df['alpha'] = 'a'
df = df.reset_index(drop=True)

In [3]:
# Write the prepared frame as a tab-separated file without header row or index column.
df.to_csv(
    "./OLID/train.tsv",
    sep='\t',
    header=False,
    index=False,
)

In [4]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForPreTraining
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *
import convert_examples_to_features

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [5]:
# ---- Paths and model selection ----
DATA_DIR = "./OLID"                      # folder holding the OLID files
BERT_MODEL = 'bert-base-uncased'         # pretrained checkpoint name
TASK_NAME = 'taskb'                      # used to derive the output/report folders
OUTPUT_DIR = 'outputs/{}/'.format(TASK_NAME)
REPORTS_DIR = 'reports/{}_evaluation_report/'.format(TASK_NAME)
CACHE_DIR = './uncased_bert/'            # local cache for downloaded weights

# ---- Hyper-parameters ----
MAX_SEQ_LENGTH = 50
TRAIN_BATCH_SIZE = 40
TEST_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
WARMUP_PROPORTION = 0.1
#OUTPUT_MODE = 'classification'




In [25]:
import emoji 
import wordsegment
import codecs
wordsegment.load()


def _segment_rows(rows):
    """Split raw TSV lines into field lists and word-segment the second field.

    Each line is split on tabs; field 1 is passed through
    ``wordsegment.segment`` and re-joined so that every word is preceded by a
    single space (the same string format the notebook has always produced).
    Lines with fewer than two fields (e.g. blank lines) are skipped.

    Args:
        rows: iterable of tab-separated text lines.

    Returns:
        list of field lists, one per retained line.
    """
    prepared = []
    for row in rows:
        fields = row.split('\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing to segment.
            continue
        # Always yields a string, even when segmentation returns no words.
        fields[1] = "".join(" " + word for word in wordsegment.segment(fields[1]))
        prepared.append(fields)
    return prepared


# Read the raw file once and close the handle deterministically.
with codecs.open("./OLID/olid-training-v1.0.tsv", "r", encoding='utf-8', errors='ignore') as fp:
    data = fp.read()
examples = emoji.demojize(str(data))

# 80/20 split on the raw line list (header included, as before), computed once.
lines = examples.split('\n')
split_at = int(0.8 * len(lines))
train = lines[:split_at]
test = lines[split_at:]

# Only the train slice starts with the header row, so only it skips index 0.
# The previous `range(1, len - 1)` loops also dropped the last train row and
# the first test row; blank trailing lines are now filtered in the helper.
# Per-row printing was removed to avoid flooding the cell output.
train_examples = _segment_rows(train[1:])
test_examples = _segment_rows(test)

In [26]:
# Prepare the per-example argument tuples consumed by the feature converter.
processor = BinaryClassificationProcessor()

label_list = processor.get_labels()
num_labels = len(label_list)
# Label string -> integer id, in the order the processor reports them.
label_map = {label: i for i, label in enumerate(label_list)}

train_examples_len = len(train_examples)
test_examples_len = len(test_examples)

# Full batches per epoch times the number of epochs.
num_train_optimization_steps = (train_examples_len // TRAIN_BATCH_SIZE) * NUM_TRAIN_EPOCHS

# Use the config constants rather than repeating the model name / cache dir literals.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, CACHE_DIR)

train_examples_for_processing = [
    (example, label_map, MAX_SEQ_LENGTH, tokenizer) for example in train_examples
]

# NOTE(review): test examples are given twice the training sequence length.
# This is kept as-is, but confirm it is intentional -- the model sees
# different padding lengths at train and test time.
test_examples_for_processing = [
    (example, label_map, MAX_SEQ_LENGTH*2, tokenizer) for example in test_examples
]

In [14]:
# Leave one core free, but never go below one worker: on a single-core host
# `cpu_count() - 1` is 0 and `Pool(0)` raises ValueError.
process_count = max(1, cpu_count() - 1)

In [27]:
def _featurize_in_parallel(job_args):
    """Convert one list of argument tuples to features using a fresh worker pool.

    Results are returned in input order (``imap`` preserves ordering).
    """
    with Pool(process_count) as workers:
        converted = workers.imap(
            convert_examples_to_features.convert_example_to_feature, job_args
        )
        return list(converted)


train_features = _featurize_in_parallel(train_examples_for_processing)
test_features = _featurize_in_parallel(test_examples_for_processing)

In [28]:
# Load the pretrained encoder from the configured cache directory and move it
# to the selected device. `Module.to` returns the module itself, so assigning
# the result keeps `bert` bound to the same object while avoiding a dump of
# the full model repr as the cell output.
bert = BertModel.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR)
bert = bert.to(device)

In [18]:
class cnn(nn.Module):
    """Multi-width convolutional head applied to a 4-D feature map.

    Four parallel ``Conv2d`` branches (kernel sizes 1-4) each produce
    ``n_filters`` channels, followed by a softmax over the last axis. Their
    outputs are concatenated on the channel axis, passed position-wise through
    two ``Linear + Tanh`` layers, and finally summed over the last axis.

    Args:
        in_channels: number of input channels (default 768).
        n_filters: output channels of each convolution branch (default 200).
        hidden: width of the intermediate linear layer (default 128).
        n_classes: size of the output channel axis (default 2).

    Shapes:
        input  ``(batch, in_channels, H, W)``
        output ``(batch, n_classes, H)``
    """

    def __init__(self, in_channels=768, n_filters=200, hidden=128, n_classes=2):
        super().__init__()
        # Attribute names and creation order are kept so existing state dicts
        # and seeded initialisation remain compatible.
        self.layer1=nn.Sequential(nn.Conv2d(in_channels, n_filters, kernel_size=1, padding=0), nn.Softmax(dim=3))
        self.layer2=nn.Sequential(nn.Conv2d(in_channels, n_filters, kernel_size=2, padding=1), nn.Softmax(dim=3))
        self.layer3=nn.Sequential(nn.Conv2d(in_channels, n_filters, kernel_size=3, padding=1), nn.Softmax(dim=3))
        self.layer4=nn.Sequential(nn.Conv2d(in_channels, n_filters, kernel_size=4, padding=2), nn.Softmax(dim=3))

        #self.dropout1=nn.Dropout(0.1)
        # The four branches are concatenated, hence 4 * n_filters input features.
        self.lin1=nn.Sequential(nn.Linear(4 * n_filters, hidden),nn.Tanh())
        #self.dropout2=nn.Dropout(0.2)
        self.lin2=nn.Sequential(nn.Linear(hidden, n_classes),nn.Tanh())

    def forward(self, x):
        """Run the head on ``x`` of shape ``(batch, in_channels, H, W)``.

        Returns a tensor of shape ``(batch, n_classes, H)``.
        """
        out1 = self.layer1(x)
        # Even kernel sizes with this padding yield (H+1, W+1); drop the last
        # row/column so every branch has the same spatial size.
        out2 = self.layer2(x)[:, :, :-1, :-1]
        out3 = self.layer3(x)
        out4 = self.layer4(x)[:, :, :-1, :-1]

        # (batch, 4 * n_filters, H, W)
        out5 = torch.cat((out1, out2, out3, out4), dim=1)

        # Move channels last so the linear layers act on the feature axis:
        # (batch, W, H, 4 * n_filters)
        out5 = out5.permute(0, 3, 2, 1)

        out6 = self.lin1(out5)
        out7 = self.lin2(out6)

        # Back to channels-first: (batch, n_classes, H, W)
        out7 = out7.permute(0, 3, 2, 1)

        # Collapse the last axis -> (batch, n_classes, H)
        out8 = torch.sum(out7, dim=3)

        return out8

In [10]:
# Instantiate the convolutional head on the same device as the encoder;
# otherwise inputs moved to `device` cannot be fed through it on a GPU run.
CNN = cnn().to(device)

In [19]:
# Optimise the encoder and the convolutional head jointly with a single Adam instance.
# NOTE(review): the learning rate here is a literal 1e-4; the LEARNING_RATE
# constant defined in the config cell is not used -- confirm which is intended.
trainable_params = [*bert.parameters(), *CNN.parameters()]
optimizer = optim.Adam(trainable_params, lr=1e-4)

In [29]:
# Stack each per-example training feature attribute into one int64 tensor.
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor([getattr(feature, field) for feature in train_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)

In [30]:
# Stack each per-example test feature attribute into one int64 tensor.
all_input_ids_test, all_input_mask_test, all_segment_ids_test, all_label_ids_test = (
    torch.tensor([getattr(feature, field) for feature in test_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)

In [31]:
# Wrap the training tensors in a dataset and draw shuffled mini-batches from it.
train_data = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_label_ids,
)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=TRAIN_BATCH_SIZE,
    sampler=train_sampler,
)

In [None]:
# Wrap the test tensors in a dataset. Evaluation iterates in the original
# order (SequentialSampler) so runs are reproducible and any per-example
# outputs stay aligned with `test_examples`; shuffling is only useful for training.
test_data = TensorDataset(all_input_ids_test, all_input_mask_test, all_segment_ids_test, all_label_ids_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

In [32]:
# Fine-tune the encoder (`bert`) and the convolutional head (`CNN`).
#
# The same `bert` and `CNN` objects whose parameters were handed to
# `optimizer` must be used here. Re-creating them inside the loop (as before)
# resets the weights on every step and leaves the optimizer updating
# parameters that are never used, so nothing is learned. `Module.to` moves
# parameters in place, so the optimizer's references stay valid.
bert.to(device)
CNN.to(device)
bert.train()
CNN.train()

loss_fct = CrossEntropyLoss()

for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):

    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)

        # Assumes the first output of `bert` is a sequence of per-layer hidden
        # states shaped (batch, seq, hidden) -- TODO confirm against the
        # installed pytorch_pretrained_bert version.
        encoded_layers = bert(input_ids, segment_ids, input_mask)[0]
        # Stack on a new leading axis, then reorder so that the former last
        # axis becomes the channel axis expected by the CNN:
        # (layers, batch, seq, hidden) -> (batch, hidden, seq, layers).
        inpu = torch.stack(encoded_layers, dim=0).permute(1, 3, 2, 0)

        outp = CNN(inpu)
        # Collapse the remaining axis to obtain one score per class and example.
        x = outp.sum(dim=2)

        loss = loss_fct(x, label_ids.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("\r%f" % loss.item(), end='')

In [None]:
# Evaluate the trained encoder (`bert`) and head (`CNN`) on the test loader
# and report the mean per-batch cross-entropy in `eval_loss`.
#
# The existing `bert` and `CNN` objects are used as-is; loading a fresh
# pretrained BERT and a randomly initialised CNN for every batch (as before)
# would measure an untrained model.
bert.to(device)
CNN.to(device)
bert.eval()
CNN.eval()

eval_loss = 0
nb_eval_steps = 0
loss_fct = CrossEntropyLoss()

for input_ids_test, input_mask_test, segment_ids_test, label_ids_test in tqdm_notebook(test_dataloader, desc="Evaluating"):
    input_ids = input_ids_test.to(device)
    input_mask = input_mask_test.to(device)
    segment_ids = segment_ids_test.to(device)
    label_ids = label_ids_test.to(device)

    with torch.no_grad():
        # Assumes the first output of `bert` is a sequence of per-layer hidden
        # states shaped (batch, seq, hidden) -- TODO confirm.
        encoded_layers = bert(input_ids, segment_ids, input_mask)[0]
        # (layers, batch, seq, hidden) -> (batch, hidden, seq, layers)
        inpu = torch.stack(encoded_layers, dim=0).permute(1, 3, 2, 0)
        outp = CNN(inpu)
        x = outp.sum(dim=2)
        tmp_eval_loss = loss_fct(x, label_ids.view(-1))

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1

# Guard against an empty loader instead of raising ZeroDivisionError.
eval_loss = eval_loss / nb_eval_steps if nb_eval_steps else float('nan')