## Environmental Considerations

In [1]:
# !pip install flair
# !pip install tqdm
# !ls -lh


import os
os.chdir("d:\\Thesis\\SemEval15\\SemEval-PIT2015-github\\data")
os.listdir()

['dev.data',
 'SemEval15DataFiles',
 'test.data',
 'test.label',
 'train.data',
 'tweetText.txt',
 'tweetTextFromDevData',
 'tweetTextFromDevData_parsed.txt',
 'tweetTextFromTestData',
 'tweetTextFromTestData_parsed.txt',
 'tweetTextFromTrainData',
 'tweetTextFromTrainData_parsed.txt',
 'tweetText_parsed.txt']

In [2]:
import tqdm
import json
import pandas as pd
import torch
import numpy as np

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentRNNEmbeddings, \
Sentence, ELMoEmbeddings, BertEmbeddings, BytePairEmbeddings


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Lets load the data in

In [3]:
import urllib.request
import zipfile

baseDir = os.getcwd()
dataDir = baseDir + os.sep + "/SemEval15DataFiles/"
os.makedirs(dataDir, exist_ok=True)
os.chdir(dataDir)

url = 'https://github.com/upmangaurav/t2v/raw/master/data/data.zip'
print('Beginning to download the data file ...')
urllib.request.urlretrieve(url, dataDir + 'data.zip')

zip_ref = zipfile.ZipFile('data.zip', 'r')
zip_ref.extractall(dataDir)
print('Extracted downloaded data file...')
zip_ref.close()

!ls -lh
os.listdir()

Beginning to download the data file ...
Extracted downloaded data file...


'ls' is not recognized as an internal or external command,
operable program or batch file.


['data.zip', 'dev.data', 'test.data', 'test.label', 'train.data']

In [4]:
def make_List(file):
    debatableCount = 0
    with open(file, 'r') as f:
        td = f.readlines()

    dataList = []
    print("\nSize of Dataset", file, str(len(td)))
    for item in td:
        trainDict = {}
        splitsie = item.split('\t')

        #For training and dev data:
        if len(splitsie[4]) > 1:
            #debatable if only 2 turkers voted similar
            if splitsie[4][1] == '2':
                debatableCount += 1
                continue
        #Convert Label such as (3, 2) to decimal value like 0.6
            else: trainDict['Label'] = 0.2 * int(splitsie[4][1])

        else: # Test data:
            if splitsie[4] == '2':
                debatableCount += 1
                continue

            else: trainDict['Label'] = 0.2 * int(splitsie[4])

        
        trainDict['Topic_Id'] = splitsie[0]
        trainDict['Topic_Name'] = splitsie[1]
        trainDict['Sent_1'] = splitsie[2]
        trainDict['Sent_2'] = splitsie[3]

        dataList.append(trainDict)
    
    print("Debatable thus ignored tweet-combo count: ", debatableCount)
    print("Final dataset size:", len(dataList))
    return dataList 

## Data Exploration

In [5]:
trainList = make_List('train.data')
testList = make_List('test.data')
devList = make_List('dev.data')
trainList[:5]


Size of Dataset train.data 13063
Debatable thus ignored tweet-combo count:  1533
Final dataset size: 11530

Size of Dataset test.data 972
Debatable thus ignored tweet-combo count:  130
Final dataset size: 842

Size of Dataset dev.data 4727
Debatable thus ignored tweet-combo count:  585
Final dataset size: 4142


[{'Label': 1.0,
  'Topic_Id': '4',
  'Topic_Name': '1st QB',
  'Sent_1': 'EJ Manuel the 1st QB to go in this draft',
  'Sent_2': 'But my bro from the 757 EJ Manuel is the 1st QB gone'},
 {'Label': 1.0,
  'Topic_Id': '4',
  'Topic_Name': '1st QB',
  'Sent_1': 'EJ Manuel the 1st QB to go in this draft',
  'Sent_2': 'Can believe EJ Manuel went as the 1st QB in the draft'},
 {'Label': 0.6000000000000001,
  'Topic_Id': '4',
  'Topic_Name': '1st QB',
  'Sent_1': 'EJ Manuel the 1st QB to go in this draft',
  'Sent_2': 'EJ MANUEL IS THE 1ST QB what'},
 {'Label': 0.8,
  'Topic_Id': '4',
  'Topic_Name': '1st QB',
  'Sent_1': 'EJ Manuel the 1st QB to go in this draft',
  'Sent_2': 'Manuel is the 1st QB to get drafted'},
 {'Label': 1.0,
  'Topic_Id': '4',
  'Topic_Name': '1st QB',
  'Sent_1': 'EJ Manuel the 1st QB to go in this draft',
  'Sent_2': 'My boy EJ Manuel being the 1st QB picked'}]

# Initiate Embeddings

In [6]:
print(torch.cuda.is_available())
torch.cuda.get_device_name(0)

# If need to run on CPU:
# import flair, torch
# flair.device = torch.device('cpu') 

True


'GeForce GTX 1050 Ti'

In [7]:
def embeddingManager(word = True, flair = False, BERT = False, ELMO = False, bytePair = False, etype = 'pool', \
                    rnnType = 'GRU'):
    '''
    Parameters:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
    
    :flair: Forward/Backward combos of multi, multi-fast, news, news-fast and mix
    e.g. 'multi-forward'+'multi-backward', 'news-forward-fast'+'news-backward-fast' etc
    
    :word: glove, twitter, crawl etc
    :BERT: 'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased' and 'bert-large-cased'
    :ELMO: 'small', 'medium' and 'original'
    :etype: 'pool' or 'RNN'
    :rnnType: 'GRU', 'LSTM',  'RNN_TANH' or 'RNN_RELU'
    
    '''

    embeddings = []
    
    if word:
        print("word called up!!")
        if word == True: # Default case
            embeddings.append(WordEmbeddings('twitter'))
        else:
            embeddings.append(WordEmbeddings(word))

    if bytePair:
        embeddings.append(BytePairEmbeddings('en'))
    
    if flair:
        if flair == True:
            print("Flair called up!!")
            flair = 'mix' # Default flavour is mix-forward and mix-backward
        
        flair = [i.lower() for i in flair.split('-') if i.lower() != 'forward' and i.lower() != 'backward']
        if len(flair) > 1:
            embeddings.append(FlairEmbeddings(flair[0] + '-forward-' + (flair[1])))
            embeddings.append(FlairEmbeddings(flair[0] + '-backward-' + (flair[1])))
        else:
            embeddings.append(FlairEmbeddings(flair[0] + '-forward'))
            embeddings.append(FlairEmbeddings(flair[0] + '-backward'))
            
    if BERT:
        if BERT == True: # Default flavour
            print("Bert called up!!")
            embeddings.append(BertEmbeddings('bert-base-cased'))
        else: 
            embeddings.append(BertEmbeddings(BERT))
    if ELMO:
        if ELMO == True: # Default flavour
            embeddings.append(ELMoEmbeddings())
        else:
            embeddings.append(ELMoEmbeddings(ELMO))
    #
    
    if etype == 'RNN':
        document_embeddings = DocumentRNNEmbeddings(embeddings, rnn_type = rnnType, hidden_size = 400)
    else:
        document_embeddings = DocumentPoolEmbeddings(embeddings)        
    return document_embeddings

In [8]:
# # Some HIMYM sentence
# sentence = Sentence('Lets go to the mall, Today! - Robin Sparkles')
# # embed words in sentence 
# embeddingManager().embed(sentence)
# for token in sentence:
#     print(token.embedding)
# # data type and size of embedding 
# print(type(token.embedding))
# # storing size (length) *2 because there'll be concatenation of diff and mult vectors
# n = token.embedding.size()[0] * 2
# n

In [9]:
def generateBinaryLabels(dataList, binaryOrNot):
    if binaryOrNot:
        dataLabels = []
        for couple in dataList:
            if couple['Label'] >= 0.6:
                dataLabels.append(1)
            elif couple['Label'] <= 0.2:
                dataLabels.append(0)
            else:
                print(couple['Label'])
        print("No of labels: ", len(dataLabels))
    return dataLabels

In [10]:
def generateEmbeddings(someList, **kwargs):
    from tqdm import tqdm
    
    document_embeddings = embeddingManager(**kwargs)
    
    sentence = Sentence('Lets go to the mall, Today! - Robin Sparkles')

    # embed words in sentence 
    document_embeddings.embed(sentence)

    n = sentence.embedding.size()[0] * 2
    s = torch.zeros(0,n)
    print(s.size())
    print(n)
    for i in tqdm(range(len(someList))):

        # retrieve the text sentence
        sentence1 = Sentence(someList[i]['Sent_1'])
        sentence2 = Sentence(someList[i]['Sent_2'])

        # embed the sentences with our document embedding
        document_embeddings.embed(sentence1)
        document_embeddings.embed(sentence2)

        # Calculate the element-wise product
        productTensor = sentence1.get_embedding() * sentence2.get_embedding()
        # Calculate the difference
        absDiffTensor = torch.abs(sentence1.get_embedding() - sentence2.get_embedding())

        # Add the concatenated vector as final embedding
        embeddingVector = torch.cat([productTensor, absDiffTensor])
        
        # Adding Document embeddings to list #
        s = torch.cat((s, embeddingVector.view(-1,n)),0)
        
#         torch.cuda.empty_cache()
    
    return s

In [11]:
def masterFunction(dataList, binaryOrNot, **kwargs):
    rawList = make_List(dataList)
    labelList = generateBinaryLabels(rawList, binaryOrNot)
    embeddingList = generateEmbeddings(rawList, **kwargs)
    
    # A litttle reformatting to make the data and labels model-friendly
    labelList = np.array(labelList).reshape(len(labelList), -1)
    embeddingList = embeddingList.detach().numpy()
    
    return embeddingList, labelList

# Training begins

### embeddingManager function Parameters for reference

    :flair: multi, multi-fast, news, news-fast or mix
    To generate Forward/Backward combos e.g. 'multi-forward'+'multi-backward', 'news-forward-fast'+'news-backward-fast' etc

    :word: glove, twitter, crawl etc
    :BERT: 'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased' and 'bert-large-cased'
    :ELMO: 'small', 'medium' and 'original'
    :etype: 'pool' or 'RNN'
    :rnnType: 'GRU', 'LSTM',  'RNN_TANH' or 'RNN_RELU'

    reference: https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md


In [12]:
binThresHold = 0.075
def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(np.int)
    preds = (preds >= binThresHold).astype(np.int)
    return [('f1_score', f1_score(labels, preds))]

In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Only For parameter reference:
embeddingManager()

embeddingsArgs = {
    'flair': True,
#     'BERT': True,
#       'etype':'RNN',
#       'rnnType': 'LSTM',
}

x_train, y_train = masterFunction('train.data', True, **embeddingsArgs)
x_valid, y_valid = masterFunction('dev.data', True, **embeddingsArgs)

### XGBoost compatible data ###
dtrain = xgb.DMatrix(x_train,y_train)
dvalid = xgb.DMatrix(x_valid, label = y_valid)

### defining parameters ###
params = {
          'colsample': 0.9,
          'colsample_bytree': 0.5,
          'eta': 0.1,
          'max_depth': 8,
          'min_child_weight': 6,
          'objective': 'binary:logistic',
          'subsample': 0.9
          }

### Training the model ###
xgb_model = xgb.train(
                      params,
                      dtrain,
                      feval= custom_eval,
                      num_boost_round= 1000,
                      maximize=True,
                      evals=[(dvalid, "Validation")],
                      early_stopping_rounds=30
                      )


word called up!!

Size of Dataset train.data 13063
Debatable thus ignored tweet-combo count:  1533
Final dataset size: 11530
No of labels:  11530
word called up!!
Flair called up!!
torch.Size([0, 8392])
8392


  1%|▎                         | 152/11530 [00:22<34:51,  5.44it/s]

KeyboardInterrupt: 

## Testing

In [None]:
### Reformatting test set for XGB ###
x_test, y_test = masterFunction('test.data', True, **embeddingsArgs)

dtest = xgb.DMatrix(x_test)

### Predicting ###
predict = xgb_model.predict(dtest) # predicting

In [None]:
print(predict[198])
print(y_test[198])

In [None]:
from sklearn.metrics import precision_recall_fscore_support

y_pred = np.array([1 if (predict[i] >= binThresHold) else 0 for i in range(len(predict))])
y_true = np.array(y_test)
score = precision_recall_fscore_support(y_true, y_pred, average='macro')
score