In [1]:
import os
import re
import sys
import pathlib
import numpy as np
from csv import reader
from stanfordcorenlp import StanfordCoreNLP
#from pytorch_pretrained_bert.tokenization import BertTokenizer

In [18]:
def pre_process_text(text):
    '''
    Lowercase, TOkenize (Stanford CoreNLP)
    '''
    text = text.lower()
    #result = nlp.word_tokenize(text)
    result = text.split(" ")
    return result


def bert_embedding(text):
    '''
    Get bert tokenized sentences
    '''
    return tokenizer.encode(tokenizer.tokenize(text))


def glove_cove_embedding(text):
    '''
    Get Glove_Cove Embedding
    '''
    #tokens = [nlp.word_tokenize(sentence) for sentence in text]
    tokens = [sentence.split(" ") for sentence in text]
    glove_embed = glove_model.encode(tokens)
    cove_embed = cove_model.encode(tokens)
    result = np.concatenate(glove_embed,cove_embed, axis=2)
    return result




def load_data(folder= "Subtask-A", filename="SubtaskA_EvaluationData_labeled.csv"):
    '''
    Args : folder name, file name
    Ret : return data loaded into list of lists [id, string, labels] 
    '''
    #prefix = str(pathlib.Path(__file__).parent.parent)
    #path = os.path.join(prefix,"data", folder, filename)
    path = "C:\\Users\\bhara\\Downloads\\NNNlpHW3\\suggestionMining\\data\\Subtask-A\\SubtaskA_EvaluationData.csv"
    f = open(path,'r', encoding="utf-8")
    data_reader = reader(f, delimiter=",")
    data = [row for row in data_reader]
    f.close()
    return data


def pre_process_data_from_dataset(data):
    '''
    Args : data is list of lists [id, string, label]
    Ret: list of features, labels, id_map (i.e index to id mapping)
    '''
    ids = [ datum[0]+",\""+datum[1]+"\"" for datum in data]
    id_map = {k:v for k,v in enumerate(ids)}

    labels = [datum[2] for datum in data]

    bert_feats = [bert_embedding(datum[1]) for datum in data]
    glove_cove_feats = glove_cove_embedding([datum[1] for datum in data])
    return glove_cove_feats, bert_feats, labels, id_map


def create_folds(data, folds=10):
    '''
    Split data into 'folds' number of batches

    Args : data, list of lists of form [id, string, label]
            folds, number of batches of data to be created
    Rets : data batched into 'fold' lists, each wich is in turn list of lists [id, string, label]
    '''
    data_size = len(data)
    batch_size = int(data_size/folds)
    data_batch = []
    last_index = 0
    for i in range(folds-1):
        batch = data[i * batch_size: (i+1)*batch_size]
        data_batch.append(batch)
    data_batch.append(data[(folds-1) * batch_size:])
    return data_batch


def create_cross_val_train_test(data_batches,id, folds=10):
    '''
    Create test set from batched data, where test set while batch[id]
    and train set will everything else

    Args : data_batches, data that is split into 'fold' number of groups
            id, index of batch to be made as test_set
            folds, number of batches the data is split into
    Rets :
    '''
    train = []
    test = data_batches[id]
    for i in range(folds):
        if i != id:
            train += data_batches[i]
    return train, test

In [20]:
data = load_data()
data_folds = create_folds(data)
for datum in data_folds:
    print(len(datum))
print(data_folds[0][0])
train, test = create_cross_val_train_test(data_folds,0)
print("Length of train and test: ", len(train),len(test))

#feats, bert, labels, id_map = pre_process_data_from_dataset(train)
#print(feats[0])
#print(bert[0])

83
83
83
83
83
83
83
83
83
86
['9566', 'This would enable live traffic aware apps.', 'X']
Length of train and test:  750 83


In [21]:
data

[['9566', 'This would enable live traffic aware apps.', 'X'],
 ['9569',
  'Please try other formatting like bold italics shadow to distinguish titles/subtitles from content.',
  'X'],
 ['9576',
  'Since computers were invented to save time I suggest we be allowed to upload them all in one zip file - using numbering for the file names and the portal could place them in the right order.',
  'X'],
 ['9577', 'Allow rearranging if the user wants to change them!', 'X'],
 ['9579',
  'Add SIMD instructions for better use of ARM NEON instructions for math and games.',
  'X'],
 ['9580',
  'Also using a hot swapping code generator (optimized machine code like HotSpot in Java does) would also be very helpful to mitigate slow C# code.',
  'X'],
 ['9587',
  'Microsoft should seriously look into getting rid of Symantec for all these payment stuff.',
  'X'],
 ['9599',
  'I would be extremely useful in a variety of app types to be notified upon call started and ended events at least be able to check wh

In [1]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

from cove import MTLSTM


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [2]:
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
#LABEL = data.Field(lower=True, include_lengths=True, batch_first=True)


In [None]:
LABEL = data.Field(sequential=False, use_vocab=False)

TEXT_BERT = data.Field(
    use_vocab=False,
    batch_first=True,
    pad_token=tokenizer.pad_token_id,
    tokenize=tokenizer.encode
)

In [7]:
train_path = "C:\\Users\\bhara\\Downloads\\NNNlpHW3\\suggestionMining\\data\\Subtask-A\\V1.4_Training.csv"
"""
train = data.TabularDataset(
        path=train_path, format='csv',
        fields=[('id', None),
                ('sentence', TEXT),
                 ('label', LABEL)])
"""

In [None]:
train = data.TabularDataset(
        path=train_path, format='csv',
        skip_header = False,
        fields={'sentence':[('sentence',TEXT),('bert_enc',TEXT_BERT)],
                'label':('label',LABEL)
                })

In [34]:
#TEXT.vocab.freqs

Counter({'"please': 226,
         'enable': 47,
         'removing': 14,
         'language': 61,
         'code': 156,
         'from': 628,
         'the': 7249,
         'dev': 97,
         'center': 63,
         '"language': 3,
         'history"': 2,
         'for': 1875,
         'example': 119,
         'if': 673,
         'you': 845,
         'ever': 20,
         'selected': 41,
         '"ru"': 18,
         'and': 2981,
         '"ru-ru"': 6,
         'laguages': 4,
         'published': 31,
         'this': 1033,
         'xap': 18,
         'to': 5973,
         'store': 255,
         'then': 247,
         'it': 1625,
         'causes': 27,
         'tile': 61,
         'localization': 14,
         'show': 118,
         'en-us(default)': 4,
         'which': 383,
         'is': 2309,
         'bad."': 4,
         '"note:': 4,
         'in': 2239,
         'your': 219,
         '.csproj': 1,
         'file,': 11,
         'there': 314,
         'a': 3296,
         'supportedcu

In [8]:
TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=300, cache='.embeddings'))
#LABEL.build_vocab(train)
outputs_cove_with_glove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors, residual_embeddings=True, model_cache='.embeddings')
#glove_then_first_then_last_layer_cove = outputs_both_layer_cove_with_glove(<pass a sentence Glove embedding>)

In [15]:
train_iter = data.Iterator(
    (train),
    batch_size=100)

In [33]:
z.sentence[0].vectors

tensor([[ 830,   21,   54,  ...,    1,    1,    1],
        [ 772,   46,    8,  ...,    1,    1,    1],
        [  50,    6,  113,  ...,    1,    1,    1],
        ...,
        [  27,  146,   55,  ...,    1,    1,    1],
        [1148,  206,  192,  ...,    1,    1,    1],
        [ 950,   49,   10,  ...,    1,    1,    1]])

In [24]:
z = None
for batch_idx, batch in enumerate(train_iter):
    z = batch
    glove_then_last_layer_cove = outputs_cove_with_glove(*batch.sentence)
    print(glove_then_last_layer_cove.size())

torch.Size([100, 72, 900])
torch.Size([100, 75, 900])
torch.Size([100, 85, 900])
torch.Size([100, 79, 900])
torch.Size([100, 50, 900])
torch.Size([100, 57, 900])
torch.Size([100, 50, 900])


KeyboardInterrupt: 

In [29]:
z.sentence[0]

tensor([[ 830,   21,   54,  ...,    1,    1,    1],
        [ 772,   46,    8,  ...,    1,    1,    1],
        [  50,    6,  113,  ...,    1,    1,    1],
        ...,
        [  27,  146,   55,  ...,    1,    1,    1],
        [1148,  206,  192,  ...,    1,    1,    1],
        [ 950,   49,   10,  ...,    1,    1,    1]])

In [24]:
train.examples[0]

<torchtext.data.example.Example at 0x23587e9b9e8>

In [18]:
z