In [1]:
import csv
import re
import statistics
import random

In [2]:
# task-2 is a subset of task-1
# Each file is read into a list of dicts keyed by the CSV header.
# newline='' is what the csv module asks for: it lets the reader handle line
# endings itself, so quoted fields containing newlines are parsed correctly.
with open('data/task-1/train.csv', newline='') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
with open('data/task-1/dev.csv', newline='') as f:
    reader = csv.DictReader(f)
    dev_rows = list(reader)

In [33]:
def get_vocabulary(rows):
    """Return the set of distinct tokens found in ``rows``.

    For every row, the ``original`` text has its ``<`` and ``/>`` marker
    characters removed and is split on whitespace; the row's ``edit`` value
    is added as one entry (it is not split).

    Args:
        rows: iterable of mappings with ``'original'`` and ``'edit'`` keys.

    Returns:
        A ``set`` of strings.
    """
    def row_tokens(entry):
        # Strip the marker characters so the marked word counts as a token.
        cleaned = entry['original'].replace('<', '').replace('/>', '')
        return cleaned.split() + [entry['edit']]

    return {token for entry in rows for token in row_tokens(entry)}

In [39]:
# Vocabulary of the dev rows only: headline tokens plus edit words.
voc2 = get_vocabulary(dev_rows)

In [28]:
def get_edited_text(row):
    """Return the headline with each ``<word/>`` marker replaced by the edit.

    Args:
        row: mapping with ``'original'`` (text containing a ``<.../>``
            marker) and ``'edit'`` (the replacement text).

    Returns:
        The text of ``row['original']`` with every ``<.../>`` span replaced
        by ``row['edit']``, inserted literally. Text without a marker is
        returned unchanged.
    """
    replacement = row['edit']
    # A callable replacement is inserted verbatim. Passing the string itself
    # would make re.sub treat it as a template, so an edit containing a
    # backslash (e.g. "\1" or "\n") would be expanded or raise re.error.
    # The non-greedy ".+?" stops at the first "/>" so that two markers in one
    # headline do not swallow the text between them.
    return re.sub(r"<.+?/>", lambda _match: replacement, row['original'])

In [3]:
def get_std(row):
    """Population standard deviation of the grades recorded for ``row``.

    ``row['grades']`` is iterated element by element and each element is
    converted with ``int`` (for a string such as ``'33332'`` that means one
    grade per character).

    Returns:
        The result of ``statistics.pstdev`` over those integers.
    """
    scores = list(map(int, row['grades']))
    return statistics.pstdev(scores)

In [4]:
# Rank rows by mean grade, highest first; among rows with the same mean, the
# ones with the smallest grade spread come first (reverse=True on -std).
# meanGrade comes out of the CSV as a string, so it is converted to float to
# compare numerically rather than lexicographically.
sorted_rows = sorted(
    rows,
    key=lambda row: (float(row['meanGrade']), -get_std(row)),
    reverse=True,
)

In [35]:
def get_subset_glove(rows, glove_path='glove.840B.300d.txt',
                     out_path='data/glove.trimmed.300d.txt'):
    """Write the vocabulary of ``rows`` and the GloVe lines that cover it.

    Two files are produced:

    * ``data/vocab.txt`` - one vocabulary entry per line, sorted.
    * ``out_path`` - every line of ``glove_path`` whose first
      space-separated field is in the vocabulary, copied verbatim.

    Args:
        rows: rows passed to ``get_vocabulary``.
        glove_path: path of the full GloVe text file to filter.
        out_path: destination of the trimmed file. Defaults to the path that
            ``build_emb_index`` reads, so the two steps chain without moving
            the file by hand.
    """
    from tqdm import tqdm

    vocab = get_vocabulary(rows)
    with open('data/vocab.txt', 'w') as f:
        # Set iteration order changes between interpreter runs; sorting keeps
        # the file identical when the vocabulary is.
        for word in sorted(vocab):
            f.write(word)
            f.write('\n')
    with open(glove_path) as f, open(out_path, 'w') as of:
        for line in tqdm(f):
            # Only the first field is needed to decide, so split once.
            word = line.split(' ', 1)[0]
            if word in vocab:
                of.write(line)

In [41]:
# Trim GloVe to the combined vocabulary of the train and dev rows.
get_subset_glove(rows + dev_rows)

2196017it [00:13, 162644.94it/s]


In [31]:
# train-test split: 80% of the loaded training rows go to data/training.csv,
# the remaining 20% to data/dev.csv.
# A copy is shuffled with a dedicated, seeded generator so that `rows` keeps
# its file order and re-running this cell reproduces the same split.
SPLIT_SEED = 42
shuffled_rows = list(rows)
random.Random(SPLIT_SEED).shuffle(shuffled_rows)
n = len(shuffled_rows)
split_at = int(n * 0.8)
training_set = shuffled_rows[:split_at]
dev_set = shuffled_rows[split_at:]

# Take the header from the full list so an empty partition still gets one.
split_fieldnames = list(shuffled_rows[0].keys())
for fn, set_rows in zip(['data/training.csv', 'data/dev.csv'], [training_set, dev_set]):
    # newline='' lets the csv writer control line endings itself.
    with open(fn, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=split_fieldnames)
        writer.writeheader()
        writer.writerows(set_rows)


In [42]:
def build_emb_index(emb_path='data/glove.trimmed.300d.txt',
                    out_path='data/glove.pkl', dim=300):
    """Convert a GloVe-style text file into a pickled ``{word: tensor}`` dict.

    Each non-blank line is expected to hold a token followed by ``dim``
    space-separated numbers. The numbers are taken from the right-hand end
    of the line, so a token that itself contains spaces is kept whole rather
    than being mistaken for the start of the vector.

    Args:
        emb_path: text file to read.
        out_path: pickle file to write.
        dim: number of vector components per line.

    Raises:
        ValueError: if a non-blank line does not contain a token plus
            ``dim`` values, or a value is not a valid float.
    """
    import pickle

    import torch

    glove_embeddings = {}
    with open(emb_path) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split at most `dim` times from the right: everything left of
            # the vector is the token, spaces included.
            fields = line.rsplit(' ', dim)
            if len(fields) != dim + 1:
                raise ValueError(
                    f"{emb_path}:{line_no}: expected a token followed by "
                    f"{dim} values"
                )
            word, values = fields[0], fields[1:]
            # Build the float32 vector in one call instead of assigning the
            # components one by one.
            glove_embeddings[word] = torch.tensor(
                [float(value) for value in values], dtype=torch.float
            )
    with open(out_path, 'wb') as f:
        pickle.dump(glove_embeddings, f)

In [43]:
# Parse data/glove.trimmed.300d.txt and pickle the word -> tensor dict to data/glove.pkl.
build_emb_index()

In [6]:
# Show the ten top-ranked rows (highest meanGrade; ties ordered by smallest grade spread).
sorted_rows[:10]

[OrderedDict([('id', '3404'),
              ('original',
               "President Trump 's first year <anniversary/> report card , with grades from A + to F"),
              ('edit', 'Kindergarten'),
              ('grades', '33333'),
              ('meanGrade', '3.0')]),
 OrderedDict([('id', '1229'),
              ('original',
               "Recent Scandals Highlight Trump 's Chaotic <Management/> Style"),
              ('edit', 'Fashion'),
              ('grades', '33333'),
              ('meanGrade', '3.0')]),
 OrderedDict([('id', '1664'),
              ('original',
               'What if <Sociologists/> Had as Much Influence as Economists ?'),
              ('edit', 'donkeys'),
              ('grades', '33332'),
              ('meanGrade', '2.8')]),
 OrderedDict([('id', '9933'),
              ('original',
               'Mitch McConnell thinks tax reform will take longer than Trump <claimed/> '),
              ('edit', 'Haircut'),
              ('grades', '33332'),
             