In [1]:
%matplotlib inline
import csv
import logging
import os
import random
import sys
import pickle
import time
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json

In [2]:
# Paths to the three JSONL splits of the dataset variant used by this notebook.
# Exactly one group of assignments should be active at a time.

# Alternative variant (disabled): main_verb_absolute_token_position
# train_file = '../resources/msgs/main_verb_absolute_token_position/train.jsonl'
# test_file = '../resources/msgs/main_verb_absolute_token_position/test.jsonl'
# inoc_file = '../resources/msgs/main_verb_absolute_token_position/inoculating.jsonl'

# Active variant: syntactic_category_relative_position
train_file = '../resources/msgs/syntactic_category_relative_position/train.jsonl'
test_file = '../resources/msgs/syntactic_category_relative_position/test.jsonl'
inoc_file = '../resources/msgs/syntactic_category_relative_position/inoculating.jsonl'

In [3]:
def process_jsonl(filename):
    """Load a JSON Lines file into a list of parsed records.

    Each non-blank line of the file is parsed independently with
    ``json.loads``. Blank or whitespace-only lines (for example a trailing
    newline at the end of the file) are skipped instead of raising.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filename, 'r', encoding='utf-8') as json_file:
        # Stream line by line rather than materialising the whole file first.
        for line in json_file:
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [4]:
# Parse each JSONL split into a list of record dicts.
raw_train_data = process_jsonl(train_file)
raw_dev_test_data = process_jsonl(test_file)

# The test file supplies both evaluation splits: its first half becomes dev,
# the remainder becomes test (file order is kept; nothing is shuffled here).
# NOTE(review): this assumes the file is not ordered in a way that makes the
# two halves differ systematically -- confirm against the data.
raw_dev_data, raw_test_data = (
    raw_dev_test_data[:len(raw_dev_test_data) // 2],
    raw_dev_test_data[len(raw_dev_test_data) // 2:],
)

raw_inoc_data = process_jsonl(inoc_file)

In [5]:
# Show the first training record to see which fields each example carries.
raw_train_data[0]

{'sentence': 'The dentist is singing some melody and every sick niece is in a cafe.',
 'condition': 'training',
 'linguistic_feature_label': 1,
 'surface_feature_label': 1,
 'UID': 'syntactic_category_relative_position',
 'linguistic_feature_type': 'syntactic',
 'linguistic_feature_description': 'Is there an adjective present?',
 'surface_feature_type': 'relative_position',
 'surface_feature_description': "Does the word 'the' precede the word 'a'?",
 'control_paradigm': False,
 'sentenceID': 40000,
 'paradigmID': 5000,
 'split': 'train'}

In [6]:
# Longest sentence, counted in whitespace-separated tokens, for the training
# file and for the unsplit test file (dev + test together).
max([len(d['sentence'].split()) for d in raw_train_data]), max([len(d['sentence'].split()) for d in raw_dev_test_data])

(18, 19)

In [7]:
def _as_example(record):
    """Turn one raw record into a ``(sentence, label, [linguistic, surface])`` tuple."""
    return (
        record['sentence'],
        record['linguistic_feature_label'],
        [record['linguistic_feature_label'], record['surface_feature_label']],
    )


def _select_examples(raw_records, keep_all):
    """Convert all records, or one randomly chosen record per consecutive pair."""
    if keep_all:
        return [_as_example(record) for record in raw_records]
    record_iter = iter(raw_records)
    # zip(it, it) walks the records in non-overlapping consecutive pairs;
    # a trailing unpaired record is dropped.
    return [_as_example(random.choice(pair)) for pair in zip(record_iter, record_iter)]


def create_train_dataset(raw_train_data, raw_inoc_data, inoc_rate, seed, if_full_train_set=False):
    """Build a shuffled training set in which a fraction of examples is inoculation data.

    Examples are first selected from ``raw_train_data`` and ``raw_inoc_data``
    (every record when ``if_full_train_set`` is true, otherwise one random
    record out of each consecutive pair). Then the last
    ``int(len(train) * inoc_rate)`` selected training examples are replaced by
    the same number of examples taken from the end of the selected inoculation
    examples, and the result is shuffled.

    Args:
        raw_train_data: Iterable of record dicts with the keys ``sentence``,
            ``linguistic_feature_label`` and ``surface_feature_label``.
        raw_inoc_data: Iterable of record dicts with the same keys.
        inoc_rate: Fraction of the selected training examples to replace.
        seed: Seed for the global ``random`` generator. ``seed`` is used for
            the training selection and ``seed + 1`` for the inoculation
            selection and the final shuffle.
        if_full_train_set: Keep every record instead of one per pair.

    Returns:
        A shuffled list of ``(sentence, linguistic_label,
        [linguistic_label, surface_label])`` tuples.

    Note:
        This function reseeds the module-level ``random`` generator. If fewer
        inoculation examples are available than requested, all of them are
        used and the returned list is correspondingly shorter.
    """
    random.seed(seed)
    train_data = _select_examples(raw_train_data, if_full_train_set)

    random.seed(seed + 1)
    inoc_data = _select_examples(raw_inoc_data, if_full_train_set)

    num_inoc_ex = int(len(train_data) * inoc_rate)
    # Guard the zero case explicitly: ``train_data[:-0]`` is an empty list and
    # ``inoc_data[-0:]`` is the whole list, so an unguarded slice would swap
    # the entire training set for inoculation data instead of replacing nothing.
    if num_inoc_ex > 0:
        train_data = train_data[:-num_inoc_ex] + inoc_data[-num_inoc_ex:]

    random.shuffle(train_data)
    return train_data

In [8]:
def create_test_dataset(raw_data, seed):
    """Convert raw records into example tuples and shuffle them reproducibly.

    Args:
        raw_data: Iterable of record dicts with the keys ``sentence``,
            ``linguistic_feature_label`` and ``surface_feature_label``.
        seed: Seed applied to the module-level ``random`` generator before
            shuffling.

    Returns:
        A shuffled list of ``(sentence, linguistic_label,
        [linguistic_label, surface_label])`` tuples, one per input record.
    """
    random.seed(seed)
    examples = [
        (
            record['sentence'],
            record['linguistic_feature_label'],
            [record['linguistic_feature_label'], record['surface_feature_label']],
        )
        for record in raw_data
    ]
    random.shuffle(examples)
    return examples

In [9]:
# Export the "half" variant (one training example per consecutive record pair)
# once per inoculation rate, each into its own directory.
for inoc_rate in [0.003, 0.01, 0.03]:
    dir_name = f"msgs_half_{inoc_rate}/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_name, exist_ok=True)
    train_data = create_train_dataset(raw_train_data, raw_inoc_data, inoc_rate=inoc_rate, seed=2021)
    dev_data = create_test_dataset(raw_dev_data, seed=2023) # 66.7% core-spur correlation in dev and test
    test_data = create_test_dataset(raw_test_data, seed=2024)
    # Context managers make sure every pickle is flushed and its handle closed.
    with open(f"{dir_name}msgs_train.pkl", 'wb') as train_out:
        pickle.dump(train_data, train_out)
    with open(f"{dir_name}msgs_dev.pkl", 'wb') as dev_out:
        pickle.dump(dev_data, dev_out)
    with open(f"{dir_name}msgs_test.pkl", 'wb') as test_out:
        pickle.dump(test_data, test_out)

In [10]:
# Export the "full" variant (every training record kept) once per inoculation
# rate, each into its own directory.
for inoc_rate in [0.003, 0.01, 0.03]:
    dir_name = f"msgs_full_{inoc_rate}/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_name, exist_ok=True)
    train_data = create_train_dataset(raw_train_data, raw_inoc_data, inoc_rate=inoc_rate, seed=2021, if_full_train_set=True)
    dev_data = create_test_dataset(raw_dev_data, seed=2023) # 66.7% core-spur correlation in dev and test
    test_data = create_test_dataset(raw_test_data, seed=2024)
    # Context managers make sure every pickle is flushed and its handle closed.
    with open(f"{dir_name}msgs_train.pkl", 'wb') as train_out:
        pickle.dump(train_data, train_out)
    with open(f"{dir_name}msgs_dev.pkl", 'wb') as dev_out:
        pickle.dump(dev_data, dev_out)
    with open(f"{dir_name}msgs_test.pkl", 'wb') as test_out:
        pickle.dump(test_data, test_out)

In [11]:
# Sizes of the train/dev/test lists currently bound to these names. When the
# cells run in order, these are from the last iteration of the preceding
# export loop.
print(len(train_data), len(dev_data), len(test_data))

10000 15000 15000
