### Generate Data --- MODE_2nd

In [1]:
import os, random, json, re
import pandas as pd
import numpy as np
from tqdm import trange, tqdm
import numpy as np
from collections import Counter
TASK = "mode_2nd"

In [14]:
# uniform, uniform_hard, uniform_hard+
samples_per_num_digit = 1000
data = {}
for num_digit in trange(8, 128):
    unique_buckets = set()
    strings = []
    while len(strings) < samples_per_num_digit:
        partition = sorted(random.sample(list(range(1, num_digit+10)), 9))
        prev = 0
        buckets = []
        for b in partition:
            buckets.append(b-prev-1)
            prev = b
        buckets.append(num_digit+10-prev-1)
        s = "".join([str(d)*n for n, d in zip(buckets, list(range(10)))])

        try: 
            counts = Counter(s).most_common()
            if counts[0][1] == counts[1][1] or counts[1][1] == counts[2][1]: continue # to ensure mode is unique
            #if counts[0][1] - counts[2][1] > 2: continue # to ensure the most_common frequency - the third_common frequency <= 10
        except IndexError: continue # to ensure there are at least 2 distinct digits

        if "".join([str(x) for x in buckets]) not in unique_buckets:
            unique_buckets.add("".join([str(x) for x in buckets]))
            strings.append("".join(random.sample(s, len(s))))

    data[num_digit] = strings

json.dump(data, open(f"../data/finetune/{TASK}/finetune.json", "w"), indent=2)
            


100%|██████████| 120/120 [00:05<00:00, 23.49it/s]


In [8]:
# length_extrapolation
samples_per_num_digit = 1000
data = {}
for num_digit in trange(129, 257):
    unique_buckets = set()
    strings = []
    while len(strings) < samples_per_num_digit:
        partition = sorted(random.sample(list(range(1, num_digit+10)), 9))
        prev = 0
        buckets = []
        for b in partition:
            buckets.append(b-prev-1)
            prev = b
        buckets.append(num_digit+10-prev-1)
        s = "".join([str(d)*n for n, d in zip(buckets, list(range(10)))])

        try: 
            most_common1, most_common2, most_common3 = Counter(s).most_common(3)
            if most_common1[1] == most_common2[1] or most_common2[1] == most_common3[1]: continue # to ensure mode is unique
        except ValueError: continue # to ensure there are at least 2 distinct digits

        if "".join([str(x) for x in buckets]) not in unique_buckets:
            unique_buckets.add("".join([str(x) for x in buckets]))
            strings.append("".join(random.sample(s, len(s))))

    data[num_digit] = strings
json.dump(data, open(f"../data/finetune/{TASK}/finetune_129_256.json", "w"), indent=2)
            

100%|██████████| 128/128 [00:10<00:00, 12.08it/s]


In [15]:
# Uniform split
samples_per_num_digit_train, samples_per_num_digit_val = 100, 100

data = json.load(open(f"../data/finetune/{TASK}/finetune.json", "r"))
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

answers = []

train = pd.DataFrame(columns=["input_str", "answer"])
val = pd.DataFrame(columns=["input_str", "answer"])
for num_digit in tqdm(data):
    for s in data[num_digit][:samples_per_num_digit_train]:
        answer = Counter(s).most_common(2)[1][0]
        answers.append(answer)
        row = {
            "input_str": [s],
            "answer": [answer]
        }
        row = pd.DataFrame(row)
        train = pd.concat([train, pd.DataFrame(row)])
    for s in data[num_digit][
        samples_per_num_digit_train:samples_per_num_digit_train+samples_per_num_digit_val
    ]:
        row = {
            "input_str": [s],
            "answer": [Counter(s).most_common(2)[1][0]]
        }
        row = pd.DataFrame(row)
        val = pd.concat([val, pd.DataFrame(row)])
print(len(train), len(val))

print("Check if answers are uniformly distributed across 0-9")
print(Counter(answers).most_common(10))
train.to_csv(f"../data/finetune/{TASK}/uniform_split/train.csv", index=False)
val.to_csv(f"../data/finetune/{TASK}/uniform_split/val.csv", index=False)


samples_per_num_digit = 1000


  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [00:06<00:00, 17.73it/s]

12000 12000
Check if answers are uniformly distributed across 0-9
[('4', 1277), ('3', 1233), ('1', 1213), ('2', 1206), ('0', 1204), ('5', 1202), ('6', 1185), ('8', 1168), ('9', 1166), ('7', 1146)]





In [2]:
# length_extrapolation split
samples_per_num_digit_train, samples_per_num_digit_val = 0, 900

data = json.load(open(f"../data/finetune/{TASK}/finetune_129_256.json", "r"))
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

train = pd.DataFrame(columns=["input_str", "answer"])
val = pd.DataFrame(columns=["input_str", "answer"])
for num_digit in tqdm(data):
    for s in data[num_digit][:samples_per_num_digit_train]:
        row = {
            "input_str": [s],
            "answer": [Counter(s).most_common(2)[1][0]]
        }
        row = pd.DataFrame(row)
        train = pd.concat([train, pd.DataFrame(row)])
    for s in data[num_digit][
        samples_per_num_digit_train:samples_per_num_digit_train+samples_per_num_digit_val
    ]:
        row = {
            "input_str": [s],
            "answer": [Counter(s).most_common(2)[1][0]]
        }
        row = pd.DataFrame(row)
        val = pd.concat([val, pd.DataFrame(row)])
print(len(train), len(val))

val.to_csv(f"../data/finetune/{TASK}/length_extrapolation_large/val.csv", index=False)

samples_per_num_digit = 1000


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 128/128 [01:36<00:00,  1.33it/s]


0 115200


In [8]:
128*900 / 64

1800.0

In [17]:
# Uniform hard split
samples_per_num_digit_train, samples_per_num_digit_val = 100, 100

data = json.load(open(f"../data/finetune/{TASK}/finetune_hard+.json", "r"))
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

answers = []

train = pd.DataFrame(columns=["input_str", "answer"])
val = pd.DataFrame(columns=["input_str", "answer"])
for num_digit in tqdm(data):
    for s in data[num_digit][:samples_per_num_digit_train]:
        answer = Counter(s).most_common(1)[0][0]
        answers.append(answer)
        row = {
            "input_str": [s],
            "answer": [Counter(s).most_common(2)[1][0]]
        }
        row = pd.DataFrame(row)
        train = pd.concat([train, pd.DataFrame(row)])
    for s in data[num_digit][
        samples_per_num_digit_train:samples_per_num_digit_train+samples_per_num_digit_val
    ]:
        row = {
            "input_str": [s],
            "answer": [Counter(s).most_common(2)[1][0]]
        }
        row = pd.DataFrame(row)
        val = pd.concat([val, pd.DataFrame(row)])
print(len(train), len(val))

print("Check if answers are uniformly distributed across 0-9")
print(Counter(answers).most_common(10))
train.to_csv(f"../data/finetune/{TASK}/uniform_hard+_split/train.csv", index=False)
val.to_csv(f"../data/finetune/{TASK}/uniform_hard+_split/val.csv", index=False)


samples_per_num_digit = 1000


  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [00:06<00:00, 17.83it/s]

12000 12000
Check if answers are uniformly distributed across 0-9
[('3', 1262), ('5', 1235), ('9', 1230), ('4', 1218), ('7', 1199), ('2', 1193), ('6', 1192), ('1', 1186), ('8', 1171), ('0', 1114)]



