# Language

In [23]:
import os
from pathlib import Path
import json
import pickle

import tqdm 
from icl.constants import DEVICE

# Directory containing the input files read by this notebook (given as a path
# relative to the current working directory).
DATA = Path('../data')

def gen_from_jsonl(file_path, num_lines=5_000_000, start=0):
    """Lazily yield parsed JSON objects from a JSON Lines file.

    Args:
        file_path: Path of the file to read; each line holds one JSON document.
        num_lines: Maximum number of lines to read after the skipped prefix.
            Also used as the progress bar's ``total``.
        start: Number of leading lines to skip (``0`` starts at the first line).

    Yields:
        The value decoded by ``json.loads`` for every non-blank line in the
        window ``[start, start + num_lines)``.

    Raises:
        json.JSONDecodeError: If a non-blank line in the window is not valid JSON.
    """
    # Function-scope import so the notebook's import cell does not need to change.
    from itertools import islice

    # JSON text is UTF-8; be explicit rather than relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # islice skips exactly `start` lines and stops after `num_lines` more.
        # The previous hand-rolled skip loop consumed one line too many (so the
        # first record was always lost) and its limit check let
        # `start + num_lines` rows through instead of `num_lines`.
        window = islice(file, start, start + num_lines)

        for line in tqdm.tqdm(window, total=num_lines, desc=f'Loading {file_path}'):
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not line.strip():
                continue

            yield json.loads(line)


# Load the precomputed bigram frequency table from disk.
# NOTE: pickle can execute arbitrary code while loading -- only open trusted files.
bigram_path = DATA / 'bigram_freq_percents.pkl'
with bigram_path.open('rb') as file:
    bigram_freqs = pickle.load(file)

bigram_freqs.shape # token 1, token 2

(5000, 5000)

In [9]:
from transformer_lens import HookedTransformerConfig, HookedTransformer

# Settings shared by both attention-only configurations below; the two
# configs differ only in their number of layers.
shared_cfg_kwargs = dict(
    d_model=256,
    d_head=32,
    n_heads=8,
    n_ctx=1024,
    d_vocab=5000,
    tokenizer_name='georgeyw/TinyStories-tokenizer-5k',
    normalization_type='LN',
    attn_only=True,
    seed=1,
    positional_embedding_type='shortformer',
)

# Two-layer configuration (the one actually instantiated below).
model_cfg = HookedTransformerConfig(n_layers=2, **shared_cfg_kwargs)

# One-layer variant of the same configuration.
model_cfg_one_layer = HookedTransformerConfig(n_layers=1, **shared_cfg_kwargs)

# Build the two-layer model and keep a handle on its tokenizer.
model = HookedTransformer(model_cfg)
tokenizer = model.tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/65.7k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


In [33]:
import yaml
import torch
from typing import List

def translate_int_to_str(token_ints: List[int]):
    """Convert a sequence of token ids into the model's string tokens.

    Relies on the module-level ``model`` and ``DEVICE``.

    Args:
        token_ints: Token ids to decode.

    Returns:
        Whatever ``model.to_str_tokens`` returns for the ids.
    """
    # Wrap the ids in an outer list so the tensor has a leading dimension of size 1.
    batch_of_one = [token_ints]
    ids_tensor = torch.tensor(batch_of_one, device=DEVICE)
    str_tokens = model.to_str_tokens(ids_tensor)
    return str_tokens


def preview_trigrams_dict(trigrams_dict):
    """Print a header followed by up to the first 1000 entries of a trigram table.

    Entries are shown in the mapping's own iteration order, so the output is a
    "top" list only when the caller passes an already-sorted dict.

    Args:
        trigrams_dict: Mapping from a trigram of token ids to its score.
    """
    print("----- Top 1000 trigrams -----")
    for position, (token_ids, score) in enumerate(trigrams_dict.items()):
        if position >= 1000:
            break
        decoded = translate_int_to_str(token_ids)
        print(f'{decoded}: {score}')


def gen_trigrams(file_path, tokenizer):
    """Yield every run of three consecutive token ids from each record of a JSONL file.

    Args:
        file_path: Path handed to ``gen_from_jsonl``.
        tokenizer: Callable applied to each record's ``'contents'`` value; its
            result must expose the token ids under ``'input_ids'``.

    Yields:
        Length-3 slices of the token id sequence, one per starting position.
        Records with fewer than three tokens contribute nothing.
    """
    for record in gen_from_jsonl(file_path):
        token_ids = tokenizer(record['contents'])['input_ids']

        # One window per start index such that start + 2 is still in range.
        window_starts = range(len(token_ids) - 2)
        yield from (token_ids[s:s + 3] for s in window_starts)
   

def get_top_k_trigrams(file_path, k=10, multiplier=2, padding=10, get_count=lambda _: 1, reverse_count=lambda _: 1, tokenizer=None, verbose=True):
    """Approximate the ``k`` highest-scoring trigrams of a JSONL corpus in one pass.

    Scores are accumulated in a dict keyed by the trigram (as a tuple). Whenever
    the dict grows beyond ``multiplier * k`` entries it is pruned to the
    ``k + padding`` best entries, so the result is approximate: a trigram that
    was pruned and later reappears restarts from zero.

    Args:
        file_path: JSONL file passed to ``gen_trigrams``.
        k: Number of trigrams to return.
        multiplier: The table is pruned once it holds more than
            ``multiplier * k`` entries.
        padding: Extra entries (beyond ``k``) kept at each prune.
        get_count: Called with each trigram occurrence; its return value is
            added to that trigram's score. The default counts occurrences.
        reverse_count: Applied to each final score before returning.
            NOTE(review): the default maps every score to ``1``, which discards
            the accumulated scores -- an identity default was probably
            intended. Pass ``reverse_count=lambda v: v`` to keep the scores.
        tokenizer: Tokenizer forwarded to ``gen_trigrams``. ``None`` (default)
            resolves to ``model.tokenizer`` at call time, so defining this
            function no longer requires the model cell to have run first.
        verbose: When true, print a preview of the table after every prune.

    Returns:
        Dict mapping trigram tuples to ``reverse_count(score)``, ordered from
        highest to lowest score, with at most ``k`` entries.
    """
    if tokenizer is None:
        # Late-bound default; the preview decodes with `model`, so use its tokenizer.
        tokenizer = model.tokenizer

    def _best_items(scores, limit):
        # `limit` highest-scoring (trigram, score) pairs, best first.
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:limit]

    table = {}

    for trigram in gen_trigrams(file_path, tokenizer):
        key = tuple(trigram)
        table[key] = table.get(key, 0.) + get_count(trigram)

        if len(table) > multiplier * k:
            table = dict(_best_items(table, k + padding))
            if verbose:
                # preview_trigrams_dict prints its own header; printing one here
                # as well produced a duplicated header line.
                preview_trigrams_dict(table)

    # sorted(...)[:k] is a list of pairs; the old code called `.items()` on it
    # and raised AttributeError only after the whole corpus had been processed.
    return {key: reverse_count(score) for key, score in _best_items(table, k)}

def divide_by_last_2_bigram(trigram):
    """Weight for one trigram occurrence: ``1 / (5000 * freq)`` of its trailing bigram.

    ``freq`` is looked up in the module-level ``bigram_freqs`` table at
    ``[trigram[1], trigram[2]]``.

    Args:
        trigram: Indexable sequence of token ids; only positions 1 and 2 are read.

    Returns:
        The reciprocal of 5000 times the looked-up bigram frequency.
    """
    second, third = trigram[1], trigram[2]
    trailing_bigram_freq = bigram_freqs[second, third]
    # NOTE(review): 5000 presumably is the vocabulary size (it matches d_vocab) -- confirm.
    return 1. / (5000 * trailing_bigram_freq)

# Top 10,000 trigrams of the training file, with every occurrence weighted by
# divide_by_last_2_bigram instead of being counted as 1.
train_path = DATA / 'train-5m.jsonl'
top_10_000_trigrams = get_top_k_trigrams(
    train_path,
    k=10_000,
    padding=10_000,
    multiplier=100,
    get_count=divide_by_last_2_bigram,
    tokenizer=tokenizer,
)

Loading ../data/train-5m.jsonl:   0%|          | 4255/5000000 [00:06<2:12:48, 626.91it/s]

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['as', ' ', 'ario']: 12138.64272
['as', 'th', 'aking']: 9631.2784
['ard', '.', 'ara']: 7772.098533333333
[' ', ' ', ' heat']: 6388.75932631579
['\n', ' ', ' brand']: 6069.32136
[' ', ' ', ' couple']: 5921.289131707317
['V', ' ', ' movie']: 4668.708738461539
['\n', ' ', ' score']: 4668.708738461539
['the', ' ', 'ancy']: 4495.7936
[' ', ' ', ' increasing']: 4046.2142400000002
[' ', ' ', ' projects']: 3915.6912
[' ', ' ', ' nearly']: 3678.376581818182
[' ', ' ', ' miss']: 3371.8452
[' ', ' ', ' paper']: 3280.714248648649
[' ', ' ', ' opportunity']: 3236.9713920000004
[' ', ' ', ' max']: 3222.6485097345135
['ide', ',', '�']: 3050.7619333333337
[' in', '\n', ' others']: 3046.894975
['ow', '\n', ' cat']: 3046.894975
[' was', ' ', ' supposed']: 3034.66068
[' ', ' ', ' weeks']: 2924.9741493975907
['\n', ' ', ' yes']: 2758.782436363637
[' ', ' ', ' mass']: 2758.782436363637
[' ', 'the', 'ese']: 2745.2198769230768
['ce', 'an', 'wards']:

Loading ../data/train-5m.jsonl:   0%|          | 4395/5000000 [00:08<9:17:33, 149.33it/s] 

['is', 'b', 'oid']: 135.43248157894735
['R', 'ex', 'iter']: 135.2016
['ions', ' ', 'ared']: 135.17419510022273
['ar', 'p', 'nes']: 135.0883090909091
['ering', ' on', ' claimed']: 135.08310666666665
[',', ' saw', ' expected']: 135.0542
['.', ' g', 'ae']: 134.95096451612903
['r', 'atic', ' largest']: 134.9232
['iz', 'en', 'pri']: 134.8579692307692
[' R', ' M', ' Re']: 134.81731612903226
['H', 'oc', ' 2012']: 134.782
[' support', '.', ' prof']: 134.77627514450867
[' sw', 'im', ' spell']: 134.4887142857143
[',', ' —', 'Up']: 134.4603
['k', 'ur', 'ides']: 134.39634545454544
[' n', 'orth', ' They']: 134.39600000000002
['r', 'du', ' policies']: 134.262
['r', 'du', ' movement']: 134.262
['V', 'ot', ' iss']: 134.03013333333334
[' stat', 'u', 'ory']: 133.98426
[' G', 'P', ' whe']: 133.7932857142857
['74', 'b', ' 32']: 133.67361818181817
['\n', 'In', ' restrict']: 133.65480000000002


Loading ../data/train-5m.jsonl:   0%|          | 8401/5000000 [00:14<1:57:20, 709.03it/s]

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['\n', '\n', '�']: 24375.1598
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
['as', 'th', 'aking']: 9631.2784
['ard', '.', 'ara']: 7772.098533333333
['i', ',', 'cher']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
['\n', ' ', ' brand']: 6069.32136
[' ', ' ', ' couple']: 5921.289131707317
[',', '\n', ' school']: 4875.03196
[' ', ' ', ' disease']: 4855.457088
['V', ' ', ' movie']: 4668.708738461539
['\n', ' ', ' score']: 4668.708738461539
['the', ' ', 'ancy']: 4495.7936
[' ', ' ', ' increasing']: 4046.2142400000002
[' ', ' ', ' projects']: 3915.6912
[' L', ' ', 'arge']: 3793.32585
[' ', ' ', ' nearly']: 3678.376581818182
[' pro', 's', 'astic']: 3479.9289
[' ', ' ', ' miss']: 3371.8452
[' ', ' ', ' paper']: 3280.714248648649
[' ', ' ', ' speak']: 3280.714248648649
[' ', ' ', ' opportunity']: 3236.9713920000004
[' ', ' ', ' max']: 3222.6485097345135
['ide', ',', '�']: 3050.7619333333337
[' in', '\n', ' oth

Loading ../data/train-5m.jsonl:   0%|          | 8542/5000000 [00:15<7:49:14, 177.29it/s] 

[' ', ' ', ' Gl']: 247.98044371807967
[' ', 'the', ' been']: 247.83235
[',', ' he', 'well']: 247.79153333333332
['ng', ' T', 'anging']: 246.96356363636363
[' Ang', ' T', 'anging']: 246.96356363636363
['atives', ' has', ' hist']: 246.44537142857146
[' ', ' ', ' Ret']: 246.21993346855982
[' ', ' ', ' ins']: 246.21993346855982
[' ', 'the', ' having']: 246.12316137931035
[' b', 'ib', ' providing']: 245.8224
['�', '�', ' Aug']: 245.7733
['�', '�', 'net']: 245.7733
['�', '�', 'resp']: 245.7733
['te', 'am', '200']: 245.42234285714284
['a', ' ', ' ret']: 245.22510545454548
['ine', ',', '�']: 245.1505125
['\n', ' ', ' back']: 244.7307
['ir', 'ley', 'actions']: 244.46220000000002
['to', ' his', 'What']: 244.38510000000002
[' can', ' place', ' hair']: 244.19440000000003
[',', '\n', '�']: 244.02273437152394
[' 6', '3', 'aud']: 243.9137
[' ', 'the', 'er']: 243.60312901023892
['00', 'z', ' {']: 243.52386666666663
['ial', ' was', ' Republic']: 243.1791142857143
[' it', ' was', ' Cour']: 243.179114285

Loading ../data/train-5m.jsonl:   0%|          | 12658/5000000 [00:21<1:57:28, 707.60it/s]

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['\n', '\n', '�']: 24375.1598
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
['the', ' in', 'uth']: 11283.028400000001
['as', 'th', 'aking']: 9631.2784
['\n', ' ', ' fan']: 9337.417476923078
['ard', '.', 'ara']: 7772.098533333333
['i', ',', 'cher']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
[')', ' ', ' couldn']: 6388.75932631579
['\n', ' ', ' brand']: 6069.32136
[' ', ' ', ' couple']: 5921.289131707317
['M', 'ed', 'vel']: 5655.241550000001
[',', '\n', ' school']: 4875.03196
[',', '\n', ' det']: 4875.03196
[' ', ' ', ' disease']: 4855.457088
['V', ' ', ' movie']: 4668.708738461539
['\n', ' ', ' score']: 4668.708738461539
['the', ' ', 'ancy']: 4495.7936
['\n', ' ', ' strength']: 4335.229542857143
[' ', ' ', ' increasing']: 4046.2142400000002
[' ', ' ', ' projects']: 3915.6912
[' ', ' ', ' mission']: 3915.6912
[' L', ' ', 'arge']: 3793.32585
[' ', ' ', ' nearly']: 3678.376581818182
['al', ' ', ' group

Loading ../data/train-5m.jsonl:   0%|          | 16837/5000000 [00:29<2:02:39, 677.08it/s] 

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['am', ',', 'isk']: 27456.8574
['\n', '\n', '�']: 24375.1598
['a', ',', 'ared']: 13728.4287
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
['the', ' in', 'uth']: 11283.028400000001
['as', 'th', 'aking']: 9631.2784
['\n', ' ', ' fan']: 9337.417476923078
['ard', '.', 'ara']: 7772.098533333333
[' ', ' ', ' mov']: 6949.604610687023
['i', ',', 'cher']: 6864.21435
['ia', ',', 'vil']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
[')', ' ', ' couldn']: 6388.75932631579
['\n', ' ', ' brand']: 6069.32136
['port', ' ', ' offers']: 6069.32136
[' ', ' ', ' couple']: 5921.289131707317
[' ', ' ', ' basic']: 5780.306057142857
['M', 'ed', 'vel']: 5655.241550000001
['ilt', ',', 'ist']: 5491.37148
['artment', ',', 'als']: 5491.37148
['iv', ',', 'als']: 5491.37148
[' ', ' ', ' path']: 4988.483309589041
[',', '\n', ' school']: 4875.03196
[',', '\n', ' det']: 4875.03196
[' ', ' ', ' disease']: 4855.457088
[' ', ' ', ' stori

Loading ../data/train-5m.jsonl:   0%|          | 21030/5000000 [00:37<2:09:49, 639.16it/s] 

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['am', ',', 'isk']: 27456.8574
['\n', '\n', '�']: 24375.1598
['a', ',', 'ared']: 13728.4287
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
[' T', 'the', 'ior']: 11895.952800000001
['the', ' in', 'uth']: 11283.028400000001
['as', 'th', 'aking']: 9631.2784
['\n', ' ', ' fan']: 9337.417476923078
[' ', ' ', ' slight']: 9337.417476923078
[' Rich', '\n', 'ird']: 8125.053266666668
['cer', ' of', 'ry']: 7958.930700000001
['ard', '.', 'ara']: 7772.098533333333
[' ', ' ', ' mov']: 6949.604610687023
['i', ',', 'cher']: 6864.21435
['ia', ',', 'vil']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
[')', ' ', ' couldn']: 6388.75932631579
['ail', '\n', 'oad']: 6093.78995
['\n', ' ', ' brand']: 6069.32136
['port', ' ', ' offers']: 6069.32136
[' grow', 'the', 'conom']: 5947.9764000000005
[' ', ' ', ' couple']: 5921.289131707317
[' ', ' ', ' basic']: 5780.306057142857
['M', 'ed', 'vel']: 5655.241550000001
['ilt', ',', 'is

Loading ../data/train-5m.jsonl:   0%|          | 21094/5000000 [00:38<11:54:34, 116.13it/s]

['\n\n', ' ', ' Lab']: 541.9036928571429
[' ', ' ', ' property']: 541.9036928571428
['M', 'e', 'hern']: 541.3744666666666
[' ', 'the', 'du']: 540.7251272727273
[' ', 'the', 'formation']: 540.7251272727273
[',', ' ', 'ents']: 539.495232
[' ', ' ', ' //']: 538.8964581576026
[' witness', 'ed', 'rain']: 538.5944333333333
['im', 'g', 'cle']: 537.5437999999999
['ar', 'th', ' sometimes']: 535.0710222222222
[' dep', 'th', ' million']: 535.0710222222222
['ar', 'th', 'urn']: 535.0710222222222
[' or', 'th', 'oh']: 535.0710222222222
[' )', ' R', 'mm']: 533.5618
[';', ' ', ' nor']: 532.3966105263157
['l', 'ab', ' manager']: 531.1152571428571
[' ', ' ', ' ID']: 530.0717344978166
['\n', ' ', ' respect']: 530.0717344978166
[' ', ' ', ' house']: 530.0717344978166
['ables', '.', ' measure']: 529.9158090909091
[' ed', '.', ' whole']: 529.9158090909091
['44', '\n', ' 38']: 529.8947782608695
['ical', ' year', 'x']: 529.605
[' Sh', 'am', 'roy']: 528.6019692307692
[' Ch', 'an', 'ler']: 525.9578181818182
[' '

Loading ../data/train-5m.jsonl:   0%|          | 22393/5000000 [00:40<2:05:27, 661.26it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (6337 > 2048). Running this sequence through the model will result in indexing errors
Loading ../data/train-5m.jsonl:   1%|          | 25200/5000000 [00:45<2:07:59, 647.77it/s]

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['am', ',', 'isk']: 27456.8574
['\n', '\n', '�']: 24375.1598
['a', ',', 'ared']: 13728.4287
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
[' T', 'the', 'ior']: 11895.952800000001
['the', ' in', 'uth']: 11283.028400000001
['as', 'th', 'aking']: 9631.2784
['\n', ' ', ' fan']: 9337.417476923078
[' ', ' ', ' slight']: 9337.417476923078
[' Rich', '\n', 'ird']: 8125.053266666668
['cer', ' of', 'ry']: 7958.930700000001
['ard', '.', 'ara']: 7772.098533333333
[' ', ' ', ' mov']: 6949.604610687023
['i', ',', 'cher']: 6864.21435
['ia', ',', 'vil']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
[')', ' ', ' couldn']: 6388.75932631579
['ail', '\n', 'oad']: 6093.78995
['\n', ' ', ' brand']: 6069.32136
['port', ' ', ' offers']: 6069.32136
[' grow', 'the', 'conom']: 5947.9764000000005
[' ', ' ', ' couple']: 5921.289131707317
[' ', ' ', ' basic']: 5780.306057142857
['M', 'ed', 'vel']: 5655.241550000001
['ilt', ',', 'is

Loading ../data/train-5m.jsonl:   1%|          | 25265/5000000 [00:46<12:11:11, 113.39it/s]

[' ', 'the', 'et']: 626.1027789473684
['\n', ' ', ' very']: 625.7032329896907
[' ', ' ', ' ref']: 625.7032329896907
[' ', ' ', ' son']: 625.7032329896907
['\n', ' ', ' front']: 625.7032329896907
['w', 'ast', 'ocol']: 623.5982
['l', 'ast', ' valid']: 623.5982
[' ', ' ', ' Ass']: 623.1336098562629
['i', ' It', 'ya']: 622.576
['\n', ' ', ' came']: 622.4944984615385
['ion', ' from', 'eth']: 619.94155
['ERS', ' v', 'ER']: 619.5292000000001
['gy', ' v', 'uth']: 619.5292000000001
[' "', 'he', 'As']: 619.1162
['ft', 'he', 'IS']: 619.1162
['man', ' D', 'iday']: 618.648
['l', ',', '.,']: 617.0080314606741
['.', ' ', ' Le']: 616.8457280580761
[' The', ' ', ' line']: 616.1747573604061
['\n', ' ', ' letter']: 616.1747573604061
[').', ' ', ' Secretary']: 616.1747573604061
[' ', ' ', ' Secretary']: 616.1747573604061
[' ', ' ', ' line']: 616.1747573604061
['1', ';', 'ful']: 616.0852
[' ', 'the', 'og']: 615.3079034482759
['\n', ' ', ' written']: 613.0627636363637
['e', '-', ' bi']: 612.8601647058823
['

Loading ../data/train-5m.jsonl:   1%|          | 29395/5000000 [00:52<2:03:12, 672.34it/s] 

----- Top 1000 trigrams -----
----- Top 1000 trigrams -----
['am', ',', 'isk']: 27456.8574
['\n', '\n', '�']: 24375.1598
['a', ',', 'ared']: 13728.4287
['as', ' ', 'ario']: 12138.64272
[' ', ' ', ' spokes']: 12138.64272
[' T', 'the', 'ior']: 11895.952800000001
['the', ' in', 'uth']: 11283.028400000001
['as', 'th', 'aking']: 9631.2784
['\n', ' ', ' fan']: 9337.417476923078
[' ', ' ', ' slight']: 9337.417476923078
[' Rich', '\n', 'ird']: 8125.053266666668
[' ', '\n', ' follow']: 8125.053266666668
['cer', ' of', 'ry']: 7958.930700000001
['ard', '.', 'ara']: 7772.098533333333
[' of', ' ', 'adow']: 7140.378070588235
[' ', ' ', ' mov']: 6949.604610687023
['i', ',', 'cher']: 6864.21435
['ia', ',', 'vil']: 6864.21435
[' ', ' ', ' heat']: 6388.75932631579
[')', ' ', ' couldn']: 6388.75932631579
['.', ' ', ' Democr']: 6388.75932631579
['ail', '\n', 'oad']: 6093.78995
[',', '\n', ' introdu']: 6093.78995
['\n', ' ', ' brand']: 6069.32136
['port', ' ', ' offers']: 6069.32136
['\n', ' ', ' saf']: 60

Loading ../data/train-5m.jsonl:   1%|          | 32019/5000000 [00:58<2:31:57, 544.86it/s] 


KeyboardInterrupt: 

TODOs
- Normal (raw-count) top-k trigrams
- Normal (raw-count) top-k skip-trigrams
- Adjusted (by last-2 bigram freq) top-k trigrams
- Adjusted (by last-2 bigram freq) top-k skip-trigrams

Partition the dataset into subsets of 100k rows:
- Repeat each analysis above for every subset, then compare the subsets for differences/commonalities.
