#### Setup

In [None]:
# Install Kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle

In [2]:
import contextlib
from google.colab import files
import io

with contextlib.redirect_stdout(io.StringIO()):
    files.upload()

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

! kaggle competitions download -c AI4Code

!  unzip /content/AI4Code.zip -d /content/data

! rm -rf /content/AI4Code.zip

### Imports and Globals

In [5]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

BERT_PATH = "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"

data_dir = Path('/content/data')

### Read input data

In [6]:
NUM_TRAIN = 10000


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|██████████| 10000/10000 [01:15<00:00, 132.65it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
000a4651cce8f4,9ae35818,code,"import os\nimport time\nimport requests\n\nimport jax\nimport jax.numpy as jnp\nfrom jax import jit, grad, random\n\..."
000a4651cce8f4,f1dee85b,code,"def apply_activation(x):\n return jnp.maximum(0.0, x)\n\ndef get_dot_product(W, X):\n return jnp.dot(W, X)"
000a4651cce8f4,00e71412,code,"# Always use a seed\nkey = random.PRNGKey(1234)\nW = random.normal(key=key, shape=[1000, 10000], dtype=jnp.float32)\..."
000a4651cce8f4,5bcfa9ad,code,# Make jaxpr for the activation function\nprint(jax.make_jaxpr(activation_jit)(Z))
000a4651cce8f4,f417baab,code,"# Make jaxpr for the activation function\nprint(jax.make_jaxpr(dot_product_jit)(W, X))"
...,...,...,...
fffc30d5a0bc46,3ba953ee,markdown,#### Checking the dataframe
fffc30d5a0bc46,53125cfe,markdown,#### Checking the number of nulls in percentage
fffc30d5a0bc46,63340e73,markdown,#### Now we will try to impute null with the maximum occured values
fffc30d5a0bc46,09727c0c,markdown,#### Importing necessary libraries


Let's analyse an example notebook.

* df created above contains all the commands (disordered)
* Order df (train_orders.csv) contains the order of the cells in the notebooks

In [7]:
nb_id = df.index.unique('id')[6]
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = df.loc[nb_id, :]
display(nb)
print()

Notebook: 005732b128455c
The disordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3c9d50de,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
8a48b85b,code,url = 'https://www.kaggleusercontent.com/episodes/4231325.json'
0adc9518,code,"import urllib, json\n\nresponse = urllib.request.urlopen(url)\ndata = json.loads(response.read())\n"
10de9bf7,code,data.keys()
940f99ee,code,data
ff6affb7,code,dict_obs = {}\nfor i in range(1000):\n player1 = data['steps'][i][0]\n player2 = data['steps'][i][1]\n dict...
27fdb8bd,code,df = pd.DataFrame(dict_obs)\ndf = df.T\ndf
603f43db,code,df.describe()
2be92532,code,import matplotlib.pyplot as plt
8163c78d,code,df['player1_action'].plot()





In [8]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # Split the string representation of cell_ids into a list

df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

In [11]:
len(df_orders.loc[nb_id])

14

In [10]:
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
nb.loc[cell_order, :]

The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3c9d50de,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
3619e413,markdown,Credits goes to https://www.kaggle.com/arunprathap/replay-json-eda-i for this notebook
8a48b85b,code,url = 'https://www.kaggleusercontent.com/episodes/4231325.json'
b79c37ef,markdown,## Load Replay
0adc9518,code,"import urllib, json\n\nresponse = urllib.request.urlopen(url)\ndata = json.loads(response.read())\n"
04b2e812,markdown,## Explore Replay
10de9bf7,code,data.keys()
940f99ee,code,data
ff6affb7,code,dict_obs = {}\nfor i in range(1000):\n player1 = data['steps'][i][0]\n player2 = data['steps'][i][1]\n dict...
27fdb8bd,code,df = pd.DataFrame(dict_obs)\ndf = df.T\ndf


Add rank to the cells in train_orders df

In [12]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

# testing the function on the above example train_order sample
cell_ranks = get_ranks(cell_order, list(nb.index))
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3c9d50de,0,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
8a48b85b,2,code,url = 'https://www.kaggleusercontent.com/episodes/4231325.json'
0adc9518,4,code,"import urllib, json\n\nresponse = urllib.request.urlopen(url)\ndata = json.loads(response.read())\n"
10de9bf7,6,code,data.keys()
940f99ee,7,code,data
ff6affb7,8,code,dict_obs = {}\nfor i in range(1000):\n player1 = data['steps'][i][0]\n player2 = data['steps'][i][1]\n dict...
27fdb8bd,9,code,df = pd.DataFrame(dict_obs)\ndf = df.T\ndf
603f43db,10,code,df.describe()
2be92532,11,code,import matplotlib.pyplot as plt
8163c78d,12,code,df['player1_action'].plot()


Assign correct cell order rank in the df created from raw data

In [16]:
# Step 1 : Add correct cell order to df, using order in df_orders 
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

# Step 2 - Get rank (here cell_order is base and cell_id retreived from df_orders is the derived)
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

# Step 3 - Create a dataframe out of it
df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
000a4651cce8f4,9ae35818,2
000a4651cce8f4,f1dee85b,3
000a4651cce8f4,00e71412,4
000a4651cce8f4,5bcfa9ad,7
000a4651cce8f4,f417baab,9
...,...,...
fffc30d5a0bc46,3ba953ee,4
fffc30d5a0bc46,53125cfe,8
fffc30d5a0bc46,63340e73,10
fffc30d5a0bc46,09727c0c,0


Read in ancestors data

In [17]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


Merge ranks and ancestors to raw data

In [19]:
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df

Unnamed: 0,index,id,cell_id,cell_type,source,rank_x,ancestor_id_x,parent_id_x,rank_y,ancestor_id_y,parent_id_y
0,0,000a4651cce8f4,9ae35818,code,"import os\nimport time\nimport requests\n\nimport jax\nimport jax.numpy as jnp\nfrom jax import jit, grad, random\n\...",2,4db21994,,2,4db21994,
1,1,000a4651cce8f4,f1dee85b,code,"def apply_activation(x):\n return jnp.maximum(0.0, x)\n\ndef get_dot_product(W, X):\n return jnp.dot(W, X)",3,4db21994,,3,4db21994,
2,2,000a4651cce8f4,00e71412,code,"# Always use a seed\nkey = random.PRNGKey(1234)\nW = random.normal(key=key, shape=[1000, 10000], dtype=jnp.float32)\...",4,4db21994,,4,4db21994,
3,3,000a4651cce8f4,5bcfa9ad,code,# Make jaxpr for the activation function\nprint(jax.make_jaxpr(activation_jit)(Z)),7,4db21994,,7,4db21994,
4,4,000a4651cce8f4,f417baab,code,"# Make jaxpr for the activation function\nprint(jax.make_jaxpr(dot_product_jit)(W, X))",9,4db21994,,9,4db21994,
...,...,...,...,...,...,...,...,...,...,...,...
455708,455708,fffc30d5a0bc46,3ba953ee,markdown,#### Checking the dataframe,4,6aed207b,,4,6aed207b,
455709,455709,fffc30d5a0bc46,53125cfe,markdown,#### Checking the number of nulls in percentage,8,6aed207b,,8,6aed207b,
455710,455710,fffc30d5a0bc46,63340e73,markdown,#### Now we will try to impute null with the maximum occured values,10,6aed207b,,10,6aed207b,
455711,455711,fffc30d5a0bc46,09727c0c,markdown,#### Importing necessary libraries,0,6aed207b,,0,6aed207b,
