### Download data from Kaggle

In [1]:
# Install Kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[?25l[K     |█████▋                          | 10 kB 1.5 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 2.7 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 3.9 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 6.1 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.9 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=61b194c2eaa98208f6e49a0ead5cd23f8cd48dfd2995199c9106fd838525c996
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninst

In [2]:
import contextlib
from google.colab import files
import io

with contextlib.redirect_stdout(io.StringIO()):
    files.upload()

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

! kaggle competitions download -c AI4Code

!  unzip /content/AI4Code.zip -d /content/data

### Setup

In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('/content/data/')

### Load Data

The number of notebooks read can be controlled by the parameter NUM_TRAIN

In [6]:
NUM_TRAIN = 100


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df.head()


Train NBs:   0%|          | 0/100 [00:00<?, ?it/s][A
Train NBs:  22%|██▏       | 22/100 [00:00<00:00, 217.22it/s][A
Train NBs:  44%|████▍     | 44/100 [00:00<00:00, 200.13it/s][A
Train NBs:  65%|██████▌   | 65/100 [00:00<00:00, 196.27it/s][A
Train NBs: 100%|██████████| 100/100 [00:00<00:00, 192.74it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
037f0f8814fc21,c3ba9f4c,code,import pandas as pd\npd.plotting.register_matplotlib_converters()\nimport matplotlib.pyplot as plt\n%matplotlib inli...
037f0f8814fc21,a1c32635,code,df = pd.read_csv('../input/tokyo-olympics-2020-tweets/tokyo_2020_tweets.csv')\ndf
037f0f8814fc21,12de50e5,code,df.info()
037f0f8814fc21,2b370d37,code,df.describe()
037f0f8814fc21,88ab6efc,code,df_hashtags = df[df['hashtags'].notnull()]\nprint(df.shape)\nprint(df_hashtags.shape)


In [7]:
# Get a sample notebook
nb_id = df.index.unique('id')[6]
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = df.loc[nb_id, :]
display(nb)
print()

Notebook: 1a597af742744a
The disordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
dd3085dc,code,import pandas as pd
ac4c16b5,code,import numpy as np
aaa95298,code,import matplotlib.pyplot as plt
4a376c90,code,### Import Data
51f8f4f8,code,"data = pd.read_csv(""../input/uber-request-data/Uber Request Data.csv"")"
...,...,...
80753c13,markdown,### Plot a histogram to show the time slots when trip from city to airport got cancelled
43486bf4,markdown,### Print Columns and Rows
1a169baf,markdown,### Filter trips from airport to city when cars were not available
e866c9a4,markdown,### Convert the hours column to list format





### Ordering the Cells

In the train_orders.csv file we have, for notebooks in the training set, the correct ordering of cells in terms of the cell ids.

In [9]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # Split the string representation of cell_ids into a list

df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

In [11]:
# Get the correct order
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
nb.loc[cell_order, :]

The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
755b88a6,markdown,### Import Libraries
dd3085dc,code,import pandas as pd
ac4c16b5,code,import numpy as np
aaa95298,code,import matplotlib.pyplot as plt
4a376c90,code,### Import Data
...,...,...
0750d3b4,markdown,### Plot a histogram to show the time slots when cars were not availabale for airport to city trip
13716a7a,code,plots(airport_to_city_not_available_hour)
b93c7aa9,markdown,### Conclusion
bf9162b5,markdown,#### 1. Heavy demand in the city from 4 AM to 11 PM


The correct numeric position of a cell we will call the rank of the cell. We can find the ranks of the cells within a notebook by referencing the true ordering of cell ids as given in train_orders.csv.

In [12]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

cell_ranks = get_ranks(cell_order, list(nb.index))
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dd3085dc,1,code,import pandas as pd
ac4c16b5,2,code,import numpy as np
aaa95298,3,code,import matplotlib.pyplot as plt
4a376c90,4,code,### Import Data
51f8f4f8,5,code,"data = pd.read_csv(""../input/uber-request-data/Uber Request Data.csv"")"
...,...,...,...
80753c13,67,markdown,### Plot a histogram to show the time slots when trip from city to airport got cancelled
43486bf4,6,markdown,### Print Columns and Rows
1a169baf,63,markdown,### Filter trips from airport to city when cars were not available
e866c9a4,77,markdown,### Convert the hours column to list format


The algorithm we'll be using for our baseline model uses the cell ranks as the target, so let's create a dataframe of the ranks for each notebook.

In [13]:
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
037f0f8814fc21,c3ba9f4c,0
037f0f8814fc21,a1c32635,2
037f0f8814fc21,12de50e5,6
037f0f8814fc21,2b370d37,7
037f0f8814fc21,88ab6efc,11
...,...,...
fed80901d135da,b86d4216,5
fed80901d135da,4b5aade7,22
fed80901d135da,277ae3de,26
fed80901d135da,b2fe2d6c,13


### Splits

The df_ancestors.csv file identifies groups of notebooks derived from a common origin, that is, notebooks belonging to the same forking tree.

In [14]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors.head()

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,


To prevent leakage, the test set has no notebook with an ancestor in the training set. We therefore form a validation split using ancestor_id as a grouping factor.

In [17]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

print('Total Training Examples : ', df_train.shape)
print('Total Validation Examples : ', df_valid.shape)

Total Training Examples :  (4239, 2)
Total Validation Examples :  (387, 2)


### Feature Engineering

Let's generate tf-idf features to use with our ranking model. These features will help our model learn what kinds of words tend to occur most often at various positions within a notebook.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(min_df = 0.1)
# Training Data
X_train = tfidf.fit_transform(df_train['source'].astype(str))
# Training Labels
y_train = df_ranks.loc[ids_train].to_numpy()
# Number of cells in each notebook
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy()

Now let's add the code cell ordering as a feature. We'll append a column that enumerates the code cells in the correct order, like 1, 2, 3, 4, ..., while having the dummy value 0 for all markdown cells. This feature will help the model learn to put the code cells in the correct order.

In [27]:
# Add code cell ordering
X_train = sparse.hstack((
    X_train,
    np.where(
        df_train['cell_type'] == 'code',
        df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))
print(X_train.shape)

(4239, 11)


### Train