In [1]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRanker
from bisect import bisect

In [12]:
# Upper bound on how many notebook files are read from a directory.
num_train = 10_000

def read_notebook(path, id_name):
    """Load one notebook JSON file into a DataFrame indexed by cell id.

    Parameters
    ----------
    path
        Location of the notebook JSON file, handed to ``pd.read_json``.
    id_name
        Value written to the added ``id`` column on every row.

    Returns
    -------
    pandas.DataFrame
        Frame whose index is named ``cell_id``; ``cell_type`` is read as a
        categorical column and ``source`` as a string column, followed by
        the constant ``id`` column.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    notebook = pd.read_json(path, dtype=column_types)
    notebook = notebook.assign(id=id_name)
    return notebook.rename_axis('cell_id')

# Collect at most `num_train` regular files from the training directory.
paths = []
directory = 'train'
# scandir is used as a context manager so the directory handle is released
# even though the loop normally stops before the iterator is exhausted.
with os.scandir(directory) as entries:
    for file in entries:
        if file.is_file():
            paths.append(file.path)
        if len(paths) == num_train:
            break

# Notebook id = file name without directory or extension. os.path is used so
# the result does not depend on the platform's path separator.
id_names = [os.path.splitext(os.path.basename(path))[0] for path in paths]

# One DataFrame per notebook, each tagged with its notebook id.
train_notebooks = [
    read_notebook(path, id_name) for path, id_name in zip(paths, id_names)
]

print(train_notebooks[0])

         cell_type                                             source  \
cell_id                                                                 
02373f0f      code  import numpy as np \nimport pandas as pd\nimpo...   
7eee6f0d      code  # training data\ntrain_dts = pd.read_csv('../i...   
e38ca82d      code  # test data\ntest_dts = pd.read_csv('../input/...   
9376e7b4      code  female = train_dts.loc[train_dts.Sex=='female'...   
dfdb7c2c      code  print('Shape of Training Set : {}'.format(trai...   
...            ...                                                ...   
88243368  markdown  after the final processing of features we are ...   
604f6956  markdown  Calculating Survival rate of Male and Female o...   
0562b43b  markdown           creating titles from names of passengers   
4f0494f1  markdown  can be seen clearly that female has much large...   
d1798638  markdown  # 4. Creating train and test set and label enc...   

                      id  
cell_id                

In [16]:
# Stack all notebooks into one frame keyed by (id, cell_id).
df = pd.concat(train_notebooks).set_index('id', append=True).swaplevel()
# Sort by notebook id only; sort_remaining=False leaves cell_id unsorted
# within each notebook.
df = df.sort_index(level='id', sort_remaining=False)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0002115f48f982,18281c6c,code,import numpy as np # linear algebra\nimport pa...
0002115f48f982,e3b6b115,code,df = pd.read_csv('../input/metadata_train.csv'...
0002115f48f982,4a044c54,code,df.head()
0002115f48f982,365fe576,code,#let's check if targets are consistent within ...
0002115f48f982,a3188e54,code,"sns.countplot(x='target',data=targets)\n# it s..."


In [17]:
# Ground-truth cell order per notebook id.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the single remaining column explicitly yields the same Series.
df_orders = (
    pd.read_csv('train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()  # whitespace-separated cell ids -> list of cell ids
)

In [18]:
# Inspect the result: one list of cell ids per notebook id.
df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b7...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c4172...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279...
0002115f48f982    [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe...
                                        ...                        
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba95...
fffc3b44869198    [978a5137, faa48f03, 28dfb12a, eea2e812, 64fef...
fffc63ff750064    [5015c300, 411b85d9, 8238198c, f4781d1d, b5532...
fffcd063cda949    [7e6266ad, d8281fc5, d4ffcaef, 3e0e4a47, 21387...
fffe1d764579d5    [1a63248d, 9c3b96a5, 1398a873, 4e2d4c2d, f71c5...
Name: cell_order, Length: 139256, dtype: object

In [42]:
# Cell ids of every loaded notebook, in the order they are stored in `df`.
cells_per_notebook = df.reset_index('cell_id').groupby('id')['cell_id'].apply(list)

# Right join: keep only the notebooks that were actually loaded, pairing the
# ground-truth order (cell_order) with the stored order (cell_id).
df_orders_ = df_orders.to_frame().join(cells_per_notebook, how='right')

def get_ranks(base, derived):
    """Return the position in `base` of every element of `derived`.

    Positions come from ``base.index``, so an element that is absent from
    `base` raises ``ValueError`` and a repeated element maps to its first
    occurrence. The result is a list as long as `derived`.
    """
    positions = []
    for element in derived:
        positions.append(base.index(element))
    return positions

# For every notebook: its stored cell ids and the rank each one has in the
# ground-truth order.
ranks = {
    id_: {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}
    for id_, cell_order, cell_id in df_orders_.itertuples()
}

# One row per notebook holding two list-valued columns ...
df_ranks = pd.DataFrame.from_dict(ranks, orient='index').rename_axis('id')
# ... exploded to one row per cell and keyed by (id, cell_id).
df_ranks = df_ranks.apply(pd.Series.explode).set_index('cell_id', append=True)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
0002115f48f982,18281c6c,1
0002115f48f982,e3b6b115,2
0002115f48f982,4a044c54,3
0002115f48f982,365fe576,4
0002115f48f982,a3188e54,5
...,...,...
ffe8d0aa5e7d68,b9f0782a,54
ffe8d0aa5e7d68,3492f280,45
ffe8d0aa5e7d68,eea09e6e,59
ffe8d0aa5e7d68,54ffd613,68


In [23]:
# Notebook lineage table, indexed by notebook id. Its `ancestor_id` column is
# used further down as the grouping key for the train/validation split.
df_ancestors = pd.read_csv('train_ancestors.csv', index_col='id')
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


In [24]:
siz = 0.1  # test_size for the splitter (a float is a proportion per scikit-learn docs)

# Single shuffled split with a fixed seed so the partition is repeatable.
splitter = GroupShuffleSplit(n_splits=1, test_size=siz, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
# The splitter yields positional indices into `ids` ...
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
# ... which are mapped back to the notebook ids themselves.
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

# All cells of the selected notebooks.
df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

In [25]:
# Training features: TF-IDF of each cell's source text, vocabulary fitted on
# the training cells only.
tfidf = TfidfVectorizer(min_df=0.01)
train_text = df_train['source'].astype(str)
X_train = tfidf.fit_transform(train_text)

# Look the training ranks up once and derive both targets from them.
ranks_train = df_ranks.loc[ids_train]
# Target: rank of each cell within its notebook.
y_train = ranks_train.to_numpy()
# Group sizes: number of cells in each notebook.
groups = ranks_train.groupby('id').size().to_numpy()

In [27]:
# Append one extra feature column: the 1-based position of a code cell among
# the code cells of its notebook, and 0 for every non-code cell.
# NOTE: this cell reassigns X_train from itself, so running it again appends
# yet another column.
train_is_code = df_train['cell_type'] == 'code'
train_position = df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
train_code_order = np.where(train_is_code, train_position, 0).reshape(-1, 1)
X_train = sparse.hstack((X_train, train_code_order))
print(X_train.shape)

(407571, 283)


In [29]:
# Ranking model over the TF-IDF features plus the code-order column.
model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
# `groups` holds the number of cells per notebook, so rows are ranked within
# their own notebook.
model.fit(X_train, y_train, group=groups)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=10, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

In [30]:
# Validation features: reuse the vocabulary fitted on the training cells.
valid_text = df_valid['source'].astype(str)
X_valid = tfidf.transform(valid_text)
# The metric compares cell ids, so keep the ground-truth id lists as target.
y_valid = df_orders.loc[ids_valid]

# Same extra column as for training: 1-based position among the notebook's
# code cells, 0 for non-code cells.
valid_is_code = df_valid['cell_type'] == 'code'
valid_position = df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
valid_code_order = np.where(valid_is_code, valid_position, 0).reshape(-1, 1)
X_valid = sparse.hstack((X_valid, valid_code_order))

In [31]:
# Predicted score for every validation cell, aligned with df_valid's index.
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
# Order the cells of each notebook by predicted rank.
y_pred = y_pred.sort_values(['id', 'rank'])
# Turn the cell_id index level into a regular column ...
y_pred = y_pred.reset_index('cell_id')
# ... and collect it into one list per notebook, in predicted order.
y_pred = y_pred.groupby('id')['cell_id'].apply(list)
y_pred.head(10)

id
00bde10ee58c1f    [5ef6c342, b81ded66, 2ef26226, ae62aad7, 2ec20...
00f8e8fc4dc5e8    [862a011f, 37fe9480, 1a89dda0, b4725a0e, 6916f...
011330cdca9289    [491dcdae, 85723a78, aba432c8, 9229246b, 2bec9...
0142dfccc7c3a4    [1fb690b7, 11f634e5, 5d65d64b, 0566003f, e2a90...
016fcab7d389b4    [281fa3b1, 13e9ac98, 6aba28fe, 41e2ca2b, 75a37...
01ba87cae4f1a9    [53cb919f, 342db185, 99497160, 2d5858db, 9d531...
01bef95941a730    [0f4bcaf4, 7d4fff5f, 739f47d7, ffb90f0b, 64fb9...
01ff62dc26293c    [b0874275, b22b40eb, 4824b2a5, 91e76bf5, 8d546...
022bdd0da2ce56    [be733fc1, 641cb264, 133ca39b, 53642361, 5bdd0...
028203c05df8f4    [0791a0c5, b3d9ad9d, ec158a7b, 7afcbd8b, f91d3...
Name: cell_id, dtype: object

In [32]:
# Pick one validation notebook (the ninth distinct id) for a visual check.
valid_ids = df_valid.index.get_level_values('id').unique()
nb_id = valid_ids[8]

notebook_cells = df.loc[nb_id]
# Cells as stored ...
display(notebook_cells)
# ... and the same cells rearranged into the predicted order.
display(notebook_cells.loc[y_pred.loc[nb_id]])


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
133ca39b,code,from mpl_toolkits.mplot3d import Axes3D\nfrom ...
53642361,code,"for dirname, _, filenames in os.walk('/kaggle/..."
e791394a,code,# Distribution graphs (histogram/bar graph) of...
ad5cd259,code,# Correlation matrix\ndef plotCorrelationMatri...
cd9b55f4,code,# Scatter and density plots\ndef plotScatterMa...
c6a255c4,code,nRowsRead = 1000 # specify 'None' if want to r...
4febd15b,code,df1.head(5)
3eae2377,code,"plotPerColumnDistribution(df1, 10, 5)"
415b7dd8,markdown,The next hidden code cells define functions fo...
13bb798e,markdown,Now you're ready to read in the data and use t...


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
be733fc1,markdown,## Introduction\nGreetings from the Kaggle bot...
641cb264,markdown,## Exploratory Analysis\nTo begin this explora...
133ca39b,code,from mpl_toolkits.mplot3d import Axes3D\nfrom ...
53642361,code,"for dirname, _, filenames in os.walk('/kaggle/..."
5bdd0e44,markdown,There is 1 csv file in the current version of ...
e791394a,code,# Distribution graphs (histogram/bar graph) of...
ad5cd259,code,# Correlation matrix\ndef plotCorrelationMatri...
415b7dd8,markdown,The next hidden code cells define functions fo...
13bb798e,markdown,Now you're ready to read in the data and use t...
cd9b55f4,code,# Scatter and density plots\ndef plotScatterMa...


In [33]:
def count_inversions_slowly(ranks):
    """Count inversions in `ranks` by checking every pair of positions.

    An inversion is a pair ``i < j`` with ``ranks[i] > ranks[j]``; equal
    values are not counted. Quadratic in ``len(ranks)``.

    Parameters
    ----------
    ranks
        Indexable sequence of mutually comparable values.

    Returns
    -------
    int
        Number of inversions (0 for an empty or single-element sequence).
    """
    inversions = 0
    size = len(ranks)
    for i in range(size):
        for j in range(i+1, size):
            if ranks[i] > ranks[j]:
                # The counter was previously accumulated into (and returned
                # from) an undefined name, which raised on every call.
                inversions += 1
    return inversions

def count_inversions(a):
    """Count inversions in `a` using binary search over a sorted prefix.

    Each value is located in the sorted list of values seen so far; every
    already-seen value that sorts strictly after it forms one inversion.
    Equal values are not counted as inversions.
    """
    seen_sorted = []
    total = 0
    for value in a:
        # Values at or beyond this position are strictly greater than `value`.
        position = bisect(seen_sorted, value)
        total += len(seen_sorted) - position
        seen_sorted.insert(position, value)
    return total

def kendall_tau(ground_truth, predictions):
    """Score predicted orderings against ground-truth orderings.

    `ground_truth` and `predictions` are iterated in parallel; each pair is
    one notebook's true and predicted list of cell ids. The result is
    ``1 - 4 * inversions / sum(n * (n - 1))``, where the inversions are summed
    over all notebooks and ``n`` is the length of each ground-truth list.
    """
    inversions = 0
    pair_budget = 0
    for truth, guess in zip(ground_truth, predictions):
        # Express the predicted order as positions in the true order.
        positions = [truth.index(cell) for cell in guess]
        inversions += count_inversions(positions)
        cell_count = len(truth)
        pair_budget += cell_count * (cell_count - 1)
    return 1 - 4 * inversions / pair_budget

In [34]:
# Baseline: score the cells in the order they are stored in df_valid, i.e.
# without applying any model prediction.
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)

0.42327885114669606

In [35]:
# Score of the model's predicted orderings on the validation notebooks.
kendall_tau(y_valid, y_pred)

0.6150327340854895

In [36]:
# Collect regular files from the test directory; the cap reuses `num_train`.
paths = []
directory = 'test'
# scandir is used as a context manager so the directory handle is released
# even when the loop stops before the iterator is exhausted.
with os.scandir(directory) as entries:
    for file in entries:
        if file.is_file():
            paths.append(file.path)
        if len(paths) == num_train:
            break

# Notebook id = file name without directory or extension. os.path is used so
# the result does not depend on the platform's path separator.
id_names = [os.path.splitext(os.path.basename(path))[0] for path in paths]

# One DataFrame per notebook, each tagged with its notebook id.
test_notebooks = [
    read_notebook(path, id_name) for path, id_name in zip(paths, id_names)
]

print(test_notebooks[0])

         cell_type                                             source  \
cell_id                                                                 
aafc3d23      code  \n# Essential\nimport numpy as np\nimport pand...   
80e077ec      code  train_data = pd.read_csv('../input/titanic/tra...   
b190ebb4      code                              train_data.describe()   
ed415c3c      code  print('Number of rows ',len(train_data))\nprin...   
322850af      code  full_data['FamilyMembers'] = full_data['SibSp'...   
...            ...                                                ...   
d2f722a5  markdown  ## Conclusion\nI tried to do a little bit of e...   
8a0842b8  markdown       Good, now we can look at the updated dataset   
03cb1feb  markdown  To correctly choose the right model for our ta...   
83514fa3  markdown  As we probably expected, `Sex` is the most imp...   
d3f5c397  markdown  We have 177 rows with missing `Age` and 687 ro...   

                      id  
cell_id                

In [37]:
# Stack all test notebooks into one frame keyed by (id, cell_id).
df_test = pd.concat(test_notebooks).set_index('id', append=True).swaplevel()
# Sort by notebook id only; sort_remaining=False leaves cell_id unsorted
# within each notebook.
df_test = df_test.sort_index(level='id', sort_remaining=False)
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0009d135ece78d,ddfd239c,code,import numpy as np # linear algebra\nimport pa...
0009d135ece78d,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-...
0009d135ece78d,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['..."
0009d135ece78d,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name..."
0009d135ece78d,7f388a41,code,# Ploting data with different columns\n#######...


In [38]:
# Test features: TF-IDF with the vocabulary fitted on the training cells ...
test_text = df_test['source'].astype(str)
X_test = tfidf.transform(test_text)

# ... plus the code-order column: 1-based position among the notebook's code
# cells, 0 for non-code cells.
test_is_code = df_test['cell_type'] == 'code'
test_position = df_test.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
test_code_order = np.where(test_is_code, test_position, 0).reshape(-1, 1)
X_test = sparse.hstack((X_test, test_code_order))

In [39]:
# Predicted score for every test cell, aligned with df_test's index.
y_infer = pd.DataFrame({'rank': model.predict(X_test)}, index=df_test.index)
# Order the cells of each notebook by predicted rank, then collect the cell
# ids into one list per notebook.
y_infer = y_infer.sort_values(['id', 'rank'])
y_infer = y_infer.reset_index('cell_id')
y_infer = y_infer.groupby('id')['cell_id'].apply(list)
y_infer

id
0009d135ece78d    [ddfd239c, c6cd22db, 1372ae9b, 7f388a41, 90ed0...
0010483c12ba9b    [54c7cab3, fe66203e, 7844d5f8, 7f270e34, 5ce88...
0010a919d60e4f    [aafc3d23, b7578789, 80e077ec, b190ebb4, ed415...
0028856e09c5b7             [012c9d02, d22526d1, eb293dfc, 3ae7ece3]
Name: cell_id, dtype: object

In [40]:
# Join each notebook's ordered cell ids into one space-separated string.
y_submit = y_infer.apply(' '.join)
# Label the index and the Series with the names used in the output file.
y_submit = y_submit.rename_axis('id').rename('cell_order')
y_submit

id
0009d135ece78d    ddfd239c c6cd22db 1372ae9b 7f388a41 90ed07ab 2...
0010483c12ba9b    54c7cab3 fe66203e 7844d5f8 7f270e34 5ce8863c 4...
0010a919d60e4f    aafc3d23 b7578789 80e077ec b190ebb4 ed415c3c c...
0028856e09c5b7                  012c9d02 d22526d1 eb293dfc 3ae7ece3
Name: cell_order, dtype: object

In [41]:
# Write the predicted orders to disk; the Series carries the index name `id`
# and the name `cell_order`.
y_submit.to_csv('submission.csv')