In [1]:
import json, os, glob, psutil, sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
from tqdm import notebook
notebook.tqdm.pandas()
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

import pairwise as ssp

import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import GroupShuffleSplit


from pyxtension.streams import stream
import swifter

from copy import deepcopy

# Report the CUDA / cuDNN versions PyTorch was built against and whether a GPU is usable.
print('__CUDA VERSION:', torch.version.cuda)
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # empty_cache() returns None; call it for its side effect only instead of
    # printing a stray "None" into the cell output.
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

__CUDA VERSION: 11.5
__CUDNN VERSION: 8302
True
None
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|----------

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# --- Reproducibility and parallelism -----------------------------------------
RANDOM_SEED = 42
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined; fall back to the logical count, then to a single worker.
PROCESSORS_COUNT = psutil.cpu_count(logical=False) or os.cpu_count() or 1
np.random.seed(RANDOM_SEED)
# Seed torch as well: the model trained later in this notebook is a torch model.
torch.manual_seed(RANDOM_SEED)


# --- Input / output locations (all relative to data_dir) ---------------------
data_dir = Path('.')
train_dir = os.path.join(data_dir, 'train')
models_dir = os.path.join(data_dir, 'pt_models')
orders_path = os.path.join(data_dir, 'train_orders.csv')
ancestors_path = os.path.join(data_dir, 'train_ancestors.csv')


# --- Sample size -------------------------------------------------------------
# Number of notebook JSON files available, and the ~1% of them used in this run.
count = sum(1 for _ in glob.iglob(os.path.join(train_dir, '*.json')))
NUM_TRAIN = int(count * 0.01) + 1
# NUM_TRAIN = 100
# Hundreds of notebooks per run, scaled by the worker count. Integer division
# replaces int(str(NUM_TRAIN)[:-2]), which raised ValueError for NUM_TRAIN < 100.
MULTI = PROCESSORS_COUNT * (NUM_TRAIN // 100)

# "\033[94m" is the ANSI escape for bright-blue text.
print("\033[94mNumber of notebooks present in train set  = ", count)
print("\033[94mNumber of notebooks contribute in training = ", NUM_TRAIN)

[94mNumber of notebooks present in train set  =  139256
[94mNumber of notebooks contribute in training =  1393


In [3]:
# Load NUM_TRAIN notebooks. Per the recorded output, the resulting frame is
# indexed by (id, cell_id) with `cell_type` and `source` columns.
df = ssp.read_all_notebooks_(train_dir, NUM_TRAIN, PROCESSORS_COUNT)
display(df.head())

print('-' * 125)

# Ground-truth ordering: one row per notebook id. The whitespace-separated
# cell ids are split into a list, then the single-column frame is squeezed.
df_orders = (
    pd.read_csv(orders_path, index_col='id')
    .assign(cell_order=lambda orders: orders['cell_order'].str.split())
    .squeeze(axis=1)
)

# Attach each cell's integer rank as computed by the project helper.
cell_ranks = ssp.build_ranks_(df_orders, df, PROCESSORS_COUNT)
df = df.join(cell_ranks)

# Attach ancestor_id / parent_id; this flattens the (id, cell_id) index into columns.
ancestors = pd.read_csv(ancestors_path, index_col='id')
df = df.reset_index().merge(ancestors, on=["id"])

# Turn integer ranks into a fraction of the notebook's cell count.
cells_per_notebook = df.groupby("id")["cell_id"].transform("count")
df["rank"] = df["rank"] / cells_per_notebook

display(df.head())

Train NBs: 100%|██████████| 1393/1393 [00:08<00:00, 159.90it/s]


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 64557 entries, ('00001756c60be8', '1862f0a6') to ('02981070f7299c', 'a36f93a7')
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   cell_type  64557 non-null  category
 1   source     64557 non-null  object  
dtypes: category(1), object(1)
memory usage: 3.5+ MB
None


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros..."
00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore')
00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14})
00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\..."


-----------------------------------------------------------------------------------------------------------------------------


100%|██████████| 1393/1393 [00:00<00:00, 49834.66it/s]


Pandas Apply:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...,0.0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros...",0.034483,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),0.068966,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),0.103448,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",0.137931,945aea18,


In [4]:
# Lookup table cell_id -> source text, handed to the datasets further down.
# NOTE(review): this snapshot is taken BEFORE the preprocessing below, so it
# holds the raw text, and it is keyed by cell_id alone (not by notebook id).
# Confirm both are intended.
dict_cellid_source = dict(zip(df['cell_id'].values, df['source'].values))
MKDN = 'markdown'
CODE = 'code'

# Markdown cells get the markdown cleaner; every other cell gets the code cleaner.
is_markdown = df.cell_type == MKDN
df.loc[is_markdown, 'source'] = (
    df.loc[is_markdown, 'source'].swifter.apply(ssp.markdown_lines_preprocess)
)
df.loc[~is_markdown, 'source'] = (
    df.loc[~is_markdown, 'source'].swifter.apply(ssp.code_lines_preprocess)
)
df.head()

Pandas Apply:   0%|          | 0/21989 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/42568 [00:00<?, ?it/s]

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\nfrom sklearn.model_selection import train_test_split, cross_...",0.034483,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),0.068966,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),0.103448,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",0.137931,945aea18,


In [5]:
# --- Data split --------------------------------------------------------------
# Used as the validation share (`test_size`) of the grouped split.
VALIDATION_RATIO = 0.15

# --- Sequence lengths (passed to the dataset as max_len / total_max_len) -----
MAX_LENGTH = 196
TOTAL_MAX_LEN = 412

# --- Training loop -----------------------------------------------------------
BATCH_SIZE = 8  # the validation loader uses 4x this value
EPOCHS = 1

# Columns named as not being model inputs; not referenced by the visible cells.
MODEL_USELESS = ['id', 'cell_type', 'ancestor_id', 'parent_id']

In [6]:
# GroupShuffleSplit is imported in the notebook's first cell.
splitter = GroupShuffleSplit(n_splits=1, test_size=VALIDATION_RATIO, random_state=RANDOM_SEED)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids_train, ids_valid = next(splitter.split(df, groups=df["ancestor_id"]))
print(ids_train[:5], ids_valid[:5])

# split() yields POSITIONAL row indices, so select with .iloc. Label-based .loc
# only gave the same rows because df happened to carry a fresh RangeIndex.
df_train = df.iloc[ids_train].reset_index(drop=True)
df_valid = df.iloc[ids_valid].reset_index(drop=True)
display(df_train.head(2))
display(df_valid.head(2))

[0 1 2 3 4] [402 403 404 405 406]


Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\nfrom sklearn.model_selection import train_test_split, cross_...",0.034483,945aea18,


Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00035108e64677,2fa1f27b,code,# Basic Libraries\nimport numpy as np\nimport pandas as pd\nimport seaborn as sb\nimport matplotlib.pyplot as plt #...,0.029412,a41da3f9,
1,00035108e64677,f3c2de19,code,"# import test and train file\neverything = pd.read_json(""../input/whats-cooking/train.json"")\ntest = pd.read_json(""....",0.088235,a41da3f9,


In [7]:
def _split_by_notebook(frame, n_chunks):
    """Split `frame` into at most `n_chunks` pieces without cutting a notebook in two.

    Chunking is done over the unique notebook ids (in order of first
    appearance), so every row sharing an `id` lands in the same piece and the
    original row order is kept inside each piece. Empty pieces are dropped.
    """
    notebook_ids = frame['id'].unique()
    id_chunks = np.array_split(notebook_ids, max(1, n_chunks))
    return [frame[frame['id'].isin(chunk)] for chunk in id_chunks if len(chunk)]


# Build the training / validation triplets in parallel, one chunk per worker.
# Chunks are cut on notebook boundaries: a plain row-wise np.array_split could
# hand the two halves of one notebook to different workers, while the scoring
# cell later slices predictions as N_markdown * N_code per whole notebook.
# NOTE(review): assumes the ssp generators work per notebook id; confirm.
triplets = []
val_triplets = []

for chunk_triplets in list(
    stream(_split_by_notebook(df_train, PROCESSORS_COUNT)).mpmap(ssp.generate_training_triplet)
):
    triplets.extend(chunk_triplets)

for chunk_triplets in list(
    stream(_split_by_notebook(df_valid, PROCESSORS_COUNT)).mpmap(ssp.generate_testing_triplet)
):
    val_triplets.extend(chunk_triplets)

print(len(triplets), len(val_triplets))

89368 17111


In [8]:
# raise

In [9]:
# Pretrained encoder checkpoint. Alternatives that were tried:
#   "microsoft/codebert-base", "microsoft/graphcodebert-base"
BERT_MODEL_NAME = 'distilbert-base-uncased'

# Optimizer choice ('adam' is the alternative).
# NOTE(review): OPTIMIZER is not passed to anything in the visible cells.
OPTIMIZER = 'nadam'

# Keyword arguments shared by the training and validation datasets.
dataset_kwargs = dict(
    source_dict=dict_cellid_source,
    bert_model_name=BERT_MODEL_NAME,
    max_len=MAX_LENGTH,
    total_max_len=TOTAL_MAX_LEN,
)
train_ds = ssp.BDataset(df=triplets, **dataset_kwargs)
val_ds = ssp.BDataset(df=val_triplets, **dataset_kwargs)

# Keyword arguments shared by both loaders.
loader_kwargs = dict(num_workers=PROCESSORS_COUNT, pin_memory=False)

# Training: shuffled, incomplete last batch dropped.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    **loader_kwargs,
)
# Validation: fixed order, 4x batch size, every sample kept.
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE * 4,
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

In [10]:
# Make sure the checkpoint directory exists before the long training run starts;
# it is never created elsewhere in this notebook.
# NOTE(review): presumably ssp.train writes its checkpoint to `path`; confirm.
os.makedirs(models_dir, exist_ok=True)

# Build the model on the GPU (requires CUDA) and train it. ssp.train returns
# the model together with `ypred`, which the scoring cell consumes.
model = ssp.BModel(BERT_MODEL_NAME).cuda()
model, ypred = ssp.train(
    model,
    train_loader,
    val_loader,
    epochs=EPOCHS,
    path=os.path.join(models_dir, 'single_bert_checkpoint.pt')
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1 Loss: 0.0560025 lr: 1e-05:   1%|          | 105/11171 [01:05<1:55:53,  1.59it/s]



KeyboardInterrupt



In [None]:
# Positional counter per (notebook, cell type); kept for inspection, not read below.
df_valid["ranks"] = df_valid.groupby(["id", "cell_type"]).cumcount()
# Baseline position: each cell's rank among cells of the same type in its
# notebook. Code cells keep this value; markdown cells are overwritten below.
df_valid["pred"] = df_valid.groupby(["id", "cell_type"])["rank"].rank(pct=False)

# Multiplier applied to the scores before the softmax (larger = peakier weights).
SOFTMAX_SHARPNESS = 20

pred_vals = []
pred_index = []  # df_valid index label of the markdown cell behind each pred_vals entry
offset = 0       # read position inside ypred
# NOTE(review): assumes ypred is a 1-D array laid out notebook by notebook in
# this groupby order, with N_code consecutive scores per markdown cell; confirm
# against ssp.generate_testing_triplet / ssp.train.
for notebook_id, df_tmp in tqdm(df_valid.groupby('id')):
    is_mark = df_tmp['cell_type'] == 'markdown'
    df_tmp_mark = df_tmp[is_mark]
    df_tmp_code = df_tmp[~is_mark]
    df_tmp_code_rank = df_tmp_code['rank'].rank().values
    N_code = len(df_tmp_code_rank)
    N_mark = len(df_tmp_mark)

    preds_tmp = ypred[offset : offset + N_mark * N_code]
    offset += N_mark * N_code

    for i in range(N_mark):
        pred = np.asarray(preds_tmp[i * N_code : (i + 1) * N_code])
        if pred.size == 0:
            # No code cells (or predictions exhausted): same value the
            # original empty-array arithmetic produced.
            rank = 0.0
        else:
            # Softmax over this markdown cell's scores against every code cell.
            # Subtracting the max (not the mean) gives the same weights but
            # cannot overflow exp() into inf / NaN.
            weights = np.exp((pred - np.max(pred)) * SOFTMAX_SHARPNESS)
            weights = weights / np.sum(weights)
            # Expected code-cell rank under those weights.
            rank = np.sum(weights * df_tmp_code_rank[:len(pred)])
        pred_vals.append(rank)
        pred_index.append(df_tmp_mark.index[i])

if offset != len(ypred):
    warnings.warn(
        f"Consumed {offset} predictions but ypred holds {len(ypred)}; "
        "markdown positions may be misaligned with their notebooks."
    )

print(len(df_valid.loc[df_valid["cell_type"] == "markdown", "pred"]), len(pred_vals))
# Write back by index label so each value reaches the cell it was computed
# for, whatever the row order of df_valid relative to the sorted groupby keys.
if pred_index:
    df_valid.loc[pred_index, "pred"] = pred_vals
y_dummy = df_valid.sort_values("pred").groupby('id')['cell_id'].apply(list)
print('Final accuracy for code is:', ssp.kendall_tau(df_orders.loc[y_dummy.index], y_dummy))