In [1]:
import json, os, glob, psutil, sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
from tqdm import notebook
notebook.tqdm.pandas()
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import GroupShuffleSplit

from pyxtension.streams import stream
import swifter

from copy import deepcopy

import nltk
try:
    nltk.download('wordnet')
    nltk.download('omw-1.4')
except:
    print("Could't donlowd requaired text data")

print('__CUDA VERSION:', torch.version.cuda)
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.empty_cache())
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
    


import markdown_only as mol
#import pt_model as m
#import strongercodebase_v2 as ssp

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


__CUDA VERSION: 11.5
__CUDNN VERSION: 8302
True
None
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|----------

In [2]:
LOAD_NUM = 100
RANDOM_SEED = 42
PROCESSORS_COUNT = psutil.cpu_count(logical=False)
MULTI = PROCESSORS_COUNT * int(str(LOAD_NUM) [:-2])

np.random.seed(RANDOM_SEED)

data_dir = Path('.')
train_dir = os.path.join(data_dir, 'train')

models_dir = os.path.join(data_dir, 'pt_models')
orders_path = os.path.join(data_dir, 'train_orders.csv')
ancestors_path = os.path.join(data_dir, 'train_ancestors.csv')



# shutil.rmtree(models_dir)
# if not os.path.exists(models_dir):
#     os.mkdir(models_dir)
count = len(list(glob.iglob(os.path.join(train_dir, '*.json'))))
# LOAD_NUM = int(count * 0.1) + 1
print(f"\033[94mNumber of notebooks present in train set  = ", count)
print(f"\033[94mNumber of notebooks contribute in training = ", LOAD_NUM)

[94mNumber of notebooks present in train set  =  139256
[94mNumber of notebooks contribute in training =  100


In [3]:
df = mol.read_all_notebooks_(train_dir, LOAD_NUM, PROCESSORS_COUNT)

df = pd.concat(stream(np.array_split(df, PROCESSORS_COUNT)).mpmap(mol.extract_features))

display(df.head())

print('-' * 125)

# Read Ordering data
df_orders = pd.read_csv(
    orders_path,
    index_col='id',
)
df_orders['cell_order'] = df_orders['cell_order'].str.split()  # Split the string representation of cell_ids into a list
df_orders = df_orders.squeeze(axis=1)


# build ranks as integers 
df = df.join(mol.build_ranks_(df_orders, df, PROCESSORS_COUNT))


# Read Ancestors data
df = df.reset_index().merge(pd.read_csv(ancestors_path,  index_col='id'), on=["id"])

# convert integer ranks to percentages 
df["rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")

display(df.head())

Train NBs: 100%|██████████| 100/100 [00:00<00:00, 20056.92it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source,n_code_cells,n_markdown_cells,words_count,letters_count,empty_lines_count,comment_lines_count,full_lines_count,text_lines_count,tag_lines_count
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.517241,0.482759,140,930,0.235294,0.411765,0.352941,0.0,0.0
00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\nfrom sklearn.model_selection import train_test_split, cross_...",0.517241,0.482759,55,498,0.176471,0.0,0.823529,0.0,0.0
00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),0.517241,0.482759,3,49,0.0,0.0,1.0,0.0,0.0
00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),0.517241,0.482759,2,45,0.0,0.0,1.0,0.0,0.0
00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",0.517241,0.482759,39,694,0.052632,0.0,0.947368,0.0,0.0


-----------------------------------------------------------------------------------------------------------------------------


100%|██████████| 100/100 [00:00<00:00, 49431.99it/s]


Pandas Apply:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,cell_id,cell_type,source,n_code_cells,n_markdown_cells,words_count,letters_count,empty_lines_count,comment_lines_count,full_lines_count,text_lines_count,tag_lines_count,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.517241,0.482759,140,930,0.235294,0.411765,0.352941,0.0,0.0,0.0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\nfrom sklearn.model_selection import train_test_split, cross_...",0.517241,0.482759,55,498,0.176471,0.0,0.823529,0.0,0.0,0.034483,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),0.517241,0.482759,3,49,0.0,0.0,1.0,0.0,0.0,0.068966,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),0.517241,0.482759,2,45,0.0,0.0,1.0,0.0,0.0,0.103448,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",0.517241,0.482759,39,694,0.052632,0.0,0.947368,0.0,0.0,0.137931,945aea18,


In [4]:
raise

RuntimeError: No active exception to reraise

In [6]:
# for percentages 
INTREST_PERCENT = 0.96
CODE_TYPE = 'code'
MKDN_TYPE = 'markdown'
VALIDATION_RATIO = 0.15

NOT_GENERATED_COLUMNS = ['id', 'cell_id', 'source', 'cell_type', 'rank', 'ancestor_id', 'parent_id', ]
MODEL_USELESS = ['id', 'cell_id', 'cell_type', 'ancestor_id', 'parent_id', ]
GENERATED_COLUMNS_COUNT = len(df.drop(['id', 'cell_id', 'source', 'cell_type', 'rank', 'ancestor_id', 'parent_id', ], axis=1).columns)





# BERT_MODEL_NAME = "microsoft/codebert-base"
BERT_MODEL_NAME = 'distilbert-base-uncased'
# BERT_MODEL_NAME = "microsoft/graphcodebert-base"


# OPTIMIZER = 'adam'
OPTIMIZER = 'nadam'


# model run
MAX_LENGTH = int(df[df.cell_type == MKDN_TYPE].words_count.quantile(INTREST_PERCENT)) 
# MAX_LENGTH = 128
BERT_OUTPUT_FEATURES = MAX_LENGTH + GENERATED_COLUMNS_COUNT

print("Words counts to cover {percent} is: {count}".format(percent=INTREST_PERCENT, count=MAX_LENGTH))
print(f'Total number of fetures output from bert: {BERT_OUTPUT_FEATURES}')
BATCH_SIZE = 16
EPOCHS = 1
TOTAL_MAX_LEN = 512
ACCUMULATION_SETPS = 3

Words counts to cover 0.96 is: 129
Total number of fetures output from bert: 138


In [15]:
splitter = GroupShuffleSplit(n_splits=1, test_size=VALIDATION_RATIO, random_state=RANDOM_SEED)

def extract_items(ids, data, cell_type):
    tmp = data.loc[ids, :].reset_index(drop=True)
    return tmp[tmp.cell_type == cell_type]


# Split, keeping notebooks with a common origin (ancestor_id) together
ids_train, ids_valid = next(splitter.split(df, groups=df["ancestor_id"]))
print(ids_train[:5], ids_valid[:5])

[0 1 2 3 4] [151 152 153 154 155]


In [None]:
# raise

In [17]:
# extract markdown cells for each notebook
df_train = df.loc[ids_train, :].reset_index(drop=True)
df_valid =  df.loc[ids_valid, :].reset_index(drop=True)
# print(mkdn_df_train[:5], mkdn_df_valid[:5])


# build markdown Dataset
train_ds = mol.BDataset(
    df_train[df_train.cell_type == MKDN_TYPE], 
    max_len=MAX_LENGTH, 
    bert_model_name=BERT_MODEL_NAME, 
    total_max_len=TOTAL_MAX_LEN, 
    drop=MODEL_USELESS,
)
val_ds = mol.BDataset(
    df_valid[df_valid.cell_type == MKDN_TYPE], 
    max_len=MAX_LENGTH, 
    bert_model_name=BERT_MODEL_NAME, 
    total_max_len=TOTAL_MAX_LEN,  
    drop=MODEL_USELESS,
)
# print(mkdn_train_ds[0], mkdn_val_ds[0])

# build markdown DataLoader
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=PROCESSORS_COUNT, pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=PROCESSORS_COUNT, pin_memory=False, drop_last=False)
# print(mkdn_train_loader, mkdn_val_loader)

In [21]:
########################################################################################################################
model = mol.BModel(
    BERT_MODEL_NAME, 
    GENERATED_COLUMNS_COUNT,
    # catch_path=models_dir,
).cuda()

model, y_pred = mol.train(
    model, 
    train_loader, 
    val_loader, 
    epochs=EPOCHS, 
    accumulation_steps=ACCUMULATION_SETPS, 
    model_name=BERT_MODEL_NAME,
    opt='nadam', 
    path=os.path.join(models_dir, 'markdown_bert_checkpoint.pt')
)

df_valid["pred"] = df_train.groupby(["id", "cell_type"])["rank"].rank(pct=True)
df_valid.loc[df_valid.cell_type == MKDN_TYPE, "pred"] = y_pred

########################################################################################################################

y_dummy = df_valid.sort_values("pred").groupby('id')['cell_id'].apply(list)
print('Final accuracy for markdown is:', mol.kendall_tau(df_orders.loc[y_dummy.index], y_dummy))

# # best_model_state = deepcopy(mkdn_model.state_dict())
# # t.save(best_model_state, f'./pt_models/markdown_model_state_dict.pt')
# # t.save(mkdn_model, f'./pt_models/markdown_model.pt')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1 Loss: 6.078826 lr:0.001 : 100%|██████████| 67/67 [00:47<00:00,  1.40it/s] 
100%|██████████| 19/19 [00:34<00:00,  1.82s/it]
Validation MAE: 1.10554

Final accuracy for markdown is: -0.06248210999795534


In [None]:
raise

In [None]:
CODE_TYPE = 'code'
MKDN_TYPE = 'markdown'

BERT_MODEL_NAME = 'distilbert-base-uncased'
PROCESSORS_COUNT = psutil.cpu_count(logical=False)

BATCH_SIZE = 8
TOTAL_MAX_LEN = 256




code_df_valid['rank'] = m.predict(
    model_path=BERT_MODEL_NAME,
    check_point=os.path.join(models_dir, 'code_bert_checkpoint.pt'), 
    batch_size=BATCH_SIZE, 
    num_workers=PROCESSORS_COUNT, 
    max_len=CD_MAX_LENGTH, 
    generated_columns_count=GENERATED_COLUMNS_COUNT,
    total_max_len=TOTAL_MAX_LEN, 
    data=code_df_valid, 
    drop=MODEL_USELESS,
)

mkdn_df_valid['rank'] = m.predict(
    model_path=BERT_MODEL_NAME,
    check_point=os.path.join(models_dir, 'markdown_bert_checkpoint.pt'), 
    batch_size=BATCH_SIZE, 
    num_workers=PROCESSORS_COUNT, 
    max_len=MK_MAX_LENGTH,
    generated_columns_count=GENERATED_COLUMNS_COUNT,
    total_max_len=TOTAL_MAX_LEN, 
    data=mkdn_df_valid, 
    drop=MODEL_USELESS
)

vres_df = pd.concat([mkdn_df_valid, code_df_valid], ignore_index=True, ).sort_values("rank").groupby("id")["cell_id"].apply(list)
print('Final total accuracy is:', r.kendall_tau(df_orders.loc[vres_df.index], vres_df))

display(vres_df.head())

In [None]:
raise

In [None]:
test_dir = os.path.join(data_dir, 'test')
print(f"\033[94mNumber of notebooks present in test set  = ", len(list(glob.iglob(os.path.join(test_dir, '*.json')))))

df_test = r.read_all_notebooks_(test_dir, 4, 2, desc="Tests NBs")

df_test = r.extract_features(df_test).reset_index()
df_test['rank'] = 0

MODEL_USELESS = ['id', 'cell_id', 'cell_type', ]

CODE_TYPE = 'code'
MKDN_TYPE = 'markdown'

BERT_MODEL_NAME = 'distilbert-base-uncased'
PROCESSORS_COUNT = psutil.cpu_count(logical=False)

BATCH_SIZE = 8
TOTAL_MAX_LEN = 256

df_test.loc[df_test.cell_type == CODE_TYPE, 'rank'] = m.predict(
    model_path=BERT_MODEL_NAME,
    check_point=os.path.join(models_dir, 'code_bert_checkpoint.pt'), 
    batch_size=BATCH_SIZE, 
    num_workers=PROCESSORS_COUNT, 
    max_len=CD_MAX_LENGTH, 
    generated_columns_count=GENERATED_COLUMNS_COUNT,
    total_max_len=TOTAL_MAX_LEN, 
    data=df_test[df_test.cell_type == CODE_TYPE], 
    drop=MODEL_USELESS
)

df_test.loc[df_test.cell_type == MKDN_TYPE, 'rank'] = m.predict(
    model_path=BERT_MODEL_NAME,
    check_point=os.path.join(models_dir, 'markdown_bert_checkpoint.pt'), 
    batch_size=BATCH_SIZE, 
    num_workers=PROCESSORS_COUNT, 
    max_len=MK_MAX_LENGTH, 
    generated_columns_count=GENERATED_COLUMNS_COUNT,
    total_max_len=TOTAL_MAX_LEN, 
    data=df_test[df_test.cell_type == MKDN_TYPE], 
    drop=MODEL_USELESS
)

df_test = df_test.sort_values("rank").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
df_test.rename(columns={"cell_id": "cell_order"}, inplace=True)
display(df_test.head())

df_test.to_csv("submission.csv", index=False)