In [1]:
import json, os, glob, psutil, sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
from tqdm import notebook
# Register tqdm's pandas integration for both the notebook and the plain-text flavour.
# NOTE(review): both calls presumably patch the same pandas methods, in which case only
# the second (plain-text) registration stays in effect — confirm which one is wanted.
notebook.tqdm.pandas()
tqdm.pandas()

import warnings
# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Widen pandas' text rendering so long string columns stay readable in cell output.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import GroupShuffleSplit

from pyxtension.streams import stream
import swifter

from copy import deepcopy

import nltk
# Fetch the NLTK packages 'wordnet' and 'omw-1.4' on a best-effort basis.
# Each package is attempted independently, so a failure on one no longer skips the other.
# nltk.download() may report a failure by returning a falsy value instead of raising,
# so both the return value and unexpected exceptions are checked. A failure is only
# reported; it deliberately does not stop the notebook.
for _nltk_package in ('wordnet', 'omw-1.4'):
    try:
        _nltk_ok = nltk.download(_nltk_package)
    except Exception as exc:  # not a bare `except:` so KeyboardInterrupt/SystemExit propagate
        print(f"Couldn't download required text data '{_nltk_package}': {exc}")
    else:
        if not _nltk_ok:
            print(f"Couldn't download required text data '{_nltk_package}'")

# Report the CUDA / cuDNN versions PyTorch was built against and whether a GPU is usable.
print('__CUDA VERSION:', torch.version.cuda)
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # empty_cache() returns None, so call it for its effect instead of printing "None".
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
    


import strongercodebase_v2 as ssp

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yazee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


__CUDA VERSION: 11.5
__CUDNN VERSION: 8302
True
None
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|----------

In [2]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Seed PyTorch as well: it drives DataLoader shuffling and weight initialisation.
torch.manual_seed(RANDOM_SEED)

# psutil returns None when the physical core count cannot be determined; fall back to
# the logical count (and finally 1) so later arithmetic and worker counts never see None.
PROCESSORS_COUNT = psutil.cpu_count(logical=False) or os.cpu_count() or 1

# --- Paths (all relative to data_dir) ----------------------------------------
data_dir = Path('.')
train_dir = os.path.join(data_dir, 'train')
models_dir = os.path.join(data_dir, 'pt_models')
orders_path = os.path.join(data_dir, 'train_orders.csv')
ancestors_path = os.path.join(data_dir, 'train_ancestors.csv')

# Cell-type labels.
MKDN = 'markdown'
CODE = 'code'

# --- Training subset size ----------------------------------------------------
# Number of notebook JSON files available under train_dir.
count = len(list(glob.iglob(os.path.join(train_dir, '*.json'))))
# Fixed-size subset for quick experiments. To train on a fraction of the full set
# instead, use e.g. `NUM_TRAIN = int(count * 0.15) + 1`.
NUM_TRAIN = 1000
# PROCESSORS_COUNT times the number of whole hundreds in NUM_TRAIN. Integer division
# replaces the former `int(str(NUM_TRAIN)[:-2])`, which gave the same value for
# NUM_TRAIN >= 100 but raised ValueError below 100.
MULTI = PROCESSORS_COUNT * (NUM_TRAIN // 100)

print("\033[94mNumber of notebooks present in train set  = ", count)
print("\033[94mNumber of notebooks contribute in training = ", NUM_TRAIN)

[94mNumber of notebooks present in train set  =  139256
[94mNumber of notebooks contribute in training =  1000


In [3]:
# Load the first NUM_TRAIN notebooks into one frame.
df = ssp.read_all_notebooks_(train_dir, NUM_TRAIN, PROCESSORS_COUNT)
display(df.head())
print('-' * 125)

# Ground-truth ordering: one row per notebook id, with the space-separated
# `cell_order` string turned into a list of cell ids.
df_orders = (
    pd.read_csv(orders_path, index_col='id')
    .assign(cell_order=lambda orders: orders['cell_order'].str.split())
    .squeeze(axis=1)
)

# Attach the integer ranks computed by ssp.build_ranks_.
cell_ranks = ssp.build_ranks_(df_orders, df, PROCESSORS_COUNT)
df = df.join(cell_ranks)

# Attach ancestor information, matching on notebook id.
df_ancestors = pd.read_csv(ancestors_path, index_col='id')
df = df.reset_index().merge(df_ancestors, on=["id"])

# Turn integer ranks into a fraction of the notebook's cell count.
cells_per_notebook = df.groupby("id")["cell_id"].transform("count")
df["rank"] = df["rank"].div(cells_per_notebook)
display(df.head())

# Run the two ssp feature stages in turn, each over PROCESSORS_COUNT row-chunks.
# NOTE(review): np.array_split cuts by row position, so one notebook's cells can land
# in different chunks — confirm the ssp helpers do not need whole notebooks per chunk.
for feature_stage in (ssp.add_style_specific_counts, ssp.get_features):
    df = pd.concat(stream(np.array_split(df, PROCESSORS_COUNT)).mpmap(feature_stage))
df = df.reset_index(drop=True)

display(df.head())

Train NBs: 100%|██████████| 1000/1000 [00:06<00:00, 158.67it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros..."
00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore')
00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14})
00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\..."


-----------------------------------------------------------------------------------------------------------------------------


100%|██████████| 1000/1000 [00:00<00:00, 50010.78it/s]


Pandas Apply:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...,0.0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros...",0.034483,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),0.068966,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),0.103448,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",0.137931,945aea18,


Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,n_code_cells,n_markdown_cells,codes
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.0,945aea18,,30,28,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...
1,000757b90aaca0,1c301fa2,markdown,"# analys text similar spacy, networkx notebook demonstr way spaci conduct rapid themat analysi small corpu comments,...",0.0,ff3e6f37,,22,20,import pandas as pd\nimport spacy\nimport networkx as nx # a really useful network analysis ...
2,0007f21ee357b5,e68e6c44,code,import numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.naiv...,0.0,5d4bdf92,,79,9,import numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.naiv...
3,002bcc9e2f9077,c5c6a0a5,markdown,<h1>introduction</h1> notebook cover data stich text similar techniques. work databas multipl tables. common `id` `k...,0.0,f0446ffd,,18,9,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...
4,002bbeec78c962,0329b907,markdown,### import packages:,0.0,f31585a0,,15,10,"import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimpor..."


In [4]:
VALIDATION_RATIO = 0.15
# Columns dropped from the train/validation frames ('id' and 'cell_type' are kept).
MODEL_USELESS = ['ancestor_id', 'parent_id']

# GroupShuffleSplit is already imported in the first cell.
splitter = GroupShuffleSplit(n_splits=1, test_size=VALIDATION_RATIO, random_state=RANDOM_SEED)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids_train, ids_valid = next(splitter.split(df, groups=df["ancestor_id"]))
print(ids_train[:5], ids_valid[:5])

# split() yields positional row numbers, so select with .iloc; .loc would only be
# correct while df happens to carry a fresh 0..n-1 index.
df_train = df.iloc[ids_train].reset_index(drop=True).drop(columns=MODEL_USELESS)
df_valid = df.iloc[ids_valid].reset_index(drop=True).drop(columns=MODEL_USELESS)
# Every row must end up in exactly one of the two partitions.
assert len(df_train) + len(df_valid) == len(df), "train/validation split lost or duplicated rows"
display(df_train.head(2))
display(df_valid.head(2))

[0 2 3 4 5] [ 1 10 17 26 35]


Unnamed: 0,id,cell_id,cell_type,source,rank,n_code_cells,n_markdown_cells,codes
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...,0.0,30,28,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/p...
1,0007f21ee357b5,e68e6c44,code,import numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.naiv...,0.0,79,9,import numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.naiv...


Unnamed: 0,id,cell_id,cell_type,source,rank,n_code_cells,n_markdown_cells,codes
0,000757b90aaca0,1c301fa2,markdown,"# analys text similar spacy, networkx notebook demonstr way spaci conduct rapid themat analysi small corpu comments,...",0.0,22,20,import pandas as pd\nimport spacy\nimport networkx as nx # a really useful network analysis ...
1,0029a37d79568a,5ae20870,markdown,notebook' train loop minor issu loss function fork 200+ times.<br/> https://www.kaggle.com/maunish/clrp-pytorch-robe...,0.0,10,1,!pip install accelerateSEPERATOR_STRIG_TAGimport os\nimport gc\nimport sys\nimport math\nimport time\nimport tqdm\ni...


In [5]:
# Pretrained checkpoint name. Commented-out alternatives:
#   "microsoft/codebert-base"
#   "microsoft/graphcodebert-base"
BERT_MODEL_NAME = 'distilbert-base-uncased'

# Length limits, passed to the dataset as `max_len` and `total_max_len`.
MAX_LENGTH, TOTAL_MAX_LEN = 256, 512

# Number of training epochs (5 is the other value noted for this setting).
EPOCHS = 1

# Optimizer name ('adam' is the commented-out alternative).
# NOTE(review): not referenced by any visible cell — confirm where it is consumed.
OPTIMIZER = 'nadam'

In [6]:
# Fixed mini-batch size used by both DataLoaders.
# (A disabled experiment that derived the batch size from a divisor of len(df_train)
# used to live here as commented-out code; it was dead and has been removed.)
BATCH_SIZE = 16
BATCH_SIZE

16

In [7]:
# Both splits are wrapped with identical dataset settings.
dataset_kwargs = dict(
    bert_model_name=BERT_MODEL_NAME,
    max_len=MAX_LENGTH,
    total_max_len=TOTAL_MAX_LEN,
)
train_ds = ssp.BDataset(df_train, **dataset_kwargs)
val_ds = ssp.BDataset(df_valid, **dataset_kwargs)
print(len(train_ds), len(val_ds))

# Loader settings shared by both splits; only shuffling and last-batch handling differ.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=PROCESSORS_COUNT,
    pin_memory=False,
)
# Training: reshuffled batches, incomplete final batch discarded.
train_loader = DataLoader(train_ds, shuffle=True, drop_last=True, **loader_kwargs)
# Validation: original row order, every row kept.
val_loader = DataLoader(val_ds, shuffle=False, drop_last=False, **loader_kwargs)

print(len(train_loader), len(val_loader))

39805 6709
2487 420


In [8]:
# Make sure the directory of the `path` handed to ssp.train exists.
# NOTE(review): presumably ssp.train writes its checkpoint there — confirm.
os.makedirs(models_dir, exist_ok=True)

# If this step fails with "CUDA out of memory", lower BATCH_SIZE or TOTAL_MAX_LEN.
model = ssp.BModel(BERT_MODEL_NAME).cuda()
model, ypred = ssp.train(
    model,
    train_loader,
    val_loader,
    epochs=EPOCHS,
    path=os.path.join(models_dir, 'single_bert_checkpoint.pt'),
    accumulation_steps=2,
)

# val_loader is unshuffled and keeps its last partial batch, so the predictions are
# expected to line up one-to-one with df_valid's rows; fail loudly if they do not.
assert len(ypred) == len(df_valid), (
    f"got {len(ypred)} predictions for {len(df_valid)} validation rows"
)
# The former placeholder that filled "pred" from the true ranks was overwritten on the
# very next line, so it has been dropped.
df_valid["pred"] = ypred
# Predicted cell order per notebook: cells sorted by predicted rank.
y_dummy = df_valid.sort_values("pred").groupby('id')['cell_id'].apply(list)
print('Final accuracy for code is:', ssp.kendall_tau(df_orders.loc[y_dummy.index], y_dummy))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1 Loss: 0.698312 lr:0.001 :   0%|          | 1/2487 [00:36<25:01:53, 36.25s/it]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 8.00 GiB total capacity; 4.63 GiB already allocated; 1.71 GiB free; 4.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
# sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
# sub_df.head()