In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

Fri Jul 22 13:22:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from zipfile import ZipFile
with ZipFile('/content/drive/MyDrive/GoogleA14/AI4Code (1).zip') as z:
  z.extractall()

In [None]:
# !gdown --id '1ehzzD7WAvSwd_mZxawsz9IYzGxR66oA1&export=download' --output GoogleA14.zip

# !unzip -o GoogleA14.zip

# !nvidia-smi

In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('/content')

In [5]:
import random
import torch
import os 
def same_seeds(seed):
	  torch.manual_seed(seed)
	  if torch.cuda.is_available():
		    torch.cuda.manual_seed(seed)
		    torch.cuda.manual_seed_all(seed)
	  np.random.seed(seed)
	  random.seed(seed)
	  torch.backends.cudnn.benchmark = False
	  torch.backends.cudnn.deterministic = True

same_seeds(0)

# Preprocessing

In [6]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import os

NUM_LIMIT = 50000
def read_notebook(path):  
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)
            .rename_axis('cell_id')
    )

def get_ranks(base, derived):
    assert type(base)==list, print(base,'/n', derived)
    return [base.index(d) for d in derived]

if NUM_LIMIT == None:
  try :
    df = pd.read_csv('/content/drive/MyDrive/GoogleA14/df.csv')

    df_orders = pd.read_csv(
      data_dir / 'train_orders.csv',
      index_col='id',
      squeeze=True,
    ).str.split()  # Split the string representation of cell_ids into a list

    df_orders_ = df_orders.to_frame().join(
        df.groupby('id')['cell_id'].apply(list),
        how='right',
    ).dropna()
  except:
    paths_train = list((data_dir / 'train').glob('*.json'))
    notebooks_train = [
      read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
        pd.concat(notebooks_train)
            .set_index('id', append=True)
            .swaplevel()
            .sort_index(level='id', sort_remaining=False)
    )

    df.to_csv('/content/drive/MyDrive/GoogleA14/df.csv')

    df_orders = pd.read_csv(
        data_dir / 'train_orders.csv',
        index_col='id',
        squeeze=True,
    ).str.split()  # Split the string representation of cell_ids into a list

    df_orders_ = df_orders.to_frame().join(
        df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
        how='right',
)
else:
  paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_LIMIT]
  notebooks_train = [
       read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')              
  ]
  df = (
        pd.concat(notebooks_train)
            .set_index('id', append=True)
            .swaplevel()
            .sort_index(level='id', sort_remaining=False)
    )
  df_orders = pd.read_csv(
      data_dir / 'train_orders.csv',
      index_col='id',
      squeeze=True,
  ).str.split()  # Split the string representation of cell_ids into a list

  df_orders_ = df_orders.to_frame().join(
      df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
      how='right',
)


ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    try:
      rank = get_ranks(cell_order, cell_id)
    except:
      print('cell_order:', cell_order, type(cell_order))
      print('cell_id:', cell_id, type(cell_id))
      get_ranks(cell_order, cell_id)
    ranks[id_] = {'cell_id': cell_id, 'rank': rank }
df_ranks = (
    pd.DataFrame
        .from_dict(ranks, orient='index')
        .rename_axis('id')
        .apply(pd.Series.explode)
        .set_index('cell_id', append=True)
)

df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")

from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))
train_df = df.loc[train_ind].reset_index(drop=True)
val_df = df.loc[val_ind].reset_index(drop=True)



Train NBs: 100%|██████████| 50000/50000 [03:06<00:00, 268.78it/s]


In [7]:
# Base markdown dataframes
try:
  os.makedirs('/content/data')
except:
  pass
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)
train_df_mark.to_csv("/content/data/train_mark.csv", index=False)
val_df_mark.to_csv("/content/data/val_mark.csv", index=False)
val_df.to_csv("/content/data/val.csv", index=False)
train_df.to_csv("/content/data/train.csv", index=False)

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = WordNetLemmatizer()

# Additional code cells
def clean_code(cell):
    str(cell).replace("\\n", "\n")
    try:
      tokens = cell.split()
    except:
      print(cell)
    
    tokens = [stemmer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text



def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]      ####################
    else:
        results = []
        step = len(cells) / n
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")):
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20)
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

val_fts = get_features(val_df)
json.dump(val_fts, open("/content/data/val_fts.json","wt"))
train_fts = get_features(train_df)
json.dump(train_fts, open("/content/data/train_fts.json","wt"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
100%|██████████| 5017/5017 [00:19<00:00, 253.73it/s]
100%|██████████| 44983/44983 [02:37<00:00, 284.74it/s]


In [8]:
train_df_order = train_df.sort_values(by = ['id', 'pct_rank'])

# Dataset

In [9]:
from torch.utils.data import DataLoader, Dataset
import torch
try:
  from transformers import AutoTokenizer
except:
  !pip install transformers
  from transformers import AutoTokenizer

class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True)
        self.fts = fts

    def __getitem__(self, index):
        row = self.df.iloc[index]
        md = row.cell_id

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=False,
            max_length=self.md_max_len,
            padding=False,
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=False,
            max_length=23,
            padding=False,
            truncation=True
        )
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_code"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = [101] + inputs['input_ids'] + [102]
        md_len = len(ids)
        for x in code_inputs['input_ids']:
            ids.extend(x)
            ids.extend([102])    #############
            
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
          ids, mask = self.padding(md_len, ids)
        else:
          mask = torch.LongTensor([1]*len(ids))
          ids = torch.LongTensor(ids)
        
        assert len(ids) == self.total_max_len, print('len(ids) is not same to total_max_len')
        assert len(ids) == len(mask), print('len(ids):', len(ids), 'len(mask):', len(mask))

        return ids, mask, fts,  torch.FloatTensor([row.pct_rank]), row.id, md

    def __len__(self):
        return self.df.shape[0]
    
    def padding(self, md_len, ids):
      all_len = len(ids)
      ids_ = ids + [1]*(self.total_max_len - all_len)
      mask = [1]*all_len + [0]*(self.total_max_len - all_len)
      return torch.LongTensor(ids_), torch.LongTensor(mask) 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 32.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 90.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 85.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [10]:
model_name_or_path = '/content/drive/MyDrive/GoogleA14/codebert-base'
train_mark_path = '/content/data/train_mark.csv'
train_features_path = '/content/data/train_fts.json'
val_mark_path = '/content/data/val_mark.csv'
val_features_path = '/content/data/val_fts.json'
val_path = "/content/data/val.csv"

val_steps =100 
md_max_len = 64
total_max_len = 512
batch_size = 8
accumulation_steps = 4
n_workers = 4
early_stop_step = 100

In [11]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import sys, os
import torch



try:
  os.mkdir("/content/outputs")
except:
  pass

train_df_mark = pd.read_csv(train_mark_path).drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open(train_features_path))
val_df_mark = pd.read_csv(val_mark_path).drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open(val_features_path))
val_df = pd.read_csv(val_path)

order_df = pd.read_csv("/content/train_orders.csv").set_index("id")
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()

train_ds = MarkdownDataset(train_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                           total_max_len=total_max_len, fts=train_fts)
val_ds = MarkdownDataset(val_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                         total_max_len=total_max_len, fts=val_fts)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
                        pin_memory=False, drop_last=False)




In [None]:
for i in train_loader:
  print(len(i[0]))
  break

8


# Model

In [12]:
import torch.nn.functional as F
import torch.nn as nn
import torch
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup


class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        # for name, param in self.model.named_parameters():
        #     param.requires_grad = False # unfreeze weights in at all
        #     if param.requires_grad == False:
        #       print('Freeze', name)
        #     else:
        #       print('Fail Freezing')
        
        self.top = nn.Linear(769, 1)
        self.dropout = nn.Dropout(0.5)


    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = torch.cat((x[:, 0, :], fts), 1)
        x = self.dropout(x)
        x = self.top(x)
        x = torch.sigmoid(x)
        return x

# Metrics


In [13]:
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

# Training

In [None]:
def read_data(data):
    return tuple(d.cuda() for d in data[:-3]), data[-3].cuda(), data[-2], data[-1]

def correct_place_or_not(preds, id_list, md_list):
  for pred, id, md in zip(preds, id_list, md_list):
    print('pred', pred)
    print('id', id)
    print('md', md)

    id_df = train_df_order[train_df_order['id'] == id]
    md_rank = id_df[id_df['cell_id'] == md].rank
    md_pct = id_df[id_df['cell_id'] == md].pct_rank
    correct_pct = id_df[id_df['cell_id'] == md].pct_rank
    

    if md_rank != 0:
      previous_pct = id_df[id_df['rank'] == (md_rank-1)].pct_rank
    if md_rank != (len(id_df) - 1):
      follow_pct = id_df[id_df['rank'] == (md_rank+1)].pct_rank
    if previous_pct:
      if md_pct < previous_pct:
        loss = previous_pct - md_pct
        return loss
    elif follow_pct:
      if md_pct > follow_pct:
        loss = md_pct - follow_pct
        return loss
    else:
      return 0

  

def validate(model, val_loader):
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target, id_list, md_list  = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    model.train()
    return np.concatenate(labels), np.concatenate(preds)


def train(model, train_loader, val_loader, epochs):
    np.random.seed(0)
    # Creating optimizer and lr schedulers
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ] ######################################

    num_train_optimization_steps = int(epochs * len(train_loader) / accumulation_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5,
                      correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                                num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    criterion = torch.nn.L1Loss()
    scaler = torch.cuda.amp.GradScaler()
    scores = []
    max_score = 0 
    early_stop = 0 
    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target, id_list, md_list = read_data(data)

            
            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            
            #pct_loss = correct_place_or_not(pred, id_list, md_list)
            # loss = (loss+pct_loss) / accumulation_steps

            loss = loss / accumulation_steps

            
            scaler.scale(loss).backward()
            if idx % accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            # if idx & val_steps ==0 or  idx == len(tbar) - 1:
            #   y_val, y_pred = validate(model, val_loader)
            #   val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
            #   val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
            #   y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
            #   score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
            #   scores.append(score)
            #   if score > min_score:
            #     max_score = score
            #     torch.save(model.state_dict(), '/content/drive/MyDrive/GoogleA14/listwise_codebert.bin')
            #     print("Preds score", score)
            #     early_stop_step = 0
            #   else:
            #     if early_stop == early_stop_step:
            #       break 
            #     else:
            #       early_stop_step += 1


            tbar.set_description(f"Epoch {e + 1} Loss: {loss} lr: {scheduler.get_last_lr()}")

        y_val, y_pred = validate(model, val_loader)
        val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
        val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
        y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
        score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
        print("Final Preds score", score)
        torch.save(model.state_dict(), '/content/drive/MyDrive/GoogleA14/listwise_codebert.bin')
        
        # scores.append(score)
        # if score > min_score:
        #    max_score = score
        #    torch.save(model.state_dict(), '/content/drive/MyDrive/GoogleA14/listwise_codebert.bin')

    return model, y_pred ,scores
same_seeds(0)
model = MarkdownModel(model_name_or_path)
model = model.cuda()
model, y_pred , scores= train(model, train_loader, val_loader, epochs=1)
 



Epoch 1 Loss: 0.03856188431382179 lr: [2.2268076981419192e-05, 2.2268076981419192e-05]:  29%|██▉       | 25778/87437 [2:12:10<5:16:29,  3.25it/s]