In [1]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120
data_dir = Path('../')

In [2]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
notebooks_test

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 181.79it/s]


[         cell_type                                                                                                                   source              id
 cell_id                                                                                                                                                    
 ddfd239c      code  import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom s...  0009d135ece78d
 c6cd22db      code                                              df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf  0009d135ece78d
 1372ae9b      code  numerical_data = df.loc[:, ~df.columns.isin(['id', "diagnosis"])]\n\nlabels = df["diagnosis"].factorize(['B','M'])[0...  0009d135ece78d
 90ed07ab      code  def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n    # Scaling Data for testing\n    ...  0009d135ece78d
 7f388a41      code  # Ploting data with different columns

In [3]:
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
test_df

Unnamed: 0,id,cell_id,cell_type,source,rank,pred
0,0009d135ece78d,ddfd239c,code,"import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom s...",0,0.142857
1,0009d135ece78d,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf,1,0.285714
2,0009d135ece78d,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]\n\nlabels = df[""diagnosis""].factorize(['B','M'])[0...",2,0.428571
3,0009d135ece78d,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n # Scaling Data for testing\n ...",3,0.571429
4,0009d135ece78d,7f388a41,code,"# Ploting data with different columns\n#####################################\ncomparison_plot_maker(numerical_data[""...",4,0.714286
...,...,...,...,...,...,...
84,0010a919d60e4f,d3f5c397,markdown,We have 177 rows with missing `Age` and 687 rows with missing `Cabin`,34,1.000000
85,0028856e09c5b7,012c9d02,code,"sns.set()\nsns.pairplot(data1, 2.5)\nplt.show(); = size",0,0.333333
86,0028856e09c5b7,d22526d1,code,"types----------"")\n# is uniques----------"")\n# plt\nimport mis_val +\n = #https://pandas.pydata.org/pandas...",1,0.666667
87,0028856e09c5b7,3ae7ece3,code,"#correlation avoid map\nf,ax verbose 20), 18))\nsns.heatmap(data1.corr(), the annot=True, ; informations bins=50, '....",2,1.000000


In [4]:
import re
# 清洗
def clean_code(cell):
    cleaned_code = re.sub(r"^#.*\n" , " " , str(cell) , flags=re.MULTILINE) #第一次去除 #类注释
    cleaned_code = re.sub(r'""".+"""', ' ', cleaned_code) # 去除 “”“”“”类型注释
    cleaned_code = re.sub(r' +', ' ', cleaned_code) # 去除多个空格那类型
    cleaned_code.replace("\\n" , " ")
    cleaned_code = cleaned_code.replace("\n" , " ") # 去除换行符
    return cleaned_code

# 从所有数据中取样cell
def sample_cells(cells , n):
    """
        cells: 所有cell，一般是code cell
        n: 筛选n个cell数据
    """
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else:
        results = []
        step = len(cells) / n # 步长
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results #触发严重错误时报异常
        # 避免np.round的时候有点误差，把最后一个设置为cells中的最后一个元素
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results

# 获取一个notebook中的概括性特征
def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx , sub_df in tqdm(df.groupby("id")):
        features[idx] = dict()
        num_of_markdown = sub_df[sub_df["cell_type"] == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df["cell_type"] == "code"]
        num_of_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df["source"].values , 20) # 取样20条code cell
        features[idx]["num_of_code"] = num_of_code
        features[idx]["num_of_markdown"] = num_of_markdown
        features[idx]["codes"] = codes
    return features

In [9]:
test_fts = get_features(test_df)

100%|██████████| 4/4 [00:00<00:00, 571.08it/s]


### Model

In [11]:
import torch.nn.functional as F
import torch.nn as nn
import torch
from transformers import AutoModel

class MarkdownModel(nn.Module):
    def __init__(self , model_path):
        super(MarkdownModel , self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769 , 1)
    def forward(self , ids , mask , fts):
        print("fts: " , fts)
        x = self.model(ids , mask)[0]
        print("x after model : " , x , "\ndim:" , x.size())
        x = torch.cat((x[: , 0 , :] , fts) , 1)
        print("x after cat : " , x , "\ndim:" , x.size())
        x = self.top(x)
        return x

### Dataset

In [6]:
from torch.utils.data import DataLoader , Dataset
import torch
from transformers import AutoTokenizer

class MarkdownDataset(Dataset):
    def __init__(self , df , model_name_or_path , total_max_len , markdown_max_len , features):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.markdown_max_len = markdown_max_len
        self.total_max_len = total_max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.features = features
    def __getitem__(self, index):
        row = self.df.iloc[index]
        inputs = self.tokenizer.encode_plus(
            row["source"],
            add_special_tokens=True,
            max_length=self.markdown_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.features[row.id]["codes"]],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True
        )
        num_markdown = self.features[row.id]["num_of_markdown"]
        num_code = self.features[row.id]["num_of_code"]
        if num_markdown + num_code == 0:
            markdown_rate = torch.Tensor([0])
        else:
            markdown_rate = torch.FloatTensor([num_markdown / (num_code + num_markdown)])
        
        # 准备input_ids
        """
            末尾追加code cell翻译的inputs ， 由于encode的时候设置了“add_specical_tokens=True”，所以自带<s>
            最后形成<s> Markdown content <s> Code content 1 <s> Code content 2 <s> ... <s> Code content 20 <s>
        """
        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1]) 
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id ,] * (self.total_max_len - len(ids)) # 添加<padding>补齐到total_max_len
        ids = torch.LongTensor(ids)

        # 准备attention_mask,这个步骤需要与上面那个同步的
        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len
        assert len(mask) == self.total_max_len

        return ids , mask , markdown_rate , torch.FloatTensor([row.pct_rank])
    
    def __len__(self):
        return self.df.shape[0]

In [7]:
import sys , os

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)
            print("inputs: " , inputs)
            print("*inputs :" , *inputs)
            pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    
    return np.concatenate(labels), np.concatenate(preds)

def predict(model_path, ckpt_path):
    model = MarkdownModel(model_path)
    model = model.cuda()
    model.eval()
    model.load_state_dict(torch.load(ckpt_path))
    BS = 32
    MAX_LEN = 64
    test_df["pct_rank"] = 0
    test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), markdown_max_len=64,total_max_len=512, model_name_or_path=model_path, features=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False,
                              pin_memory=False, drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

### Predict-demo

In [12]:
model_path = 'microsoft/codebert-base'
ckpt_path = "./output/model_demo.bin"
y_test_2 = predict(model_path, ckpt_path)
y_test_2

  0%|          | 0/2 [00:00<?, ?it/s]inputs:  (tensor([[    0, 10431,  2741,  ...,     1,     1,     1],
        [    0, 48342, 25980,  ...,     1,     1,     1],
        [    0, 48342, 39154,  ...,     1,     1,     1],
        ...,
        [    0, 48342, 44457,  ...,     1,     1,     1],
        [    0,   170,    40,  ...,     1,     1,     1],
        [    0,   170,    67,  ...,     1,     1,     1]], device='cuda:0'), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), tensor([[0.4615],
        [0.4615],
        [0.4615],
        [0.4615],
        [0.4615],
        [0.4615],
        [0.1000],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.5645],
        [0.56

array([0.45040798, 0.44999337, 0.45102197, 0.4510573 , 0.45111072,
       0.4496126 , 0.44323465, 0.47450334, 0.47357708, 0.47483248,
       0.4729226 , 0.47534183, 0.4742913 , 0.4745975 , 0.47512558,
       0.47534359, 0.47370237, 0.47505045, 0.47321275, 0.47482842,
       0.4749407 , 0.4749471 , 0.47602025, 0.47424853, 0.47575235,
       0.47485822, 0.47630787, 0.4760728 , 0.4762965 , 0.47409356,
       0.47583067, 0.47426465, 0.4737188 , 0.47446513, 0.47490484,
       0.47491616, 0.47488385, 0.47325808, 0.47649592, 0.47631967,
       0.4753911 , 0.47637933, 0.44393614], dtype=float32)

In [10]:
y_test = y_test_2
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,ddfd239c c6cd22db 1372ae9b 8cb8d28a ba55e576 f9893819 39e937ec e25aa9bd 0a226b6a 90ed07ab 7f388a41 2843a25a 06dbf8cf
1,0010483c12ba9b,54c7cab3 fe66203e 7844d5f8 7f270e34 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d
2,0010a919d60e4f,aafc3d23 80e077ec b190ebb4 ed415c3c 322850af c069ed33 868c4eae 80433cf3 bd8fbd76 0e2529e8 1345b8b2 cdae286f bac960d3...
3,0028856e09c5b7,012c9d02 eb293dfc d22526d1 3ae7ece3


In [12]:
sub_df.to_csv("submission_demo.csv", index=False)