## Import

In [2]:
import os
import subprocess

def download_and_unzip():
    if not os.path.exists('open.zip'):
        # Download data from Google Drive
        subprocess.run(['pip', 'install', 'gdown'], check=True)
        subprocess.run(['gdown', 'https://drive.google.com/uc?id=13WixS0gfcsb7NkKGje6QZA1phPlVhKQm'], check=True)
    if not os.path.exists('./data'):
        # Unzip to the 'data' directory
        subprocess.run(['unzip', 'open.zip', '-d', 'data'], check=True)
    
def download_lib():
    subprocess.run(['pip', 'install', '-r', 'requirements.txt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE,check=True)

download_and_unzip()
download_lib()

In [3]:
import pandas as pd
import numpy as np
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Load Train / Test dataset

In [4]:
# 추가로 주어지는 Python Code 데이터들을 활용하여 새로운 Pair 쌍을 생성하여 더 많은 데이터로 학습할 수 있습니다.
# 베이스라인에서는 이미 Pair 쌍으로 구축되어 주어지는 sample 데이터들로 검증을 진행합니다.
val = pd.read_csv("./data/sample_train.csv")
val.head()

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem393/problem393_19.cpp,./train_code/problem033/problem033_439.cpp,#include <bits/stdc++.h>\n\nusing namespace st...,#include <algorithm>\n#include <bitset>\n#incl...,0
1,./train_code/problem019/problem019_210.cpp,./train_code/problem019/problem019_63.cpp,#include <iostream>\n\nusing namespace std;\n\...,#include <iostream>\n#include <string>\nusing ...,1
2,./train_code/problem107/problem107_486.cpp,./train_code/problem107/problem107_340.cpp,#include <iostream>\n#include <vector>\nusing ...,#include <cstdio>\n#include <cstdlib>\n#includ...,1
3,./train_code/problem187/problem187_257.cpp,./train_code/problem403/problem403_135.cpp,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\nusing namespace std;...,0
4,./train_code/problem173/problem173_490.cpp,./train_code/problem173/problem173_345.cpp,#include <bits/stdc++.h>\ntypedef long long ll...,"#include ""bits/stdc++.h""\n#define rep(i,n) for...",1


In [5]:
test = pd.read_csv("./data/test.csv")
test.head()

Unnamed: 0,pair_id,code1,code2
0,TEST_000000,#include <bits/stdc++.h>\nusing namespace std;...,"#include <bits/stdc++.h>\n#define rep(i, n) fo..."
1,TEST_000001,"#include<bits/stdc++.h>\n#define rep(i,n)for(i...",// //bitset操作\n// #include <iostream>\n// #inc...
2,TEST_000002,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\n#include <ext/pb_ds/...
3,TEST_000003,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\nusing namespace std;...
4,TEST_000004,#include<bits/stdc++.h>\nusing namespace std;\...,#include<iostream>\n#include<algorithm>\n#incl...


## Define Model (CountVectorizer+CosineSimilarity)

In [6]:
class BaselineModel():
    def __init__(self, threshold=0.5):
        super(BaselineModel, self).__init__()
        self.threshold = threshold # 유사도 임계값
        self.vocabulary = set()

    def get_vectorizer(self):
        return CountVectorizer(vocabulary=list(self.vocabulary))
        
    def fit(self, code):
        # 입력 받은 학습 코드로 부터 vectorizer를 fit 시킵니다.
        temp_vectorizer = CountVectorizer()
        temp_vectorizer.fit(code)
        # fit 호출 마다 vectorizer에 활용할 vocabulary 업데이트
        self.vocabulary.update(temp_vectorizer.get_feature_names_out())
        # fit 호출 마다 vectorizer 업데이트
        self.vectorizer = self.get_vectorizer()
    
    def predict_proba(self, code1, code2):
        # 입력 받은 코드 쌍으로 부터 vectorizer를 통해 vector화 합니다.
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)
        
        preds = []
        # 각각의 코드 쌍(=벡터 쌍)으로부터 cosine-similarity를 구합니다.
        for code1_vec, code2_vec in tqdm(zip(code1_vecs, code2_vecs)):
            preds.append(cosine_similarity(code1_vec, code2_vec))
        
        preds = np.reshape(preds, len(preds))
        print('Done.')
        # 각 코드 쌍들의 유사도를 반환
        return preds
    
    def predict(self, code1, code2):
        preds = self.predict_proba(code1, code2)
        # cosine-similarity (유사도)가 설정한 임계값(Threshold=0.5)보다 높다면 유사하다 : 1, 아니라면 유사하지 않다 : 0
        preds = np.where(preds>self.threshold, 1, 0)
        # 각 코드 쌍들의 유사도를 Threshold를 통해 유사함을 판별 (이진분류)
        return preds

In [27]:
def clean_extra_newlines(s):
    """
    1) strip right spaces
    2) leave left spaces as is because this is indentation
    3) remove duplicate newlines
    """
    lines = s.split('\n')
    clean = []
    for line in lines:
        if line.strip() == '':  
            continue
        line = line.rstrip()
        clean.append(line)
    return '\n'.join(clean)


def clean_singleline_comments(s):
    """
    Remove sharp-leading comments from the beginning and any other place
    """
    lines = s.split('\n')
    clean = []
    for line in lines:
        if line.lstrip().startswith('//'):
            continue
        if '//' in line:
            line = line[:line.index('//')].rstrip()
        clean.append(line)
    return '\n'.join(clean)

def clean_multiline_comments(s):
    """
    Remove all strings enclosed in triple quotes:
    1) file-level doc-string
    2) class, function doc-strings
    3) any multiline string and related variable e.g. "x = '''hello'''" is completely removed
    """
    for q in ['/*', '*/']:        
        if q not in s:
            continue
        print(1)
        lines = s.split('\n')
        cleaned = []
        flag = False
        for line in lines:
            if line.count(q) == 2:
                continue
            if line.count(q) == 1:
                flag = not flag
                continue
            if flag:
                continue
            cleaned.append(line)
        s = '\n'.join(cleaned)
    return s

def clean_lex(s):
    """Remove #include and `using namespace`"""
    lines = s.split('\n')
    clean = []
    for line in lines:
        if '#include' in line:
            continue
        # if 'if __name__' in line:
            # continue
        if 'using namespace' in line:
            continue
        clean.append(line)    
    return '\n'.join(clean)

def clean_indents(s):
    """
    Remove all types of spaces
    """
    return ' '.join(s.split())

def preproc(s):
    """Apply all cleaning functions"""
    s = clean_extra_newlines(s)
    s = clean_singleline_comments(s)
    s = clean_multiline_comments(s)
    s = clean_lex(s)
    s = clean_indents(s)
    return s

## Model(Vectorizer) Fit

In [8]:
train_code_paths = glob.glob('./data/train_code/*/*.cpp')

In [9]:
def read_cpp_code(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [None]:
print(train_code_paths[0])
preproc(read_cpp_code(train_code_paths[0]))


In [10]:
# 모델 선언
model = BaselineModel(threshold=0.5)

In [11]:
for path in tqdm(train_code_paths):
    code = read_cpp_code(path)
    preproc_code = preproc(code)
    model.fit([code])

  2%|▏         | 5424/250000 [00:05<04:14, 959.52it/s] 


KeyboardInterrupt: 

In [None]:
len(model.vocabulary)

113727

## Validation

In [None]:
def get_accuracy(gt, preds):
    return (gt == preds).mean()

In [None]:
val_preds = model.predict(preproc(val['code1']), preproc(val['code2']))

20000it [00:23, 855.42it/s]


Done.


In [None]:
print(get_accuracy(val['similar'].values, val_preds))

0.60125


## Inference

In [None]:
# 모델 추론
preds = model.predict(preproc(test['code1']), preproc(test['code2']))

595000it [11:20, 874.85it/s]


Done.


## Submission

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['similar'] = preds
submission.to_csv('./result/preproc_submit.csv', index=False)