## Import

In [1]:
from module.prerequisite import install_requirements, download_data
install_requirements()
download_data()

In [2]:
import pandas as pd
import numpy as np
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Load Train / Test dataset

In [74]:
# 추가로 주어지는 Python Code 데이터들을 활용하여 새로운 Pair 쌍을 생성하여 더 많은 데이터로 학습할 수 있습니다.
# 베이스라인에서는 이미 Pair 쌍으로 구축되어 주어지는 sample 데이터들로 검증을 진행합니다.
val = pd.read_csv("./data/sample_train.csv")
val.head()

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem393/problem393_19.cpp,./train_code/problem033/problem033_439.cpp,#include <bits/stdc++.h>\n\nusing namespace st...,#include <algorithm>\n#include <bitset>\n#incl...,0
1,./train_code/problem019/problem019_210.cpp,./train_code/problem019/problem019_63.cpp,#include <iostream>\n\nusing namespace std;\n\...,#include <iostream>\n#include <string>\nusing ...,1
2,./train_code/problem107/problem107_486.cpp,./train_code/problem107/problem107_340.cpp,#include <iostream>\n#include <vector>\nusing ...,#include <cstdio>\n#include <cstdlib>\n#includ...,1
3,./train_code/problem187/problem187_257.cpp,./train_code/problem403/problem403_135.cpp,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\nusing namespace std;...,0
4,./train_code/problem173/problem173_490.cpp,./train_code/problem173/problem173_345.cpp,#include <bits/stdc++.h>\ntypedef long long ll...,"#include ""bits/stdc++.h""\n#define rep(i,n) for...",1


In [4]:
test = pd.read_csv("./data/test.csv")
test.head()

Unnamed: 0,pair_id,code1,code2
0,TEST_000000,#include <bits/stdc++.h>\nusing namespace std;...,"#include <bits/stdc++.h>\n#define rep(i, n) fo..."
1,TEST_000001,"#include<bits/stdc++.h>\n#define rep(i,n)for(i...",// //bitset操作\n// #include <iostream>\n// #inc...
2,TEST_000002,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\n#include <ext/pb_ds/...
3,TEST_000003,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\nusing namespace std;...
4,TEST_000004,#include<bits/stdc++.h>\nusing namespace std;\...,#include<iostream>\n#include<algorithm>\n#incl...


## Define Model (CountVectorizer+CosineSimilarity)

In [118]:
class BaselineModel():
    def __init__(self, threshold=0.5):
        super(BaselineModel, self).__init__()
        self.threshold = threshold # 유사도 임계값
        self.vocabulary = set()

    def get_vectorizer(self):
        return CountVectorizer(vocabulary=list(self.vocabulary))
        
    def fit(self, code, path):
        try:
            # 입력 받은 학습 코드로 부터 vectorizer를 fit 시킵니다.
            temp_vectorizer = CountVectorizer()
    
            # 그 외 예외 처리
            temp_vectorizer.fit(code)
            # fit 호출 마다 vectorizer에 활용할 vocabulary 업데이트
            self.vocabulary.update(temp_vectorizer.get_feature_names_out())
            # fit 호출 마다 vectorizer 업데이트
            self.vectorizer = self.get_vectorizer()
        except Exception as e:
            print(code, path)
    
    def predict_proba(self, code1, code2):
        # 입력 받은 코드 쌍으로 부터 vectorizer를 통해 vector화 합니다.
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)
        
        preds = []
        # 각각의 코드 쌍(=벡터 쌍)으로부터 cosine-similarity를 구합니다.
        for code1_vec, code2_vec in tqdm(zip(code1_vecs, code2_vecs)):
            preds.append(cosine_similarity(code1_vec, code2_vec))
        
        preds = np.reshape(preds, len(preds))
        print('Done.')
        # 각 코드 쌍들의 유사도를 반환
        return preds
    
    def predict(self, code1, code2):
        preds = self.predict_proba(code1, code2)
        # cosine-similarity (유사도)가 설정한 임계값(Threshold=0.5)보다 높다면 유사하다 : 1, 아니라면 유사하지 않다 : 0
        preds = np.where(preds>self.threshold, 1, 0)
        # 각 코드 쌍들의 유사도를 Threshold를 통해 유사함을 판별 (이진분류)
        return preds

In [114]:
def remove_semicolon(s):
    """
    Replace all semicolon to \n
    """
    s = s.replace(';','\n')
    return s

def remove_type(s):
    s = s.replace('void', '')
    s = s.replace('const', '')
    return s

def clean_comments(s):
    """
    Remove sharp-leading comments from the beginning and any other place
    """
    lines = s.split('\n')
    flag = False
    clean = []
    for line in lines:

        # /* is procceeding
        if flag:
            if '*/' not in line:
                continue
            line = line[line.index('*/')+2:].rstrip()
            flag = False

        if line.lstrip().startswith('//'):
            continue

        if line.lstrip().startswith('/*'):
            if '*/' not in line:
                flag = True
            continue

        if '//' in line:
            line = line[:line.index('//')].rstrip()

        clean.append(line)
    return '\n'.join(clean)

def clean_lex(s):
    """Remove `using namespace`"""
    lines = s.split('\n')
    clean = []
    for line in lines:
        if 'using namespace' in line:
            continue
        clean.append(line)    
    return '\n'.join(clean)

def clean_indents(s):
    """
    Replace all types of spaces
    """
    return ' '.join(s.split())

def preproc(s):
    """Apply all cleaning functions"""
    s = remove_semicolon(s)
    s = remove_type(s)
    s = clean_comments(s)
    s = clean_lex(s)
    s = clean_indents(s)
    return s

## Model(Vectorizer) Fit

In [115]:
train_code_paths = glob.glob('./data/train_code/*/*.cpp')

In [116]:
def read_cpp_code(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [104]:
def preproc(s):
    """Apply all cleaning functions"""
    s = remove_semicolon(s)
    s = remove_type(s)
    s = clean_comments(s)
    s = clean_lex(s)
    s = clean_indents(s)
    return s
preproc(read_cpp_code("./data/train_code/problem101/problem101_73.cpp"))

'#include<bits/stdc++.h> typedef long long ll typedef unsigned long long ull typedef vector<int> vi typedef vector<ll> vl typedef pair<int, int> pii typedef pair<ll, ll> pll typedef pair<ll, int> pli typedef pair<int, ll> pil typedef vector<pii> vii typedef vector<pil> vil typedef vector<pli> vli typedef vector<pll> vll #define ff first #define ss second #define pb push_back #define mp make_pair #define sz size() #define all(a) a.begin(), a.end() #define mem(a, b) memset(a, b, sizeof(a)) #define f0(i,n) for(ll i=0 i<(n) i++) #define f1(i,n) for(ll i=1 i<=(n) i++) #define f2(i,a,n) for(ll i=(a) i<=(n) i++) #define fr(i,n,a) for(ll i=(n) i>=(a) i--) #define rep(i,a,b,c) for(ll i=(a) i!=(b) i+=(c)) #define nl "\\n" int INF = 1e9 + 5 int MXN = 2e5 + 5 int MOD = 1e9 + 7 solve(){ int n cin >> n int a[n] f0(i, n) cin >> a[i] int mn = 0 ll tk = 1000, st = 0 f0(i, n-1){ if(a[i+1] > a[i]) continue else{ st = tk/a[mn] tk = tk-(st*a[mn])+(st*a[i]) mn = i+1 } } if(mn != n-1){ st = tk/a[mn] tk = tk-

In [158]:
# 모델 선언
model = BaselineModel(threshold=0.7)

In [159]:
for path in tqdm(train_code_paths):
    code = read_cpp_code(path)
    preproc_code = preproc(code)
    model.fit([preproc_code], path)

100%|██████████| 250000/250000 [07:39<00:00, 544.09it/s]


In [160]:
import pickle

# 사전을 파일로 저장
with open('./result/vocabulary_v3_0.7.pkl', 'wb') as f:
    pickle.dump(model.vocabulary, f)

In [161]:
len(model.vocabulary)

63987

## Validation

In [162]:
def get_accuracy(gt, preds):
    return (gt == preds).mean()

In [163]:
val_preds = model.predict(val['code1'].apply(preproc), val['code2'].apply(preproc))

20000it [00:21, 938.98it/s]

Done.





In [164]:
print(get_accuracy(val['similar'].values, val_preds))

0.5472


## Inference

In [77]:
# 모델 추론
preds = model.predict(test['code1'].apply(preproc), test['code2'].apply(preproc))

450920it [07:54, 949.73it/s]


KeyboardInterrupt: 

## Submission

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['similar'] = preds
submission.to_csv('./result/preproc_submit.csv', index=False)