In [None]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on the mounted Drive; later cells use paths relative to it.
%cd /content/drive/MyDrive/sj/dacon/code_similarity

/content/drive/MyDrive/sj/dacon/code_similarity


In [None]:
!pip install tensorflow_addons
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.19.0 typeguard-2.13.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
import glob
import numpy as np
import os
import pandas as pd
import random
import sys
import sklearn
import time
from tqdm import tqdm

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import tensorflow_addons as tfa
from transformers import AutoTokenizer, TFT5EncoderModel
from datasets import load_dataset, load_metric, Dataset
from itertools import combinations

In [None]:
# Checkpoint name passed to both the tokenizer and the encoder `from_pretrained` calls.
MODEL_NAME = "Salesforce/codet5-base"
# Per-snippet token budget: truncation length for the tokenizer and padded width of the input tensors.
MAX_LEN = 256
# Number of code pairs per batch in the tf.data pipelines.
BATCH_SIZE = 4

In [None]:
# Remove comments, blank lines, and import statements.
_TRIPLE_QUOTES = ("'''", '"""')
_SKIPPED_LINE_PREFIXES = ("!", "import ", "from ")


def _strip_comment_and_blocks(line):
    """Drop the `#` comment and triple-quoted spans from a line that starts outside any block.

    Returns:
        `(kept_text, open_delimiter)` where `open_delimiter` is the triple-quote
        token of a block that is still open at the end of the line, or None.
    """
    kept_parts = []
    rest = line
    while True:
        hits = [(rest.find(token), token) for token in ("#",) + _TRIPLE_QUOTES]
        hits = [hit for hit in hits if hit[0] != -1]
        if not hits:
            kept_parts.append(rest)
            return "".join(kept_parts), None

        # Whichever marker appears first on the line decides how the rest is read.
        position, token = min(hits)
        kept_parts.append(rest[:position])
        if token == "#":
            return "".join(kept_parts), None

        rest = rest[position + len(token):]
        closing = rest.find(token)
        if closing == -1:
            # The block continues on the following lines.
            return "".join(kept_parts), token
        # Opened and closed on the same line: skip its content and keep scanning.
        rest = rest[closing + len(token):]


def remove_annotation(code):
    """Return `code` without comments, triple-quoted blocks, blank lines and import lines.

    Lines that start (at column 0) with `!`, `import ` or `from ` are dropped.
    Everything from the first `#` on a line is dropped, as is the content of
    `'''` / `\"\"\"` blocks, whether they span several lines or open and close on
    the same line. Lines left with only whitespace are dropped.

    Limitation: this is a line scanner, not a parser, so a `#` or a triple
    quote inside an ordinary string literal is still treated as a marker.

    Args:
        code: Source text with lines separated by "\\n".

    Returns:
        The kept lines joined with "\\n".
    """
    kept_lines = []
    open_delimiter = None  # triple-quote token of the block we are inside, if any

    for line in code.split("\n"):
        if open_delimiter is not None:
            closing = line.find(open_delimiter)
            if closing == -1:
                continue
            # Resume with whatever follows the closing delimiter.
            line = line[closing + len(open_delimiter):]
            open_delimiter = None

        if line.startswith(_SKIPPED_LINE_PREFIXES):
            continue

        kept_text, open_delimiter = _strip_comment_and_blocks(line)
        if kept_text.strip() != "":
            kept_lines.append(kept_text)

    return "\n".join(kept_lines)

def preprocess_code(file_loc):
    """Read the UTF-8 source file at `file_loc` and return it after `remove_annotation` cleaning."""
    with open(file_loc, "r", encoding="utf-8") as source_file:
        raw_code = source_file.read()

    return remove_annotation(raw_code)

def parse_code(problem_list, max_file_count=20, start_idx=0):
    """Load the cleaned solutions of every problem directory into one DataFrame.

    Args:
        problem_list: Problem directory paths; a directory's position in this
            sequence becomes its `problem_idx`.
        max_file_count: Stop reading a directory after this many files. A value
            of 0 or less never matches the counter, so no limit applies.
        start_idx: Number of files to skip at the start of each directory's
            sorted `*.py` listing.

    Returns:
        DataFrame with one row per file and the columns `code` and `problem_idx`.
    """
    code_list = []
    for problem_idx, problem_dir in enumerate(tqdm(problem_list)):
        # glob gives no ordering guarantee; sorting makes `start_idx` and
        # `max_file_count` select the same files on every run.
        python_files = sorted(glob.glob(f"{problem_dir}/*.py"))
        for file_count, python_file in enumerate(python_files[start_idx:], start=1):
            code_list.append((preprocess_code(python_file), problem_idx))
            if file_count == max_file_count:
                break

    return pd.DataFrame(code_list, columns=["code", "problem_idx"])

def remove_invalid_code(df):
    """Keep only the rows whose code tokenizes to more than 4 tokens.

    Adds the columns `tokens` and `len` to `df` itself, then returns the
    filtered rows with a fresh 0..n-1 index. Relies on the module-level
    `tokenizer`.
    """
    token_lists = df["code"].apply(tokenizer.tokenize)
    df["tokens"] = token_lists
    df["len"] = token_lists.apply(len)

    long_enough = df["len"] > 4
    return df[long_enough].reset_index(drop=True)

def train_test_split(df, split_rate=0.1):
    """Split `df` by problem so that no problem appears on both sides.

    Args:
        df: DataFrame with a `problem_idx` column.
        split_rate: Fraction of the distinct problems (rounded down) moved to validation.

    Returns:
        `(train_df, valid_df)`.
    """
    all_problems = df["problem_idx"].unique().tolist()
    holdout_size = int(len(all_problems) * split_rate)
    holdout_problems = random.sample(all_problems, holdout_size)

    in_holdout = df["problem_idx"].isin(holdout_problems)
    return df[~in_holdout], df[in_holdout]

def random_combinations(list1, list2, count):
    """Draw `count` distinct `(item1, item2)` pairs at random.

    Args:
        list1: Candidates for the first element of each pair.
        list2: Candidates for the second element of each pair.
        count: Number of distinct pairs wanted.

    Returns:
        A set of `count` tuples (empty when `count` is 0 or negative).

    Raises:
        ValueError: If the two lists cannot produce `count` distinct pairs.
            Without this check the sampling loop below would never terminate.
    """
    available = len(set(list1)) * len(set(list2))
    if count > available:
        raise ValueError(
            f"cannot draw {count} distinct pairs: only {available} are possible"
        )

    total_group = set()
    # Rejection sampling: duplicates are absorbed by the set and simply redrawn.
    while len(total_group) < count:
        item1 = list1[random.randrange(0, len(list1))]
        item2 = list2[random.randrange(0, len(list2))]

        total_group.add((item1, item2))

    return total_group

def generate_pairs(df, sample_rate=0.01):
    """Build a shuffled table of labelled code pairs.

    For every problem, all 2-combinations of its solutions are formed and
    one in every `int(1 / sample_rate)` of them is kept as a positive pair
    (label 1). The same number of negative pairs (label 0) is drawn, each
    pairing one of the problem's solutions with code from another problem.

    Args:
        df: DataFrame with the columns `code` and `problem_idx`.
        sample_rate: Controls the kept share of positive pairs, see above.

    Returns:
        DataFrame with the columns `code1`, `code2` and `similar`, in an order
        determined by the state of the `random` module.
    """
    total_data = []
    problem_list = df["problem_idx"].unique().tolist()
    for problem_idx in tqdm(problem_list):
        is_current = df["problem_idx"] == problem_idx
        solution_codes = df.loc[is_current, "code"].to_list()
        negative_codes = df.loc[~is_current, "code"].to_list()

        positive_pairs = list(combinations(solution_codes, 2))
        sampled_positive_pairs = random.sample(positive_pairs, len(positive_pairs) // int(1 / sample_rate))

        negative_pairs = random_combinations(solution_codes, negative_codes, len(sampled_positive_pairs))

        total_data.extend((code1, code2, 1) for code1, code2 in sampled_positive_pairs)
        # The negatives arrive as a set of strings, whose iteration order changes
        # between interpreter runs (hash randomization); sort to fix the row order.
        total_data.extend((code1, code2, 0) for code1, code2 in sorted(negative_pairs))

    pair_df = pd.DataFrame(total_data, columns=["code1", "code2", "similar"])
    # Derive the shuffle seed from `random` so that seeding `random` alone
    # reproduces the final row order.
    pair_df = sklearn.utils.shuffle(pair_df, random_state=random.randrange(2**32))

    return pair_df

def generate_datasets(max_file_count, start_idx, train_sr=0.2, valid_sr=0.1, split_rate=0.1):
    """Build the training and validation pair tables from the `code` directory.

    Args:
        max_file_count: Passed to `parse_code` as the per-problem file limit.
        start_idx: Passed to `parse_code` as the number of files to skip per problem.
        train_sr: `sample_rate` used when generating training pairs.
        valid_sr: `sample_rate` used when generating validation pairs.
        split_rate: Fraction of problems held out for validation.

    Returns:
        `(train_pair_df, valid_pair_df)`.
    """
    random.seed(42)

    dataset_dirs = ["code"]

    total_problem_dirs = []
    for dataset_dir in dataset_dirs:
        # glob gives no ordering guarantee; sort so each directory gets the same
        # problem_idx on every run and the seeded split stays reproducible.
        total_problem_dirs.extend(sorted(glob.glob(f"{dataset_dir}/*")))

    df = parse_code(total_problem_dirs, max_file_count=max_file_count, start_idx=start_idx)
    # Short-snippet filtering is currently switched off:
    # df = remove_invalid_code(df)
    train_df, valid_df = train_test_split(df, split_rate=split_rate)

    train_pair_df = generate_pairs(train_df, sample_rate=train_sr)
    valid_pair_df = generate_pairs(valid_df, sample_rate=valid_sr)

    return train_pair_df, valid_pair_df

In [None]:
# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the encoder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_data(examples):
    """Tokenize `code1` and `code2` (truncated to MAX_LEN) and attach the results to `examples`.

    Adds `<column>_input_ids` and `<column>_attention_mask` for both columns
    and returns the same mapping.
    """
    for column in ("code1", "code2"):
        encoded = tokenizer(examples[column], padding=True, max_length=MAX_LEN, truncation=True)
        examples[f"{column}_input_ids"] = encoded["input_ids"]
        examples[f"{column}_attention_mask"] = encoded["attention_mask"]

    return examples

# Build the labelled pair tables, then tokenize both sides of every pair.
train_df, valid_df = generate_datasets(max_file_count=150, start_idx=0, train_sr=0.1, valid_sr=0.01)

train_dataset = Dataset.from_pandas(train_df).map(preprocess_data, remove_columns=["code1", "code2"])
valid_dataset = Dataset.from_pandas(valid_df).map(preprocess_data, remove_columns=["code1", "code2"])

def convert_to_tensorflow_dataset(dataset):
    """Turn a tokenized pair dataset into a batched `tf.data.Dataset` of `(features, labels)`.

    Args:
        dataset: Dataset holding the four token columns and the `similar` label column.

    Returns:
        A dataset of `(features, labels)` batches of size BATCH_SIZE, where
        `features` maps each token column to a tensor padded with 0 to width
        MAX_LEN and `labels` are one-hot rows of width 2.
    """
    feature_columns = ["code1_input_ids", "code1_attention_mask", "code2_input_ids", "code2_attention_mask"]

    dataset.set_format(type='tensorflow', columns=feature_columns + ["similar"])
    # NOTE(review): `.to_tensor` presumes each column comes back as a ragged tensor — confirm.
    features = {x: dataset[x].to_tensor(default_value=0, shape=[None, MAX_LEN]) for x in feature_columns}
    # Pin the one-hot width: it is otherwise inferred from the largest label seen,
    # so a split containing only label 0 would yield width 1 and not match the
    # two-unit output of the model.
    labels = tf.keras.utils.to_categorical(dataset["similar"], num_classes=2)

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.batch(BATCH_SIZE)

    return dataset

# Batched tf.data pipelines that are later passed to model.fit.
train_ds = convert_to_tensorflow_dataset(train_dataset)
val_ds = convert_to_tensorflow_dataset(valid_dataset)

 37%|███▋      | 110/300 [03:50<06:38,  2.10s/it]


KeyboardInterrupt: ignored

In [None]:
# All checkpoints and logs go under checkpoint/<MODEL_NAME with "/" replaced by "-">.
CHECKPOINT_DIR_PREFIX = os.path.join("checkpoint", MODEL_NAME.replace("/", "-"))
os.makedirs(CHECKPOINT_DIR_PREFIX, exist_ok=True)

# Rolling checkpoint: the same path is rewritten at the end of every epoch.
latest_checkpoint_path = os.path.join(CHECKPOINT_DIR_PREFIX, "latest.ckpt")
latest_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=latest_checkpoint_path, save_weights_only=True, save_freq="epoch"
)

# Per-epoch checkpoints, grouped in a sub-folder named after the start time of this run.
current_time = time.strftime("%y%m%d-%H%M%S")
checkpoint_path = os.path.join(CHECKPOINT_DIR_PREFIX, current_time, "cp-{epoch:04d}.ckpt")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, verbose=1, save_weights_only=True, save_freq="epoch"
)

# CSV log of the training metrics, one file per run.
log_path = os.path.join(CHECKPOINT_DIR_PREFIX, f"{MODEL_NAME.replace('/', '-')}-{current_time}.csv")
logger_callback = tf.keras.callbacks.CSVLogger(log_path, separator=',', append=True)

# Stop after 4 epochs without improvement and restore the best weights
# (no `monitor` is given, so the callback's default metric is used).
earlystop_callback = tf.keras.callbacks.EarlyStopping(verbose=1, patience=4, restore_best_weights=True)

# Multiply the learning rate by 0.8 after 2 epochs without val_loss improvement.
auto_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.8, patience=2, verbose=1, min_lr=1e-9
)

callbacks = [
    latest_checkpoint_callback,
    checkpoint_callback,
    logger_callback,
    earlystop_callback,
    auto_lr_callback,
]

In [None]:
# Taken from the TFRobertaForSequenceClassification model.
class ClassificationHead(tf.keras.layers.Layer):
    """Classification head that scores a sequence from the hidden state at its first position."""

    def __init__(self, output_count, **kwargs):
        """Create the head.

        Args:
            output_count: Number of units of the final projection.
            **kwargs: Forwarded to `tf.keras.layers.Layer`.
        """
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            768,
            activation="tanh",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name="dense",
        )
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.out_proj = tf.keras.layers.Dense(
            output_count,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name="out_proj",
        )

    def call(self, features, training=False):
        """Return the projection of position 0 of `features`; dropout is active only when `training`."""
        first_token = features[:, 0, :]  # the <s> token (equivalent to [CLS])
        pooled = self.dense(self.dropout(first_token, training=training))
        logits = self.out_proj(self.dropout(pooled, training=training))

        return logits

In [None]:
# Every model input is MAX_LEN wide, matching the padded width of the dataset tensors.
# The trailing comma matters: `(256)` is the plain int 256, not a 1-tuple.
code1_input_ids_shape = (MAX_LEN,)
code1_input_ids_input = tf.keras.Input(
    shape=code1_input_ids_shape, dtype=tf.int32, name="code1_input_ids"
)

code1_attention_mask_shape = (MAX_LEN,)
code1_attention_mask_input = tf.keras.Input(
    shape=code1_attention_mask_shape, dtype=tf.int64, name="code1_attention_mask"
)

code2_input_ids_shape = (MAX_LEN,)
code2_input_ids_input = tf.keras.Input(
    shape=code2_input_ids_shape, dtype=tf.int32, name="code2_input_ids"
)

code2_attention_mask_shape = (MAX_LEN,)
code2_attention_mask_input = tf.keras.Input(
    shape=code2_attention_mask_shape, dtype=tf.int64, name="code2_attention_mask"
)

pretrained_model = TFT5EncoderModel.from_pretrained(MODEL_NAME, from_pt=True)
classification_head = ClassificationHead(output_count=2)

# Both snippets are joined into one sequence (ids and masks alike) and encoded together.
input_ids_input = tf.keras.layers.Concatenate()([code1_input_ids_input, code2_input_ids_input])
attention_mask_input = tf.keras.layers.Concatenate()([code1_attention_mask_input, code2_attention_mask_input])
# The first element of the encoder output is fed to the head, which reads position 0 of it.
code_output = pretrained_model(input_ids=input_ids_input, attention_mask=attention_mask_input)[0]
classification_output = classification_head(code_output)
dense_softmax = tf.keras.layers.Activation("softmax")(classification_output)

# Input names match the feature keys of the tf.data pipelines.
model = tf.keras.Model(
    inputs=[code1_input_ids_input, code1_attention_mask_input, code2_input_ids_input, code2_attention_mask_input],
    outputs=dense_softmax,
)

# Labels are one-hot rows, hence categorical cross-entropy on the softmax output.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=["accuracy", tfa.metrics.F1Score(2)])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# NOTE(review): "0.0842" was recorded here without context — presumably a loss or score from an earlier run; confirm.
# Train for up to 100 epochs, validating on val_ds each epoch; the callbacks may stop training earlier.
history = model.fit(train_ds,
          validation_data=val_ds,
          callbacks=callbacks,
          epochs=100)

In [None]:
# Write the current weights to the "latest" checkpoint path.
model.save_weights(latest_checkpoint_path)

In [None]:
# Load the weights stored at the "latest" checkpoint path into the model.
model.load_weights(latest_checkpoint_path)

In [None]:
def test_preprocess_data(examples):
    """Clean and tokenize the `code1` / `code2` fields of a test example.

    Each field is passed through `remove_annotation`, tokenized with
    truncation at MAX_LEN, and stored on `examples` as `<column>_input_ids`
    and `<column>_attention_mask`. Returns the same mapping.
    """
    for column in ("code1", "code2"):
        cleaned = remove_annotation(examples[column])
        encoded = tokenizer(cleaned, padding=True, max_length=MAX_LEN, truncation=True)
        examples[f"{column}_input_ids"] = encoded["input_ids"]
        examples[f"{column}_attention_mask"] = encoded["attention_mask"]

    return examples

def test_convert_to_tensorflow_dataset(dataset):
    """Turn a tokenized, unlabelled pair dataset into a batched `tf.data.Dataset` of feature dicts.

    Each of the four token columns is padded with 0 to width MAX_LEN; batches
    hold BATCH_SIZE examples.
    """
    feature_names = ["code1_input_ids", "code1_attention_mask", "code2_input_ids", "code2_attention_mask"]
    dataset.set_format(type='tensorflow', columns=feature_names)

    features = {}
    for feature_name in feature_names:
        features[feature_name] = dataset[feature_name].to_tensor(default_value=0, shape=[None, MAX_LEN])

    return tf.data.Dataset.from_tensor_slices(features).batch(BATCH_SIZE)

# Load the test pairs, clean and tokenize them, and score them with the model.
test_dataset = load_dataset("csv", data_files="test.csv")["train"].map(
    test_preprocess_data, remove_columns=["code1", "code2"]
)
test_ds = test_convert_to_tensorflow_dataset(test_dataset)
preds = model.predict(test_ds)

# Store the arg-max class of every row in the `similar` column of the submission template.
predicted_labels = np.argmax(preds, axis=-1)
submission = pd.read_csv('./sample_submission.csv')
submission['similar'] = predicted_labels
submission.to_csv('./submission.csv', index=False)