# Regex Labeling

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import re
import csv
import json
import argparse

In [None]:
# Version of the regex graph; it is embedded in the graph and output file names.
GRAPH_VER = 6

# Name of the dataset column that holds the code to be matched.
CODE_COLUMN = 'code_block'

# Input dataset, the labeled output dataset, and the graph file (JSON content).
DATASET_PATH = './data/code_blocks_clean.csv'
OUTPUT_DATASET_PATH = f'./data/code_blocks_regex_graph_v{GRAPH_VER}.csv'
GRAPH_DIR = f'./graph/graph_v{GRAPH_VER}.txt'

In [None]:
# Read the dataset at DATASET_PATH as tab-separated, UTF-8 encoded text.
df = pd.read_csv(
    DATASET_PATH,
    sep='\t',
    encoding='utf-8',
)

In [None]:
# df = df[[CODE_COLUMN, 'tag']]

In [None]:
# Report how many rows have no code, then keep only the rows that do.
# reset_index() renumbers the rows from 0 and keeps the previous index as an
# 'index' column.
print(df[CODE_COLUMN].isnull().sum())
df = df[df[CODE_COLUMN].notna()].reset_index()

In [None]:
# def wordListToFreqDict(wordlist):
#     def sortFreqDict(freqdict):
#         aux = [(freqdict[key], key) for key in freqdict]
#         aux.sort()
#         aux.reverse()
#         return aux
#     wordfreq = [wordlist.count(p) for p in wordlist]
#     return sortFreqDict(dict(list(zip(wordlist,wordfreq))))

In [None]:
# tokens = (" ".join(" ".join(df['Code'].to_list()).split('\n')).split('.'))#.split('')
# wordListToFreqDict(tokens)

In [None]:
# %%time
# wordListToFreqDict(df.code.to_string())

In [None]:
# %%time
# nl2ml = pd.read_csv(KK_PATH + 'nl2ml.csv')
# nl2ml = nl2ml.rename({'':'code_block', '':'method_tag'})
# nl2ml_vis = nl2ml[nl2ml['method_tag'] == 'Visualization']
# tokens_visualization = wordListToFreqDict(nl2ml_vis.code_block.to_string())

In [None]:
def tokens_search(df, tokens, new_column_name, code_column=None):
    """Flag every row whose code matches at least one of the regex tokens.

    A new integer column ``new_column_name`` is written into ``df`` (in
    place): 1 where any token matches the row's code, 0 otherwise.

    Args:
        df: DataFrame holding the code to search; it is modified in place.
        tokens: Iterable of regular-expression strings. Every ``(`` in a
            token is escaped to ``\\(`` before compiling; the rest of the
            token is interpreted as regex syntax.
        new_column_name: Name of the 0/1 column to create (or overwrite).
        code_column: Name of the column with the code to search. Defaults
            to the module-level ``CODE_COLUMN``.

    Returns:
        The same ``df`` object, with the new column added.

    Raises:
        re.error: If a token is not a valid pattern after escaping.
    """
    if code_column is None:
        code_column = CODE_COLUMN

    # Compile once up front instead of rebuilding each pattern for every row.
    # The raw string avoids the invalid '\(' escape sequence.
    patterns = [re.compile(token.replace('(', r'\(')) for token in tokens]

    total = len(df)
    flags = []
    # Iterate positionally so the function works for any index, not only a
    # 0..n-1 RangeIndex.
    for position, code in enumerate(df[code_column].to_numpy()):
        # Carriage return keeps the progress indicator on a single line.
        print(str(round(100 * position / total, 1)) + '%\r', end='')
        matched = any(pattern.search(code) is not None for pattern in patterns)
        flags.append(1 if matched else 0)

    # A single whole-column assignment replaces the chained df[col][i] = 1
    # writes, which are unreliable (SettingWithCopyWarning / Copy-on-Write).
    df[new_column_name] = np.asarray(flags, dtype='int64')
    return df

## Regex Labels

In [None]:
# Load the regex graph from GRAPH_DIR, which holds JSON despite its extension.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(GRAPH_DIR, "r", encoding="utf-8") as graph_file:
    graph = json.load(graph_file)

In [None]:
%%time
# Add one 0/1 column to df per graph key, using that key's token list.
# Iterating over items() avoids rebuilding the list of keys on every pass.
for vertice, tokens in graph.items():
    print('\n' + vertice)
    df = tokens_search(df, tokens, vertice)

In [None]:
# Show how many rows were labeled 0 and 1 in the 'preprocessing' column.
df.loc[:, 'preprocessing'].value_counts()

In [None]:
# Write the labeled dataset to OUTPUT_DATASET_PATH without the row index.
df.to_csv(path_or_buf=OUTPUT_DATASET_PATH, index=False)

In [None]:
# for col in list(graph.keys()):
#     display(df[df['Data format'] == 'Table'][col].value_counts())

In [None]:
# Marker printed once the cells above have run.
print('finished')

## Regex Evaluation

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import dagshub

In [None]:
# NOTE: this rebinds GRAPH_VER (set to 6 in the labeling section above); the
# evaluation below reads the '<tag>_regex_v5' columns.
GRAPH_VER = 5
REGEXED_DATA_PATH = "./data/golden_884_set.csv"

# Ground-truth label columns.
TAGS = [
    'import',
    'data_import',
    'data_export',
    'preprocessing',
    'visualization',
    'model',
    'deep_learning_model',
    'train',
    'predict',
]
# Matching regex-prediction columns, in the same order as TAGS.
REGEX_TAGS = [f'{tag}_regex_v{GRAPH_VER}' for tag in TAGS]
# Load the evaluation set and preview its first rows.
regexed_data = pd.read_csv(filepath_or_buffer=REGEXED_DATA_PATH)
regexed_data.head(n=5)

In [None]:
# Ground-truth columns versus the regex-predicted columns.
Y_test = regexed_data[TAGS]
Y_pred = regexed_data[REGEX_TAGS]

In [None]:
# Weighted-average F1, precision and recall over all tag columns at once.
regex_results = {
    'test_f1_score': f1_score(Y_test, Y_pred, average='weighted'),
    'test_precision': precision_score(Y_test, Y_pred, average='weighted'),
    'test_recall': recall_score(Y_test, Y_pred, average='weighted'),
}
base_f1 = regex_results['test_f1_score']
base_precision = regex_results['test_precision']
base_recall = regex_results['test_recall']
regex_results

In [None]:
# Per-tag breakdown: (f1, precision, recall), each rounded to 4 decimals,
# comparing the i-th ground-truth column with the i-th predicted column.
for i, tag in enumerate(TAGS):
    tag_results = tuple(
        round(metric(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4)
        for metric in (f1_score, precision_score, recall_score)
    )
    print(tag, tag_results, '------', sep='\n')

In [None]:
# Run description logged as hyperparameters alongside the aggregate metrics.
data_meta = dict(
    DATASET_PATH=REGEXED_DATA_PATH,
    nrows=regexed_data.shape[0],
    graph_ver=GRAPH_VER,
    label=TAGS,
    model=f'regex_v{GRAPH_VER}',
    script_dir='./regex.ipynb',
    task='regex evaluation',
)

with dagshub.dagshub_logger() as logger:
    logger.log_hyperparams(data_meta)
    logger.log_metrics(regex_results)