In [19]:
import dagshub
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from enum import Enum

from common.tools import *

import svm_for_semi as sfs

# Column names shared by the labelled and unlabelled code-block datasets.
TARGET_COLUMN = 'graph_vertex_id'  # label column, used as y when fitting the SVM
FEATURE_COLUMN = 'code_block'      # text column passed to the TF-IDF helpers
RANDOM_STATE = 42
GRAPH_VER = "7"                    # version tag embedded in the artefact file names below

# get_idxs() samples through NumPy's global RNG; seed it so that a fresh
# "Restart & Run All" reproduces the same random pseudo-label selection.
np.random.seed(RANDOM_STATE)

MODEL_DIR = "../models/hyper_noisy_regex_graph_v{}.sav".format(GRAPH_VER)
TFIDF_DIR = "../models/tfidf_hyper_noisy_graph_v{}.pickle".format(GRAPH_VER)
DATA_PATH = "../data/markup_data_2021-05-06.csv"
UNMARKED_DATA_PATH = "../data/not_yet_markup_data_2021-05-06.csv"
SEMI_ITER = 3

# Cross-validation settings forwarded to the hyper-parameter search.
kfold_params = {
    "n_splits": 9,
    "random_state": RANDOM_STATE,
    "shuffle": True,
}

# Presumably the labelled ("markup") and not-yet-labelled snapshots of the
# same dataset -- inferred from the file names; confirm against the data.
data = pd.read_csv(DATA_PATH)
unlabeled_data = pd.read_csv(UNMARKED_DATA_PATH)

class SemiType(Enum):
    """Strategy for choosing which pseudo-labelled rows join the training set.

    `semi_experiments` branches on this value to decide how rows are picked
    from the unlabelled pool on each iteration.
    """
    RANDOM = 1          # rows picked via get_idxs (random sample)
    MOST_ACCURATE = 2   # rows picked via get_idxs_most_accurate (highest max class probability first)
    LEAST_ACCURATE = 3  # rows picked via get_idxs_most_accurate with reverse=True (lowest first)

def find_hyperparams(pseudo_df, data, kfold_params, TFIDF_DIR, MODEL_DIR, use_proba=False):
    """Run the hyper-parameter search implemented in ``svm_for_semi``.

    Thin pass-through: every argument is forwarded positionally, in the same
    order, to ``sfs.select_hyperparams`` and its result is returned untouched.
    Callers in this notebook unpack the result as
    ``(tfidf_params, svm_params, metrics)``.
    """
    forwarded = (pseudo_df, data, kfold_params, TFIDF_DIR, MODEL_DIR, use_proba)
    return sfs.select_hyperparams(*forwarded)

def get_idxs(data, rate, target=None, flag=False):
    """Randomly sample a fraction of the rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain FEATURE_COLUMN, and TARGET_COLUMN when ``flag`` is False.
    rate : float
        Fraction of rows to draw; ``int(len(data) * rate)`` rows are returned.
    target : array-like, optional
        Labels aligned *by position* with the rows of ``data``. Required when
        ``flag`` is True.
    flag : bool
        True  -> take the labels from ``target`` (pseudo-labels);
        False -> take the labels from ``data[TARGET_COLUMN]``.

    Returns
    -------
    idx : numpy.ndarray
        Positional (``iloc``) indices of the sampled rows. These are NOT index
        labels; use ``data.index[idx]`` before passing them to ``drop``.
    new_blocks : pandas.DataFrame
        The sampled rows with columns FEATURE_COLUMN and TARGET_COLUMN,
        keeping the original index labels of ``data``.

    Raises
    ------
    TypeError
        If ``flag`` is True but no ``target`` was supplied.
    """
    if flag and target is None:
        raise TypeError("get_idxs: `target` is required when flag=True")

    n_rows = data.shape[0]
    # Draws from NumPy's global RNG, exactly like shuffling arange(n_rows).
    idx = np.random.permutation(n_rows)[:int(n_rows * rate)]

    new_blocks = data[[FEATURE_COLUMN]].iloc[idx].copy()
    if flag:
        # np.asarray forces positional lookup even if `target` is a Series.
        new_blocks[TARGET_COLUMN] = np.asarray(target)[idx]
    else:
        # Positional as well, so duplicate index labels cannot break alignment.
        new_blocks[TARGET_COLUMN] = data[TARGET_COLUMN].to_numpy()[idx]
    return idx, new_blocks

def get_idxs_most_accurate(data, rate, target, target_proba, reverse=False):
    """Select the rows whose predicted label is most (or least) confident.

    Confidence of a row is the maximum value in its row of ``target_proba``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain FEATURE_COLUMN.
    rate : float
        Fraction of rows to select; ``int(len(data) * rate)`` rows are returned.
    target : array-like
        Predicted labels, aligned by position with the rows of ``data``.
    target_proba : array-like, 2-D
        One row of class probabilities per row of ``data``.
    reverse : bool
        False -> highest-confidence rows first; True -> lowest-confidence first.

    Returns
    -------
    idx : numpy.ndarray
        Positional (``iloc``) indices of the selected rows, in ranking order.
    new_blocks : pandas.DataFrame
        The selected rows with columns FEATURE_COLUMN and TARGET_COLUMN.
    """
    max_proba = np.asarray(target_proba).max(axis=1)
    # Only one sort is needed; which one depends on the requested direction.
    ranking = max_proba.argsort() if reverse else (-max_proba).argsort()
    idx = ranking[:int(data.shape[0] * rate)]

    new_blocks = data[[FEATURE_COLUMN]].iloc[idx].copy()
    # np.asarray forces positional lookup even if `target` is a Series.
    new_blocks[TARGET_COLUMN] = np.asarray(target)[idx]
    return idx, new_blocks

def get_pseudo(data, unlabeled_data, best_tfidf_params, best_svm_params, use_proba=False):
    """Fit an SVM on the labelled data and predict labels for the unlabelled rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled rows; FEATURE_COLUMN is vectorised and TARGET_COLUMN is y.
    unlabeled_data : pandas.DataFrame
        Rows to pseudo-label; only FEATURE_COLUMN is read.
    best_tfidf_params : dict
        Passed to ``tfidf_fit_transform`` / ``tfidf_transform``.
    best_svm_params : dict
        Keyword arguments for ``SVC``. The dict is not modified.
    use_proba : bool
        When True, class probabilities are returned as well. ``SVC`` only
        exposes ``predict_proba`` if it was built with ``probability=True``,
        so that flag is forced on for this call.

    Returns
    -------
    numpy.ndarray, or (numpy.ndarray, numpy.ndarray) when ``use_proba`` is True
        Predicted labels, optionally followed by the class-probability matrix.

    Notes
    -----
    Both TF-IDF helpers are given the module-level TFIDF_DIR path (presumably
    where the fitted vectoriser is stored and reloaded -- confirm in
    ``common.tools``).
    scikit-learn documents that ``predict_proba`` may be inconsistent with
    ``predict`` for SVC, so the arg-max of the probabilities can differ from
    the returned label.
    """
    svm_params = dict(best_svm_params)
    if use_proba:
        svm_params["probability"] = True
    clf = SVC(**svm_params)

    X = tfidf_fit_transform(data[FEATURE_COLUMN], best_tfidf_params, TFIDF_DIR)
    y = data[TARGET_COLUMN].values
    clf.fit(X, y)

    unl_tfidf = tfidf_transform(unlabeled_data[FEATURE_COLUMN], best_tfidf_params, TFIDF_DIR)
    predicted = clf.predict(unl_tfidf)
    if use_proba:
        return predicted, clf.predict_proba(unl_tfidf)
    return predicted

def semi_baseline(tfidf_path=None, model_path=None, rate=0.001):
    """Run the hyper-parameter search once without any real pseudo-labels.

    A small random sample of the module-level labelled ``data`` is removed
    from the training frame and passed to ``find_hyperparams`` in the
    pseudo-label slot, with its ground-truth labels.

    Parameters
    ----------
    tfidf_path : str, optional
        Path handed to ``find_hyperparams`` for the TF-IDF artefact.
        Defaults to the module-level TFIDF_DIR.
    model_path : str, optional
        Path handed to ``find_hyperparams`` for the model artefact.
        Defaults to the module-level MODEL_DIR.
    rate : float
        Fraction of ``data`` moved into the pseudo-label slot.

    Returns
    -------
    Whatever ``find_hyperparams`` returns; callers in this notebook unpack it
    as ``(tfidf_params, svm_params, metrics)``.
    """
    if tfidf_path is None:
        tfidf_path = TFIDF_DIR
    if model_path is None:
        model_path = MODEL_DIR

    real_idx, temp_pseudo = get_idxs(data, rate)
    # get_idxs returns positions; drop() expects index labels.
    train_data = data.drop(index=data.index[real_idx])
    return find_hyperparams(temp_pseudo, train_data, kfold_params, tfidf_path, model_path)

In [13]:
def semi_experiments(best_tfidf_params, best_svm_params,
                     data, unlabeled_data,
                     type_name,
                     tfidf_dir_name, model_dir_name, metrics_path, params_path,
                     tfidf_dir_base, model_dir_base,
                     iter_number=2, rate=0.2):
    """Run an iterative pseudo-labelling experiment and log the final result.

    Steps:
      1. Baseline hyper-parameter search on ``data`` (no real pseudo-labels).
      2. ``iter_number`` times: pseudo-label the still-unused unlabelled rows
         with the current best parameters, move a ``rate`` fraction of them
         into the pseudo-labelled set according to ``type_name``, and repeat
         the hyper-parameter search with the enlarged pseudo-labelled set.
      3. Log the final parameters and metrics through ``dagshub``.

    Parameters
    ----------
    best_tfidf_params, best_svm_params
        Kept for call compatibility; both are replaced by the result of the
        baseline search before they are first used.
    data : pandas.DataFrame
        Labelled rows.
    unlabeled_data : pandas.DataFrame
        Pool of rows to pseudo-label. Not modified.
    type_name : SemiType
        Selection strategy (a raw enum value is accepted too). It has no
        default because required parameters follow it in the signature.
    tfidf_dir_name, model_dir_name : str
        Artefact paths for the per-iteration searches.
    metrics_path, params_path : str
        Output files for the dagshub logger.
    tfidf_dir_base, model_dir_base : str
        Artefact paths for the baseline search.
    iter_number : int
        Number of pseudo-labelling rounds.
    rate : float
        Fraction of the remaining unlabelled rows added per round.

    Returns
    -------
    tuple
        ``(best_tfidf_params, best_svm_params, metrics)`` of the last search.

    Raises
    ------
    ValueError
        If ``type_name`` is not a valid ``SemiType``.
    """
    strategy = SemiType(type_name)

    # Baseline search: runs on the `data` argument with the dedicated base
    # artefact paths. A tiny ground-truth sample fills the pseudo-label slot.
    held_out_pos, held_out_blocks = get_idxs(data, 0.001)
    baseline_train = data.drop(index=data.index[held_out_pos])
    best_tfidf_params, best_svm_params, metrics = find_hyperparams(
        held_out_blocks, baseline_train, kfold_params, tfidf_dir_base, model_dir_base)

    remaining = unlabeled_data  # rows not yet moved into the pseudo-labelled set
    pseudo_frames = []
    for i in range(iter_number):
        print('Iteration ', i)
        if remaining.empty:
            print('no unlabeled rows left')
            break
        print('create new pseudo targets')
        if strategy is SemiType.RANDOM:
            pseudo_target = get_pseudo(data, remaining, best_tfidf_params, best_svm_params)
            new_idx, new_blocks = get_idxs(remaining, rate, pseudo_target, True)
        else:
            pseudo_target, pseudo_target_proba = get_pseudo(
                data, remaining, best_tfidf_params, best_svm_params, True)
            new_idx, new_blocks = get_idxs_most_accurate(
                remaining, rate, pseudo_target, pseudo_target_proba,
                strategy is SemiType.LEAST_ACCURATE)

        # The selectors return positions within `remaining`; translate them to
        # index labels so exactly the chosen rows leave the pool.
        remaining = remaining.drop(index=remaining.index[new_idx])
        pseudo_frames.append(new_blocks)
        pseudo_blocks = pd.concat(pseudo_frames)

        print('start hyperparam search')
        best_tfidf_params, best_svm_params, metrics = find_hyperparams(
            pseudo_blocks, data, kfold_params, tfidf_dir_name, model_dir_name)
        print('finish search')
        print('Metrics are', metrics, '\n')

    data_meta = {
        "DATASET_PATH": DATA_PATH,
        "nrows": data.shape[0],
        "label": '',
        "model": model_dir_name,
        "script_dir": 'nl2ml/models_scripts/semi_experiment.ipynb',
    }

    with dagshub.dagshub_logger(metrics_path=metrics_path, hparams_path=params_path) as logger:
        print("logging the results")
        logger.log_hyperparams({"data": data_meta})
        logger.log_hyperparams({"tfidf": best_tfidf_params})
        logger.log_hyperparams({"model": best_svm_params})
        logger.log_hyperparams({"kfold": kfold_params})
        logger.log_metrics(metrics)
    return best_tfidf_params, best_svm_params, metrics