In [3]:
# Install a global 'ignore' filter: every Python warning raised after this
# cell runs is suppressed (presumably to keep the notebook output readable).
import warnings
warnings.filterwarnings('ignore')

In [4]:
%run utils.ipynb
%run model.ipynb
%run feature_engineering.ipynb

In [2]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd

In [1]:
def submit_result(inter_path, result_np, result_name):
    """Write per-family prediction scores to a CSV submission file.

    The file is written to ``{inter_path}/prediction_result`` with any
    ``/user_data`` segment stripped from that path. The directory is created
    when it does not exist yet. The CSV has a ``filename`` column followed by
    one ``family_<i>`` column per column of ``result_np``.

    Args:
        inter_path: Directory containing ``test_filename.txt``, a
            whitespace-separated list of test file names.
        result_np: 2-D array (anything exposing ``.shape``) with one row per
            test file; rows are paired positionally with the file names.
        result_name: Base name of the output file, without ``.csv``.

    Raises:
        ValueError: If the number of rows in ``result_np`` differs from the
            number of names read from ``test_filename.txt``.
    """
    result_path = f"{inter_path}/prediction_result".replace('/user_data', '')
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(result_path, exist_ok=True)

    with open(f"{inter_path}/test_filename.txt", 'r') as fp:
        test_filename = fp.read().split()

    n_rows, n_families = result_np.shape[0], result_np.shape[1]
    if n_rows != len(test_filename):
        raise ValueError(
            f"result_np has {n_rows} rows but test_filename.txt lists "
            f"{len(test_filename)} file names"
        )

    fam_cols = [f"family_{i}" for i in range(n_families)]
    # Build the score table in one step instead of assigning several new
    # columns to an empty frame, then put the file names in front.
    result = pd.DataFrame(result_np, columns=fam_cols)
    result.insert(0, 'filename', test_filename)
    result.to_csv(f"{result_path}/{result_name}.csv", index=False)

In [6]:
def predict_result(feature_list, inter_path):
    """Predict with every per-feature model and return the weighted vote.

    Each feature set returned by ``load_data('test', ...)`` is fed to a
    ``Model`` built with the feature name as its label. Four named feature
    sets are first passed through a saved selector loaded from
    ``{inter_path}/models/select_model_<name>.pth``, with NaNs zeroed.
    The list of per-model predictions is saved to
    ``{inter_path}/vote_list_fu.npy`` and then combined by
    ``vote_weight_results`` using ``{inter_path}/feature/labels_loss.csv``.

    Args:
        feature_list: Forwarded to ``load_data`` and ``vote_weight_results``.
        inter_path: Root directory holding the models, features and outputs.

    Returns:
        The value returned by ``vote_weight_results``.
    """
    # Feature sets that go through a fitted selector before prediction.
    selected_features = ('words_1000', 'ins_1000', 'ember_section_ins_words', 'ember_section_ins_semantic')

    vote_list = []
    test_data_dict = load_data('test', feature_list, inter_path)
    for name, test_data in test_data_dict.items():
        if name in selected_features:
            # Implement feature selection for tf-idf features
            # NOTE(review): joblib.load unpickles; only load trusted model files.
            # The with-block closes the handle, which the bare open() never did.
            with open(f"{inter_path}/models/select_model_{name}.pth", "rb") as selector_file:
                selector = joblib.load(selector_file)
            # Zeroes NaNs in place, i.e. in the object held by test_data_dict.
            test_data[np.isnan(test_data)] = 0.0
            test_data = selector.transform(test_data)

        clf = Model(label=name, inter_path=inter_path)
        y_test = clf.Predict(test_data)
        vote_list.append(y_test)

    labels_loss = pd.read_csv(f"{inter_path}/feature/labels_loss.csv")
    np.save(f"{inter_path}/vote_list_fu.npy", vote_list)
    vote_weight = vote_weight_results(labels_loss, vote_list, feature_list)  # Weighted results by family
    # vote = vote_results(vote_list)  # Average results by family
    return vote_weight

In [None]:
def predict_result_base(feature_list, inter_path):
    """Predict with each per-feature model and write one CSV per feature set.

    Each feature set returned by ``load_data('test', ...)`` is fed to a
    ``Model`` built with the feature name as its label. Four named feature
    sets are first passed through a saved selector loaded from
    ``{inter_path}/models/select_model_<name>.pth``, with NaNs zeroed.
    The prediction time is printed and the prediction is written through
    ``submit_result`` as ``result_<name>``.

    Args:
        feature_list: Forwarded to ``load_data``.
        inter_path: Root directory holding the models, features and outputs.

    Returns:
        None.
    """
    # Feature sets that go through a fitted selector before prediction.
    selected_features = ('words_1000', 'ins_1000', 'ember_section_ins_words', 'ember_section_ins_semantic')

    test_data_dict = load_data('test', feature_list, inter_path)
    print("------------------------ Prediction ------------------------")
    for name, test_data in test_data_dict.items():

        if name in selected_features:
            # Implement feature selection for tf-idf features
            # NOTE(review): joblib.load unpickles; only load trusted model files.
            # The with-block closes the handle, which the bare open() never did.
            with open(f"{inter_path}/models/select_model_{name}.pth", "rb") as selector_file:
                selector = joblib.load(selector_file)
            # Zeroes NaNs in place, i.e. in the object held by test_data_dict.
            test_data[np.isnan(test_data)] = 0.0
            test_data = selector.transform(test_data)

        clf = Model(label=name, inter_path=inter_path)

        # Time only the prediction call, not model construction or selection.
        t1 = time.time()
        y_test = clf.Predict(test_data)
        t2 = time.time()
        print(name)
        print(f"Wall time: {t2 - t1} s")

        submit_result(inter_path, y_test, f"result_{name}")

In [1]:
def predict_result_vote(feature_list, inter_path):
    """Predict with every per-feature model and submit the averaged vote.

    Each feature set returned by ``load_data('test', ...)`` is fed to a
    ``Model`` built with the feature name as its label. Four named feature
    sets are first passed through a saved selector loaded from
    ``{inter_path}/models/select_model_<name>.pth``, with NaNs zeroed.
    The list of per-model predictions is saved to
    ``{inter_path}/vote_list_base.npy``, which ``predict_result_weighted``
    reads back. It is then combined by ``vote_results`` and written through
    ``submit_result`` as ``result_vote``.

    Args:
        feature_list: Forwarded to ``load_data``.
        inter_path: Root directory holding the models, features and outputs.

    Returns:
        None.
    """
    # Feature sets that go through a fitted selector before prediction.
    selected_features = ('words_1000', 'ins_1000', 'ember_section_ins_words', 'ember_section_ins_semantic')

    vote_list = []
    test_data_dict = load_data('test', feature_list, inter_path)
    print("------------------------ Prediction (vote) ------------------------")
    for name, test_data in test_data_dict.items():
        if name in selected_features:
            # Implement feature selection for tf-idf features
            # NOTE(review): joblib.load unpickles; only load trusted model files.
            # The with-block closes the handle, which the bare open() never did.
            with open(f"{inter_path}/models/select_model_{name}.pth", "rb") as selector_file:
                selector = joblib.load(selector_file)
            # Zeroes NaNs in place, i.e. in the object held by test_data_dict.
            test_data[np.isnan(test_data)] = 0.0
            test_data = selector.transform(test_data)
        clf = Model(label=name, inter_path=inter_path)
        y_test = clf.Predict(test_data)
        vote_list.append(y_test)

    np.save(f"{inter_path}/vote_list_base.npy", vote_list)
    vote = vote_results(vote_list)  # Average results by family
    submit_result(inter_path, vote, "result_vote")

In [2]:
def predict_result_weighted(feature_list, inter_path):
    """Submit the weighted combination of previously saved predictions.

    Reads ``{inter_path}/feature/labels_loss.csv`` and the prediction list
    stored in ``{inter_path}/vote_list_base.npy`` (the file written by
    ``predict_result_vote``). Both are passed with ``feature_list`` to
    ``vote_weight_results``, and the outcome is written through
    ``submit_result`` as ``result_weighted``.

    Args:
        feature_list: Forwarded to ``vote_weight_results``.
        inter_path: Root directory holding the features and outputs.

    Returns:
        None.
    """
    print("------------------------ Prediction (weighted) ------------------------")
    loss_table = pd.read_csv(f"{inter_path}/feature/labels_loss.csv")
    # NOTE(review): allow_pickle=True can execute code from the file; only
    # load arrays produced by this pipeline.
    saved_votes = np.load(f"{inter_path}/vote_list_base.npy", allow_pickle=True)
    # Weighted results by family
    submit_result(
        inter_path,
        vote_weight_results(loss_table, saved_votes, feature_list),
        "result_weighted",
    )