In [1]:
import warnings
# Install a global 'ignore' filter: every warning raised after this point
# (including deprecation notices from third-party libraries) is suppressed.
warnings.filterwarnings('ignore')

In [2]:
# Execute the helper notebooks via IPython's %run so the names they define
# become available to the cells below.
# NOTE(review): `Model`, used by train_model, is not defined in the cells
# visible here -- presumably it comes from model.ipynb; confirm.
%run utils.ipynb
%run model.ipynb
%run feature_engineering.ipynb

In [3]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.utils import class_weight
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [4]:
def train_model(train_data_dict, train_labels, inter_path):

    model_A = XGBClassifier(
        objective='multi:softprob',
        num_class=np.unique(train_labels).shape[0],
        max_depth=6,
        n_estimators=90,
        learning_rate=0.1,
        eval_metric='mlogloss',
        use_label_encoder=False
    )

    classes_weights = class_weight.compute_sample_weight(
        class_weight='balanced',
        y=train_labels
    )

    labels_loss = pd.DataFrame()
    print(f"------------------------ Training ------------------------")
    for name, train_data in train_data_dict.items():

        if name in ['words_1000', 'ins_1000', 'ember_section_ins_words', 'ember_section_ins_semantic']:
            # Implement feature selection for tf-idf features
            selector = SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=200))
            train_data[np.isnan(train_data)] = 0.0
            selector.fit(train_data, train_labels, sample_weight=classes_weights)

            joblib.dump(selector, open(f"{inter_path}/models/select_model_{name}.pth", "wb"))
            train_data = selector.transform(train_data)

        clf = Model(model_A, train_data, train_labels, name, inter_path, labels_loss)
        
        print(name)
        %time clf.Fit()
        
        labels_loss[name] = clf.get_class_weight()

    labels_loss[np.isnan(labels_loss)] = 0
    labels_loss[labels_loss < 0] = 0
    labels_loss.to_csv(f"{inter_path}/feature/labels_loss.csv", index=False)