In [8]:
import pickle
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

sys.path.append('../src/')
from config import *
from preprocessing import *

In [2]:
# Target label columns; used downstream as the column names of the predictions.
label_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Test set to score; `test_file` is expected to come from the star-imported
# config and is resolved relative to the parent directory.
data = pd.read_csv(f'../{test_file}')

In [3]:
class Preprocessor:
    """Inference-time text preprocessing pipeline built from pickled, pre-fitted models.

    On construction the fitted TF-IDF vectorizer, feature selector, PCA, scaler
    and K-Means models are loaded from disk. `transform` (single text) and
    `transform_array` (many texts) clean the input, vectorize it and then push
    it through the same sequence of stages: optional feature selection, PCA,
    scaling and K-Means.
    """

    def __init__(self):
        """Load every fitted artifact and bind the helper functions to the instance.

        The artifact paths are module-level names (expected to come from the
        star-imported config) resolved relative to the parent directory.
        """
        # NOTE(security): pickle.load can execute arbitrary code while loading;
        # only point these paths at artifacts you produced yourself.
        with open(f'../{tfidf_vectorizer_model}', 'rb') as file:
            self.tfidf_vectorizer = pickle.load(file)
        with open(f'../{selector_model}', 'rb') as file:
            self.selector = pickle.load(file)
        with open(f'../{pca_model}', 'rb') as file:
            self.pca = pickle.load(file)
        with open(f'../{scaler_model}', 'rb') as file:
            self.scaler = pickle.load(file)
        with open(f'../{kmeans_model}', 'rb') as file:
            # The attribute keeps its historical spelling ("kemans") so that
            # instances pickled earlier and any external readers keep working.
            self.kemans = pickle.load(file)

        # Helper functions are stored on the instance and always called through
        # `self`. NOTE(review): presumably so that a pickled instance carries
        # references to its own helpers - confirm.
        self.clean_text = clean_text
        self.vectorize = vectorize
        self.select_features = select_features
        self.reduce_dimentions = reduce_dimentions
        self.scale = scale
        self.predict_kmeans = predict_kmeans

    def _run_feature_stages(self, tfidf):
        """Apply the post-vectorization stages shared by both public methods.

        Args:
            tfidf: Output of `self.vectorize`.

        Returns:
            Whatever `self.predict_kmeans` returns for the selected, reduced
            and scaled features.
        """
        # Feature selection (toggled by the module-level `use_feature_selection` flag)
        if use_feature_selection:
            # select_features returns a 3-tuple; only the first element is used here.
            tfidf, _, _ = self.select_features(
                tfidf,
                self.tfidf_vectorizer.get_feature_names_out(),
                self.selector,
            )
        # PCA
        tfidf = self.reduce_dimentions(tfidf, self.pca)
        # Scale [0, 1]
        tfidf = self.scale(tfidf, self.scaler)
        # K-Means clusterization
        return self.predict_kmeans(tfidf, self.kemans)

    def transform(self, text):
        """Preprocess a single raw text.

        Args:
            text: One raw comment; it is passed to `self.clean_text` as-is.

        Returns:
            The pipeline output for that text. (Previously this method ran the
            whole pipeline but fell off the end and returned None.)
        """
        # Clean text
        text = self.clean_text(text)
        # Vectorize: wrap the single text in a one-element array, the same
        # container the original code handed to the vectorizer.
        tfidf = self.vectorize(np.array([text]), self.tfidf_vectorizer)
        return self._run_feature_stages(tfidf)

    def transform_array(self, text):
        """Preprocess a collection of raw texts.

        Args:
            text: Object exposing `.apply(func)` (the notebook passes a pandas
                Series of comments); `self.clean_text` is applied element-wise.

        Returns:
            The pipeline output for the whole collection.
        """
        # Clean text
        text = text.apply(self.clean_text)
        # Vectorize
        tfidf = self.vectorize(text, self.tfidf_vectorizer)
        return self._run_feature_stages(tfidf)

In [4]:
# Build the pipeline from the persisted artifacts, then featurize every
# comment of the test set in a single pass.
preprocessor = Preprocessor()
X_tfidf = preprocessor.transform_array(data['comment_text'])

# Persist the assembled preprocessor as one pickle.
# NOTE(review): the class is defined in this notebook, so whoever unpickles
# this file needs a compatible `Preprocessor` definition in scope - confirm
# how consumers load it.
with open(f'../{preprocessor_model}', 'wb') as out_file:
    pickle.dump(preprocessor, out_file)

In [5]:
# Load the trained booster saved as JSON under a versioned file name.
model = xgb.Booster()
model.load_model(f'../{xgb_model}-v{xgb_model_version}.json')

# Score the preprocessed test features.
dtest = xgb.DMatrix(X_tfidf)
y_pred_raw = model.predict(dtest)

# pd.concat(axis=1) aligns rows by index label, not by position. Building the
# prediction frame on data's own index keeps every id paired with its own
# predictions even if `data` ever stops having a default RangeIndex, and makes
# the constructor raise if the number of predictions differs from the rows.
y_pred = pd.DataFrame(y_pred_raw, columns=label_names, index=data.index)
submission_df = pd.concat([data[['id']], y_pred], axis=1)

# Write the submission without the index column.
submission_df.to_csv(f'../{submission_file}', index=False)

In [6]:
# Sanity check on the submission size: the columns should be `id` plus the six label columns.
submission_df.shape

(63978, 7)

In [7]:

# Preview the first five rows to eyeball the ids and the predicted values.
submission_df.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0.056288,0.003286,0.014745,0.00011,0.053132,0.001079
1,000247e83dcc1211,0.287195,0.00065,0.029591,0.000443,0.05788,0.00135
2,0002f87b16116a7f,0.039273,0.000379,0.012417,0.000143,0.007752,0.003941
3,0003e1cccfd5a40a,0.009487,9.6e-05,0.003012,1.8e-05,0.002366,0.000519
4,00059ace3e3e9a53,0.001576,4.4e-05,0.000839,4.2e-05,0.0025,0.000158
