In [44]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from copy import deepcopy
# Data manipulation libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,RepeatedStratifiedKFold,GridSearchCV
# Model evaluaion libraries
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, classification_report, f1_score, make_scorer

# Classiication Models libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,plot_importance
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from navec import Navec
from sklearn.tree import DecisionTreeClassifier

# Remove warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

RANDOM_STATE = 42
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None

In [37]:
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [51]:
def get_df(razmetka: pd.Series):
    data = pd.DataFrame(data = {'слова' : razmetka.apply(lambda x: x.keys()).explode(0).values,
                    'метки' : razmetka.apply(lambda x: x.values()).explode().values})
    data = data[data.метки != '-'][data.метки != '1']
    return data

In [52]:
def prepr(data : pd.DataFrame):
    data.метки = data.метки.apply(lambda x: 'бренд' if x in ['бред'] else x)
    data.метки = data.метки.apply(lambda x: 'наз' if x in ['назн'] else x)
    data.метки = data.метки.apply(lambda x: 'раз' if x in ['размер', 'разм'] else x)
    data.метки = data.метки.apply(lambda x: 'тов' if x in ['товар'] else x)
    data.метки = data.метки.apply(lambda x: 'мат' if x in ['материал'] else x)
    data.метки = data.метки.apply(lambda x: 'мод' if x in ['модель'] else x)

In [53]:
def get_features(X):
    
    for i in range(len(X.iloc[0, 0])):
        X[f'feat_{i+1}'] = X.Vec.apply(lambda x: x[i])
    X = X.drop('Vec', axis = 1)
    return X

In [54]:
data = pd.read_json('data/supl_biz_157k.json')
preprocessed = pd.read_json('razmetka/01_05.json')
dataset = preprocessed[preprocessed.Razme4.notna()].Razme4
dataset = get_df(dataset)
prepr(dataset)
dataset = dataset[dataset.метки != 'цвет']
dataset['Vec'] = dataset.слова.apply(lambda x: navec.get(x))
dataset = dataset[dataset.Vec.notna()]
encoder = LabelEncoder()
y = encoder.fit_transform(dataset.метки)
X = dataset[['Vec']]
X = get_features(X)

In [55]:
assert X.values.shape[0]  == y.ravel().shape[0]

In [56]:
class Splitter:
    def __init__(self, kfold = True, n_repeats = None, n_splits = 5, shuffle = True):
        self.kfold = kfold
        self.n_repeats = n_repeats
        self.n_splits = n_splits
        self.shuffle = shuffle
        
    def split_data(self, X, y):
        if self.n_repeats:
            kf = RepeatedStratifiedKFold(n_repeats = self.n_repeats, n_splits = self.n_splits, shuffle = self.shuffle)
        
        else:
            kf = StratifiedKFold(n_splits = self.n_splits, shuffle = self.shuffle)
        
        for train_idx, val_idx in kf.split(X, y):
            X_train, X_val = X[train_idx, :], X[val_idx, :]
            y_train, y_val = y[train_idx], y[val_idx]
            yield X_train, X_val, y_train, y_val
    

In [57]:
class models_summary():
    def __init__(self, classifiers =  {'SVC':SVC(), 
                            'RF':RandomForestClassifier(), 
                            'CatBoost':CatBoostClassifier(silent=True), 
                            'LogReg':LogisticRegression()}):
        self.classifiers = classifiers
        self.cv_results = dict(zip(list(self.classifiers.keys()), [None] * len(self.classifiers))) 
         
        self.holdout_results = dict(zip(list(self.classifiers.keys()), [None] * len(self.classifiers)))
                
        
    def fit(self, X_ : np.array, y_ : np.array, holdout_size = 0.2, random_state = None, n_splits = 5):
        self.n_splits = n_splits
        self.holdout_size = holdout_size
        self.X, self.X_holdout, self.y, self.y_holdout = train_test_split(X_, y_,\
                                                                          test_size=self.holdout_size, stratify = y_)

        for classifier_name in list(self.classifiers.keys()):
            splitter = Splitter(n_splits=self.n_splits)
            model = self.classifiers[classifier_name]
            self.cv_results[classifier_name] = []
            for X_train, X_val, y_train, y_val in splitter.split_data(self.X, self.y):
                model.fit(X_train, y_train)
                score = model.score(X_val, y_val)
                self.cv_results[classifier_name].append(score)

            holdout_score = model.score(self.X_holdout, self.y_holdout)
            self.holdout_results[classifier_name] = holdout_score

            print(f'{classifier_name} fitted,\nHoldout Result = {holdout_score}')
        return pd.DataFrame({'model_name' : list(self.classifiers.keys()),
                            'holdout_score' : list(self.holdout_results.values())})

        
        
        
        

In [58]:
summary = models_summary()
df = summary.fit(X.values, y, n_splits = 4)

SVC fitted,
Holdout Result = 0.6630434782608695
RF fitted,
Holdout Result = 0.6630434782608695
CatBoost fitted,
Holdout Result = 0.7282608695652174
LogReg fitted,
Holdout Result = 0.6956521739130435


## Split Data

In [59]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size = 0.2)

## Grids

In [60]:
RF_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['sqrt'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CatBoost_grid = {'depth'         : [4,5,6,7,8],
                 'learning_rate' : [0.01,0.02,0.03,0.04],
                  'iterations'    : [10, 20,30,40,50,60,70]
                 }

logreg_grid = {'penalty' : ['l2', 'l1'],
               'C' : [0.5, 0.8, 1]
}

DecisionTree_grid = {'max_depth' : [2,4,6,8,10,12]}


## Decision Tree 

In [14]:
clf = DecisionTreeClassifier()

In [15]:
f1_weighted = make_scorer(f1_score, average = 'weighted')

In [16]:
grid = {
    'max_depth' : range(2, 11, 2),
    'min_samples_split' : range(2,7,2)
}
gridsearch = GridSearchCV(estimator = clf, param_grid = grid, cv = 5,\
                          scoring = f1_weighted)

In [17]:
gridsearch.fit(X_train, y_train)

In [18]:
best_tree = gridsearch.best_estimator_
best_tree.fit(X_train, y_train)
pred = best_tree.predict(X_holdout)
print(f1_score(pred, y_holdout, average = 'weighted'),'\n', classification_report(pred, y_holdout))

0.7247897714396712 
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.00      0.00      0.00         1
           2       1.00      1.00      1.00         1
           3       1.00      0.78      0.88         9
           4       0.25      0.50      0.33         2
           5       0.69      0.69      0.69        13
           6       0.71      0.71      0.71         7
           7       0.60      0.47      0.53        19
           8       0.80      0.87      0.84        38

    accuracy                           0.73        92
   macro avg       0.62      0.67      0.63        92
weighted avg       0.73      0.73      0.72        92



# GridSearch

In [61]:
estimators = {RandomForestClassifier() : [RF_grid, 'RF'], 
#              CatBoostClassifier() : CatBoost_grid, 
              LogisticRegression() : [logreg_grid, 'LogReg'],
            DecisionTreeClassifier() : [DecisionTree_grid, 'Tree']}


In [62]:
names = ['RF', 'LogReg', 'Tree']

## KFold

In [63]:
cv = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

In [64]:
for estimator, grid in list(estimators.items()):
    gridsearch = GridSearchCV(estimator=estimator, param_grid=grid[:-1], scoring=f1_weighted, cv=cv)
    gridsearch.fit(X_train, y_train)
    print(f'{grid[-1]} Fitted\nBest Score:{gridsearch.best_score_}')

    
    pd.DataFrame(gridsearch.cv_results_).to_json(f'cv_results/{grid[-1]}_results.json')
    print(f'File cv_results/{grid[-1]}_results.json written!')
    pd.DataFrame(gridsearch.best_params_, index = [1]).to_json(f'cv_results/{grid[-1]}_best_params.json')
    print(f'File cv_results/{grid[-1]}_best_params.json written!')
    print('-'*30)

RF Fitted
Best Score:0.73617809903969
File cv_results/RF_results.json written!
File cv_results/RF_best_params.json written!
------------------------------


12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "d:\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



LogReg Fitted
Best Score:0.7299734541661486
File cv_results/LogReg_results.json written!
File cv_results/LogReg_best_params.json written!
------------------------------
Tree Fitted
Best Score:0.6781865077721523
File cv_results/Tree_results.json written!
File cv_results/Tree_best_params.json written!
------------------------------


### Get DataFrame

In [65]:
pd.read_json('cv_results/Tree_results.json').loc[:, ['params', 'mean_test_score', 'std_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score
0,{'max_depth': 2},0.46043,0.047852
1,{'max_depth': 4},0.587707,0.039475
2,{'max_depth': 6},0.650827,0.030535
3,{'max_depth': 8},0.669194,0.019183
4,{'max_depth': 10},0.678187,0.031377
5,{'max_depth': 12},0.676919,0.03742


In [66]:
pd.read_json('cv_results/RF_results.json').loc[:, ['params', 'mean_test_score', 'std_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.664298,0.025472
1,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.64843,0.035233
2,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.688965,0.03935
3,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.688406,0.039078
4,"{'criterion': 'gini', 'max_depth': 6, 'max_fea...",0.721483,0.034358
5,"{'criterion': 'gini', 'max_depth': 6, 'max_fea...",0.709036,0.034892
6,"{'criterion': 'gini', 'max_depth': 7, 'max_fea...",0.725794,0.028028
7,"{'criterion': 'gini', 'max_depth': 7, 'max_fea...",0.718543,0.019441
8,"{'criterion': 'gini', 'max_depth': 8, 'max_fea...",0.719978,0.027871
9,"{'criterion': 'gini', 'max_depth': 8, 'max_fea...",0.717807,0.041629


### Best Models Perfomance on Holdout


In [153]:
best_params =  dict(zip(names,map(lambda x: pd.read_json(f'cv_results/{x}_best_params.json').to_dict(orient='records')[0], names)))

In [156]:
list(estimators.keys())

[RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]

In [160]:
estimators

{RandomForestClassifier(): [{'n_estimators': [200, 300],
   'max_features': ['sqrt'],
   'max_depth': [4, 5, 6, 7, 8],
   'criterion': ['gini', 'entropy']},
  'RF'],
 LogisticRegression(): [{'penalty': ['l2', 'l1'], 'C': [0.5, 0.8, 1]},
  'LogReg'],
 DecisionTreeClassifier(): [{'max_depth': [2, 4, 6, 8, 10, 12]}, 'Tree']}

In [157]:
models = dict(zip( list(estimators.keys())


In [158]:
map(models, lambda x:)

[RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]

In [154]:
best_RF = RandomForestClassifier(**best_params['RF'])
best_RF.fit(X_train, y_train)
pred = best_RF.predict(X_holdout)
f1_score(pred, y_holdout, average='weighted')

0.7452138207061481

In [159]:
best_params

{'RF': {'criterion': 'entropy',
  'max_depth': 7,
  'max_features': 'sqrt',
  'n_estimators': 200},
 'LogReg': {'C': 1, 'penalty': 'l2'},
 'Tree': {'max_depth': 10}}