In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.preprocessing import label_binarize
import numpy as np
from catboost import CatBoostClassifier

import warnings
# Suppress every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides diagnostics emitted by pandas / scikit-learn in later cells;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [73]:
# Fetch the NLTK resources needed for tokenization and stop-word filtering,
# then load the small English spaCy pipeline that provides lemmas.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
# Load the dataset; later cells read its 'question', 'target', 'file', 'page' and 'score' columns.
df = pd.read_csv('Processed_data.csv')

In [75]:
# English stop words, materialized once when this cell runs so the set is not
# rebuilt for every row passed through preprocess_text.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one question so it can be fed to a TF-IDF vectorizer.

    Steps: lowercase, tokenize with NLTK, keep only alphanumeric tokens that
    are not English stop words, then lemmatize the remaining tokens with spaCy.

    Parameters
    ----------
    text : str
        Raw question text. Missing values (None / NaN) are treated as empty
        text; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        no token survives filtering.
    """
    # Missing cells arrive from pandas as None or float NaN (NaN != NaN);
    # return empty text instead of failing on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    tokens = word_tokenize(str(text).lower())
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOP_WORDS
    ]
    if not kept_tokens:
        # Nothing left to lemmatize; skip the spaCy call.
        return ''

    doc = nlp(' '.join(kept_tokens))
    return ' '.join(token.lemma_ for token in doc)

In [76]:
# Remove the rows labelled 'unit 2' or '?' first, so the slow spaCy-based
# preprocessing is not spent on rows that are discarded anyway. The explicit
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the loaded data.
df = df[~df['target'].isin(['unit 2', '?'])].copy()
df['qst_processed'] = df['question'].apply(preprocess_text)

# Snapshot of the cleaned frame with all columns, taken before later cells drop any.
df_init = df.copy()

In [77]:
# Encode the string class labels as integer ids for the classifiers.
# Labels are read from the df_init snapshot so this cell can be re-run safely:
# it no longer depends on columns that it removes itself.
encoder = LabelEncoder()
y = encoder.fit_transform(df_init['target'])
target = df_init['target']

# Build the text-only frame as a new object instead of dropping in place;
# an in-place drop raises KeyError when the cell is executed a second time.
df = df_init.drop(columns=['question', 'target', 'file', 'page', 'score'])

In [78]:
# Display the raw (string) class labels.
target

0      marketing mix and strategy
1       entrepreneurs and leaders
2                      the market
3       entrepreneurs and leaders
4                 managing people
                  ...            
262                    the market
263        meeting customer needs
266               managing people
267                    the market
269               managing people
Name: target, Length: 241, dtype: object

In [79]:
# Display the frame after the column drop.
df

Unnamed: 0,qst_processed
0,define term brand
1,explain one risk jack may take set alibaba group
2,analyse two factor may increase demand alibaba...
3,discuss profit maximisation main business obje...
4,assess advantage paternalistic style leadershi...
...,...
262,assess two factor could influence price elasti...
263,assess whether likely target young market segm...
266,explain flexible workforce might benefit ssp
267,construct supply demand diagram illustrate lik...


In [80]:
# Text-only feature set: the preprocessed question strings.
X = df['qst_processed'].copy()

In [81]:
# Display the text features.
X

0                                      define term brand
1       explain one risk jack may take set alibaba group
2      analyse two factor may increase demand alibaba...
3      discuss profit maximisation main business obje...
4      assess advantage paternalistic style leadershi...
                             ...                        
262    assess two factor could influence price elasti...
263    assess whether likely target young market segm...
266         explain flexible workforce might benefit ssp
267    construct supply demand diagram illustrate lik...
269    assess likely benefit company ssp use techniqu...
Name: qst_processed, Length: 241, dtype: object

In [82]:
# List the distinct integer class ids produced by the label encoder.
np.unique(y)

array([0, 1, 2, 3, 4])

In [83]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y (apply the same change to the later split so results stay comparable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
from collections import defaultdict

# Accumulator for hold-out metrics: each key maps to a list that receives
# one entry per evaluated model (appended by the evaluation cells below).
metrics_test = defaultdict(list)

# CATBOOST

In [85]:
# NOTE: this rebinds the name `time` (imported as a module in the first cell)
# to the function time.time; the later timing code calls time() directly.
from time import time

# Text-only model: TF-IDF features fed into a multiclass CatBoost classifier.
catboost_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', CatBoostClassifier(loss_function='MultiClass',
                                    random_state=42,
                                    silent=True)),
])

# Candidate metrics for a multi-metric search. Currently unused: the search
# below is scored on 'f1_macro' only. To enable both metrics, pass
# scoring=scoring together with refit='f1_macro' to GridSearchCV.
scoring = {
    'f1_macro': 'f1_macro',
    'roc_auc_ovr': 'roc_auc_ovr'
}

param_grid = {
    # Only 0.8 is searched. This key used to appear twice in the literal
    # ([0.8, 0.9, 1.0] and then [0.8]); Python keeps the last value for a
    # repeated key, so the wider list was silently ignored. The effective
    # value is kept so the grid stays at 648 candidates — widen it
    # deliberately if the larger search is wanted.
    'tfidf__max_df': [0.8],
    'tfidf__min_df': [1, 2],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'catboost__depth': [3, 5, 7],
    'catboost__learning_rate': [0.01, 0.1, 0.3],
    'catboost__iterations': [100, 200],
    # None leaves the regularization strength at the library default.
    'catboost__l2_leaf_reg': [0.1, 1, 3, 5, 0.01, None],
}

# Exhaustive 5-fold search, selecting (and refitting) by macro-averaged F1.
grid_search = GridSearchCV(
    catboost_pipe,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start_time = time()
grid_search.fit(X_train, y_train)

print(f'fitting time {time() - start_time}')

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
fitting time 310.402291059494


In [86]:
# Show the pipeline refitted with the best parameter combination.
grid_search.best_estimator_

0,1,2
,steps,"[('tfidf', ...), ('catboost', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [87]:
# Show every parameter of the winning pipeline (vectorizer and classifier).
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer(max_df=0.8)),
  ('catboost', <catboost.core.CatBoostClassifier at 0x13c1c4050>)],
 'transform_input': None,
 'verbose': False,
 'tfidf': TfidfVectorizer(max_df=0.8),
 'catboost': <catboost.core.CatBoostClassifier at 0x13c1c4050>,
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'catboost__loss_function': 'MultiClass',
 'catboost__silent': True,
 'catboost__random_state': 42,
 'catboost__

In [88]:
# Keep a dedicated reference to the tuned CatBoost pipeline: `grid_search`
# is reassigned by the later search cells.
best_catboost = grid_search.best_estimator_

In [89]:
# Evaluate the tuned CatBoost pipeline on the data it was fitted on.
# These scores serve as a reference point for the hold-out scores below.
y_pred_train = best_catboost.predict(X_train)
y_pred_proba_train = best_catboost.predict_proba(X_train)

train_f1_macro = f1_score(y_train, y_pred_train, average='macro')
print("Train Set F1 Macro Score:")
print(train_f1_macro)

# Use the one-vs-one scheme here as well: every test-set ROC-AUC in this
# notebook is computed with multi_class='ovo', so train and test values are
# only comparable when both use the same averaging scheme.
train_roc_auc = roc_auc_score(y_train, y_pred_proba_train, multi_class='ovo')
print("\nTrain Set ROC-AUC:")
print(train_roc_auc)
print()
print(classification_report(y_train, y_pred_train))

Train Set F1 Macro Score:
1.0

Train Set ROC-AUC:
1.0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        47
           3       1.00      1.00      1.00        74
           4       1.00      1.00      1.00        28

    accuracy                           1.00       192
   macro avg       1.00      1.00      1.00       192
weighted avg       1.00      1.00      1.00       192



In [90]:
# Hold-out evaluation of the tuned text-only CatBoost pipeline.
y_pred = best_catboost.predict(X_test)
y_pred_proba = best_catboost.predict_proba(X_test)

# F1 scores, macro- and support-weighted averages.
test_f1_macro = f1_score(y_test, y_pred, average='macro')
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("Test F1 weighted score:", test_f1_weighted, sep="\n")
print("\nTest Set F1 Macro Score:", test_f1_macro, sep="\n")

# ROC-AUC with the one-vs-one multiclass scheme.
test_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
print("\nTest Set ROC-AUC:", test_roc_auc, sep="\n")
print()
print(classification_report(y_test, y_pred))

# Record this model's row in the shared results accumulator.
for column, value in [
    ('model', 'catboost_text'),
    ('f1_macro', test_f1_macro),
    ('f1_weighted', test_f1_weighted),
    ('roc_auc', test_roc_auc),
]:
    metrics_test[column].append(value)

Test F1 weighted score:
0.7637621040563612

Test Set F1 Macro Score:
0.7646330413772275

Test Set ROC-AUC:
0.9629375

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.75      0.38      0.50         8
           2       0.75      0.90      0.82        10
           3       0.74      0.85      0.79        20
           4       1.00      0.75      0.86         8

    accuracy                           0.78        49
   macro avg       0.80      0.78      0.76        49
weighted avg       0.79      0.78      0.76        49



# Random Forest

In [92]:
from sklearn.ensemble import RandomForestClassifier

# Text-only baseline: TF-IDF features fed into a Random Forest.
rf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Search space: vectorizer document-frequency cut-offs and n-gram span,
# plus forest size and tree-shape limits.
param_grid = dict(
    tfidf__max_df=[0.8, 0.9, 1.0],
    tfidf__min_df=[1, 2],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    rf__n_estimators=[100, 200, 500],
    rf__max_depth=[None, 10, 20, 30],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold search, selecting by macro-averaged F1.
grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

start_time = time()
grid_search.fit(X_train, y_train)
elapsed = time() - start_time
print(f'Fitting time: {elapsed:.2f} seconds')

print("Best params:", grid_search.best_params_)
print("Best f1_macro score:", grid_search.best_score_)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Fitting time: 69.59 seconds
Best params: {'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 100, 'tfidf__max_df': 0.8, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}
Best f1_macro score: 0.7662777632944416


In [93]:
# Hold-out evaluation of the tuned text-only Random Forest pipeline.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

# F1 metrics
test_f1_macro = f1_score(y_test, y_pred, average='macro')
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("Test F1 weighted score:", test_f1_weighted, sep="\n")
print("\nTest Set F1 Macro Score:", test_f1_macro, sep="\n")

# ROC-AUC (one-vs-one multiclass scheme)
test_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
print("\nTest Set ROC-AUC:", test_roc_auc, sep="\n")
print()

print(classification_report(y_test, y_pred))

# Record this model's row in the shared results accumulator.
for column, value in [
    ('model', 'random_forest_text'),
    ('f1_macro', test_f1_macro),
    ('f1_weighted', test_f1_weighted),
    ('roc_auc', test_roc_auc),
]:
    metrics_test[column].append(value)

Test F1 weighted score:
0.763788592360021

Test Set F1 Macro Score:
0.7570695970695971

Test Set ROC-AUC:
0.9472083333333334

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.60      0.38      0.46         8
           2       0.82      0.90      0.86        10
           3       0.77      0.85      0.81        20
           4       0.86      0.75      0.80         8

    accuracy                           0.78        49
   macro avg       0.76      0.78      0.76        49
weighted avg       0.77      0.78      0.76        49



In [94]:
# Show the metrics collected so far (one list entry per evaluated model).
metrics_test

defaultdict(list,
            {'model': ['catboost_text', 'random_forest_text'],
             'f1_macro': [0.7646330413772275, 0.7570695970695971],
             'f1_weighted': [0.7637621040563612, 0.763788592360021],
             'roc_auc': [np.float64(0.9629375),
              np.float64(0.9472083333333334)]})

# CatBoost и RandomForest с доп. признаками из прошлых чекпоинтов

In [95]:
# Rebuild the dataset for the second experiment; unlike the text-only frame,
# the 'page' and 'score' columns are kept as additional features.
df = pd.read_csv('Processed_data.csv')

# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the loaded data
# (chained assignment, whose warning is otherwise hidden by the global filter).
df = df[~df['target'].isin(['unit 2', '?'])].copy()
df['qst_processed'] = df['question'].apply(preprocess_text)

encoder1 = LabelEncoder()
y = encoder1.fit_transform(df['target'])

# Non-inplace drop: produces a new frame rather than mutating the existing one.
df = df.drop(columns=['question', 'target', 'file'])

# Character length of the preprocessed question, used as an extra feature.
df['qst_len'] = df['qst_processed'].apply(len)

In [96]:
# Feature frame for the second experiment: text plus the remaining non-text columns.
X = df.copy()
# Preview the first rows.
X.head()

Unnamed: 0,page,score,qst_processed,qst_len
0,2,2,define term brand,17
1,2,4,explain one risk jack may take set alibaba group,48
2,3,6,analyse two factor may increase demand alibaba...,52
3,4,8,discuss profit maximisation main business obje...,65
4,5,10,assess advantage paternalistic style leadershi...,70


In [97]:
# Same test_size and random_state as the text-only split.
# NOTE(review): with identically filtered rows this should select the same hold-out rows,
# keeping the two experiments comparable — confirm if the filtering ever changes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
# CatBoost on text plus extra features: the 'qst_processed' column is
# TF-IDF vectorized, every other column is passed through unchanged.
# NOTE(review): this vectorizer uses stop_words='english', whereas the
# text-only pipelines use the default (no stop-word list) — the two
# experiments therefore differ in more than the added features; confirm
# this is intended.
cb_pipe1 = Pipeline([
    ('col_transformer', ColumnTransformer(
        transformers=[
            ('tfidf_vect', TfidfVectorizer(stop_words='english'), 'qst_processed')
        ],
        remainder='passthrough'
    )),
    ('catboost', CatBoostClassifier(loss_function='MultiClass',
                                    random_state=42,
                                    silent=True))
])


# Candidate metrics for a multi-metric search. Currently unused: the search
# below is scored on 'f1_macro' only. To enable both metrics, pass
# scoring=scoring together with refit='f1_macro' to GridSearchCV.
scoring = {
    'f1_macro': 'f1_macro',
    'roc_auc_ovr': 'roc_auc_ovr'
}

param_grid = {
    # Only 0.8 is searched. This key used to appear twice in the literal
    # ([0.8, 0.9, 1.0] and then [0.8]); Python keeps the last value for a
    # repeated key, so the wider list was silently ignored. The effective
    # value is kept so the grid stays at 540 candidates — widen it
    # deliberately if the larger search is wanted.
    'col_transformer__tfidf_vect__max_df': [0.8],
    'col_transformer__tfidf_vect__min_df': [1, 2],
    'col_transformer__tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'catboost__depth': [3, 5, 7],
    'catboost__learning_rate': [0.01, 0.1, 0.3],
    'catboost__iterations': [100, 200],
    'catboost__l2_leaf_reg': [0.01, 0.1, 1, 3, 5]
}

# Exhaustive 5-fold search, selecting (and refitting) by macro-averaged F1.
grid_search = GridSearchCV(
    cb_pipe1,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start_time = time()
grid_search.fit(X_train, y_train)

print(f'fitting time {time() - start_time}')

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
fitting time 239.54665303230286


In [99]:
# Hold-out evaluation of the tuned CatBoost pipeline with extra features.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

# F1 metrics
test_f1_macro = f1_score(y_test, y_pred, average='macro')
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("Test F1 weighted score:", test_f1_weighted, sep="\n")
print("\nTest Set F1 Macro Score:", test_f1_macro, sep="\n")

# ROC-AUC (one-vs-one multiclass scheme)
test_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
print("\nTest Set ROC-AUC:", test_roc_auc, sep="\n")
print()

print(classification_report(y_test, y_pred))

# Record this model's row in the shared results accumulator.
for column, value in [
    ('model', 'catboost_add_feats'),
    ('f1_macro', test_f1_macro),
    ('f1_weighted', test_f1_weighted),
    ('roc_auc', test_roc_auc),
]:
    metrics_test[column].append(value)

Test F1 weighted score:
0.7062721331824322

Test Set F1 Macro Score:
0.7433810375670841

Test Set ROC-AUC:
0.9556979166666666

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.75      0.38      0.50         8
           2       0.64      0.90      0.75        10
           3       0.65      0.75      0.70        20
           4       1.00      0.62      0.77         8

    accuracy                           0.71        49
   macro avg       0.81      0.73      0.74        49
weighted avg       0.74      0.71      0.71        49



In [100]:
# Random Forest on text plus extra features: 'qst_processed' is TF-IDF
# vectorized, every other column is passed through unchanged.
text_and_passthrough = ColumnTransformer(
    transformers=[
        ('tfidf_vect', TfidfVectorizer(stop_words='english'), 'qst_processed'),
    ],
    remainder='passthrough',
)
rf_pipe1 = Pipeline(steps=[
    ('col_transformer', text_and_passthrough),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Metric names kept for a possible multi-metric search; the search below
# is scored on 'f1_macro' only.
scoring = dict(f1_macro='f1_macro', roc_auc_ovr='roc_auc_ovr')

# Search space: vectorizer document-frequency cut-offs and n-gram span,
# plus forest size and tree-shape limits.
param_grid = dict(
    col_transformer__tfidf_vect__max_df=[0.8, 0.9, 1.0],
    col_transformer__tfidf_vect__min_df=[1, 2],
    col_transformer__tfidf_vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    rf__n_estimators=[100, 200, 500],
    rf__max_depth=[None, 10, 20, 30],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold search, selecting by macro-averaged F1.
grid_search = GridSearchCV(
    estimator=rf_pipe1,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

start_time = time()
grid_search.fit(X_train, y_train)
elapsed = time() - start_time

print(f'fitting time {elapsed}')

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
fitting time 71.99305415153503


In [101]:
# Hold-out evaluation of the tuned Random Forest pipeline with extra features.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

# F1 metrics
test_f1_macro = f1_score(y_test, y_pred, average='macro')
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("Test F1 weighted score:", test_f1_weighted, sep="\n")
print("\nTest Set F1 Macro Score:", test_f1_macro, sep="\n")

# ROC-AUC (one-vs-one multiclass scheme)
test_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
print("\nTest Set ROC-AUC:", test_roc_auc, sep="\n")
print()

print(classification_report(y_test, y_pred))

# Record this model's row in the shared results accumulator.
for column, value in [
    ('model', 'randomforest_add_feats'),
    ('f1_macro', test_f1_macro),
    ('f1_weighted', test_f1_weighted),
    ('roc_auc', test_roc_auc),
]:
    metrics_test[column].append(value)

Test F1 weighted score:
0.7170974544532663

Test Set F1 Macro Score:
0.7461321287408244

Test Set ROC-AUC:
0.9326666666666668

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.67      0.25      0.36         8
           2       0.69      0.90      0.78        10
           3       0.67      0.80      0.73        20
           4       1.00      0.75      0.86         8

    accuracy                           0.73        49
   macro avg       0.81      0.74      0.75        49
weighted avg       0.75      0.73      0.72        49



In [109]:
# Turn the accumulated per-model metric lists into a table (one row per model).
df_metrics = pd.DataFrame(metrics_test)

# The `_text` suffix in a model name means the model was trained on the texts only, without the additional features.
print(df_metrics.to_markdown(index=False))

| model                  |   f1_macro |   f1_weighted |   roc_auc |
|:-----------------------|-----------:|--------------:|----------:|
| catboost_text          |   0.764633 |      0.763762 |  0.962938 |
| random_forest_text     |   0.75707  |      0.763789 |  0.947208 |
| catboost_add_feats     |   0.743381 |      0.706272 |  0.955698 |
| randomforest_add_feats |   0.746132 |      0.717097 |  0.932667 |
