In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.preprocessing import label_binarize
import numpy as np
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

In [22]:
nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
df = pd.read_csv('Processed_data.csv')

In [24]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    doc = nlp(' '.join(filtered_tokens))
    lemmatized_tokens = ' '.join([token.lemma_ for token in doc])
    return lemmatized_tokens

In [25]:
df['qst_processed'] = df['question'].apply(preprocess_text)
df = df[~df['target'].isin(['unit 2', '?'])]
df_init = df.copy()

In [26]:
encoder = LabelEncoder()
y = encoder.fit_transform(df['target'])
target = df['target']

df.drop(columns=['question', 'target', 'file', 'page', 'score'], inplace=True)

In [28]:
target

0      marketing mix and strategy
1       entrepreneurs and leaders
2                      the market
3       entrepreneurs and leaders
4                 managing people
                  ...            
262                    the market
263        meeting customer needs
266               managing people
267                    the market
269               managing people
Name: target, Length: 241, dtype: object

In [27]:
df

Unnamed: 0,qst_processed
0,define term brand
1,explain one risk jack may take set alibaba group
2,analyse two factor may increase demand alibaba...
3,discuss profit maximisation main business obje...
4,assess advantage paternalistic style leadershi...
...,...
262,assess two factor could influence price elasti...
263,assess whether likely target young market segm...
266,explain flexible workforce might benefit ssp
267,construct supply demand diagram illustrate lik...


In [37]:
X = df['qst_processed'].copy()

In [38]:
X

0                                      define term brand
1       explain one risk jack may take set alibaba group
2      analyse two factor may increase demand alibaba...
3      discuss profit maximisation main business obje...
4      assess advantage paternalistic style leadershi...
                             ...                        
262    assess two factor could influence price elasti...
263    assess whether likely target young market segm...
266         explain flexible workforce might benefit ssp
267    construct supply demand diagram illustrate lik...
269    assess likely benefit company ssp use techniqu...
Name: qst_processed, Length: 241, dtype: object

In [39]:
np.unique(y)

array([0, 1, 2, 3, 4])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
from time import time

catboost_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', CatBoostClassifier(loss_function='MultiClass',
                                    random_state=42, 
                                    silent=True)),
])

scoring = {
    'f1_macro': 'f1_macro',
    'roc_auc_ovr': 'roc_auc_ovr'
}

param_grid = {
    # 'tfidf__max_df': [0.8, 0.9, 1.0],
    # 'tfidf__min_df': [1, 2],
    # 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3),],
    'tfidf__max_df': [0.8],
    'catboost__depth': [3, 5, 7, 10, 11],
    'catboost__learning_rate': [0.01, 0.1, 0.5],
    'catboost__iterations': [100, 200],
    'catboost__l2_leaf_reg': [0.1, 1, 3, 5, None],
}

grid_search = GridSearchCV(
    catboost_pipe,
    param_grid,
    cv=5,
    scoring='f1_macro',
    # refit='f1_macro',
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start_time = time()
grid_search.fit(X_train, y_train)

print(f'fitting time {time() - start_time}')

Fitting 5 folds for each of 150 candidates, totalling 750 fits


fitting time 330.17216086387634


In [83]:
grid_search.best_estimator_

In [84]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer(max_df=0.8)),
  ('catboost', <catboost.core.CatBoostClassifier at 0x323d9eff0>)],
 'transform_input': None,
 'verbose': False,
 'tfidf': TfidfVectorizer(max_df=0.8),
 'catboost': <catboost.core.CatBoostClassifier at 0x323d9eff0>,
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 0.8,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'catboost__loss_function': 'MultiClass',
 'catboost__silent': True,
 'catboost__random_state': 42,
 'catboost__

In [85]:
best_catboost = grid_search.best_estimator_

In [86]:
y_pred_train = best_catboost.predict(X_train)
y_pred_proba_train = best_catboost.predict_proba(X_train)

train_f1_macro = f1_score(y_train, y_pred_train, average='macro')
print("Train Set F1 Macro Score:")
print(train_f1_macro)

train_roc_auc = roc_auc_score(y_train, y_pred_proba_train, multi_class='ovr')
print("\nTrain Set ROC-AUC:")
print(train_roc_auc)
print()
print(classification_report(y_train, y_pred_train))

Train Set F1 Macro Score:
1.0

Train Set ROC-AUC:
1.0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        47
           3       1.00      1.00      1.00        74
           4       1.00      1.00      1.00        28

    accuracy                           1.00       192
   macro avg       1.00      1.00      1.00       192
weighted avg       1.00      1.00      1.00       192



In [87]:
y_pred = best_catboost.predict(X_test)
y_pred_proba = best_catboost.predict_proba(X_test)

test_f1_macro = f1_score(y_test, y_pred, average='macro')
print("Test Set F1 Macro Score:")
print(test_f1_macro)

test_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
print("\nTest Set ROC-AUC:")
print(test_roc_auc)
print()
print(classification_report(y_test, y_pred))

Test Set F1 Macro Score:
0.6767371758676106

Test Set ROC-AUC:
0.9551562499999999

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.50      0.12      0.20         8
           2       0.75      0.90      0.82        10
           3       0.65      0.85      0.74        20
           4       1.00      0.62      0.77         8

    accuracy                           0.71        49
   macro avg       0.73      0.70      0.68        49
weighted avg       0.71      0.71      0.68        49

