# Logistic Regression for topic classification
---

In [1]:
import numpy as np
import pandas as pd
import sys
import os
import json

In [2]:
# Resolve the repository root: two levels above the notebook's working directory.
# The components are passed separately so the path is built correctly on every OS;
# the former literal '..\..' was Windows-only and contained the invalid escape '\.'.
module_path = os.path.abspath(os.path.join("..", ".."))  # Path to root folder

# Make the helper scripts importable. The membership test checks the exact entry
# that gets appended, so re-running this cell never adds a duplicate.
scripts_path = os.path.join(module_path, "scripts")  # define scripts path
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from ipynb_func import *

Data loader:

In [3]:
# Disabled alternatives: merge several per-chunk parquet files instead of the
# single cleared-texts file used below.
#NUM = 10 # Number of data parquets to use
#assert NUM >= 1 and NUM <= 10, "NUM value must be in range [1, 10]"

# Paths of the processed raw data chunks
#paths = [module_path + f"/data/pikabu/tag_processed/raw_data/{i}_tag_processed.parquet" for i in range(NUM)] 

# Paths of the processed filtered data chunks
#paths = [module_path + f"/data/pikabu/tag_processed/filtered_data/{i}_tag_processed.parquet" for i in range(NUM)] 

# Active input: the cleared texts stored next to the train/val/test split indexes.
cleared_texts_path = module_path + "/data/pikabu/splited_data/cleared_texts.parquet"
paths = [cleared_texts_path]

# merge_dataset is not defined in this notebook (presumably provided by ipynb_func);
# it receives the list of parquet paths and its result is used as a DataFrame below.
data = merge_dataset(paths)

In [4]:
# Widen text columns so the token lists stay readable in the preview.
pd.options.display.max_colwidth = 180
data.head(3)

Unnamed: 0,id,text_markdown,tags
15,6991359,"[добрый, сутки, господин, дама, подсказывать, название, игра, телефон, оформление, убийство, зомби, очки, ездить, машинка, крутить, развивать, скорость, заранее, благодарить]","[игры, поиск]"
37,7004423,"[ехать, девчонка, школа, оставаться, свободный, макс, заявка, прямой, конечный, адрес, железнодорожный, институт, включать, вбивать, адрес, выдавать, столовая, ладно, садиться,...",[юмор]
52,6991603,"[стадо, стадо, гигантский, случаться, стадо, управлять, волк, предел, волк, жопа, враг, дружно, осматривать, выдавливать, стадо, выдавливать, съедать, волк, близкий, холм, обхо...",[мат]


---
# 1. Data preparation and split

In [5]:
# Load the precomputed split: a JSON object whose 'train' / 'val' / 'test' entries
# hold the post ids of each subset. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open(module_path + "/data/pikabu/splited_data/indexes.json", encoding="utf-8") as f:
    id_splits = json.load(f)  # parse directly from the file handle

# Select the rows of each subset by id membership.
data_train = data[data['id'].isin(id_splits['train'])]
data_val = data[data['id'].isin(id_splits['val'])]
data_test = data[data['id'].isin(id_splits['test'])]

In [6]:
# Report absolute split sizes and their share of the full dataset (in percent).
split_summary = [
    f"Number of train data: {len(data_train)}",
    f"Number of val data: {len(data_val)}",
    f"Number of test data: {len(data_test)}",
    f"Distribution: {len(data_train)/len(data)*100:.0f} / {len(data_val)/len(data)*100:.0f} / {len(data_test)/len(data)*100:.0f}",
]
print("\n".join(split_summary))

Number of train data: 25209
Number of val data: 2821
Number of test data: 3119
Distribution: 81 / 9 / 10


---
# 2. Model training

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from joblib import dump, load

Target preparation:

In [8]:
# Multi-hot tag encoder: every row's tags are joined into one comma-separated
# string, which the tokenizer splits back on ','; binary=True caps counts at 1.
Vec = CountVectorizer(tokenizer=lambda x: x.split(','), binary=True)


def _with_joined_tags(frame):
    """Return a copy of `frame` whose `tags` column holds ','-joined strings."""
    joined = frame.copy()
    joined.tags = [','.join(tag_list) for tag_list in joined.tags]
    return joined


df = _with_joined_tags(data)
df_train = _with_joined_tags(data_train)
df_val = _with_joined_tags(data_val)
df_test = _with_joined_tags(data_test)

# The tag vocabulary is fitted on the full dataset, then each split is encoded.
# `fit` returns the vectorizer itself, so `y_data` is the fitted `Vec`, not a matrix.
y_data = Vec.fit(df['tags'])
y_train, y_val, y_test = (
    Vec.transform(part['tags']) for part in (df_train, df_val, df_test)
)



In [9]:
# Column order of the target matrix: index i in the reports corresponds to tag i here.
print('Tags to predict:', Vec.get_feature_names_out(), sep='\n')

Tags to predict:
['авто' 'авторский рассказ' 'алкоголь' 'анекдот' 'армия' 'вопрос' 'врачи'
 'девушки' 'деньги' 'дети' 'детство' 'другое' 'жизнь' 'игры' 'интересное'
 'истории' 'история' 'ищу книгу' 'ищу фильм' 'карантин' 'книги'
 'коронавирус' 'кот' 'лига добра' 'лига юристов' 'любовь' 'люди' 'мат'
 'медицина' 'москва' 'музыка' 'мысли' 'негатив' 'новости' 'новый год'
 'общество' 'отношения' 'поиск' 'политика' 'помогите найти' 'помощь'
 'психология' 'работа' 'рассказ' 'реальная история из жизни'
 'родители и дети' 'россия' 'самоизоляция' 'санкт-петербург' 'семья'
 'случай из жизни' 'совет' 'сон' 'соседи' 'стихи' 'украина' 'фантастика'
 'фильмы' 'школа' 'юмор']


In [10]:
# tag_distr is used as a mapping from tag name to a number (it is indexed by name
# and its values are summed); presumably these are occurrence counts produced by
# the ipynb_func helpers — confirm.
tag_distr = getworddict(getwordlist(data.tags))

# Both values are loop-invariant, so they are computed once instead of per tag.
tag_names = Vec.get_feature_names_out()
total_tag_count = sum(tag_distr.values())

# Share of each tag, keyed by the tag's column index in `Vec`. Iterating over the
# encoder's own feature names keeps the keys in step with the target columns.
tag_distr_formated = {
    idx: round(tag_distr[name] / total_tag_count, 4)
    for idx, name in enumerate(tag_names)
}

In [11]:
# Show the per-tag shares keyed by target column index.
print('Tags weights:', tag_distr_formated, sep='\n')

Tags weights:
{0: 0.0073, 1: 0.0121, 2: 0.0075, 3: 0.0079, 4: 0.0085, 5: 0.0196, 6: 0.0077, 7: 0.0138, 8: 0.0089, 9: 0.0279, 10: 0.011, 11: 0.0221, 12: 0.0186, 13: 0.0122, 14: 0.0122, 15: 0.0086, 16: 0.0338, 17: 0.0067, 18: 0.008, 19: 0.014, 20: 0.0107, 21: 0.0454, 22: 0.0121, 23: 0.0142, 24: 0.0078, 25: 0.0149, 26: 0.0084, 27: 0.0549, 28: 0.0125, 29: 0.0098, 30: 0.0081, 31: 0.0093, 32: 0.0145, 33: 0.0152, 34: 0.0133, 35: 0.0237, 36: 0.0206, 37: 0.0071, 38: 0.0308, 39: 0.0118, 40: 0.054, 41: 0.0121, 42: 0.0297, 43: 0.0303, 44: 0.0315, 45: 0.0082, 46: 0.0277, 47: 0.0063, 48: 0.0075, 49: 0.0145, 50: 0.0122, 51: 0.0078, 52: 0.0067, 53: 0.0086, 54: 0.0274, 55: 0.0275, 56: 0.0111, 57: 0.0104, 58: 0.0144, 59: 0.0384}


In [12]:
# Sanity check: every target matrix should have one column per tag.
print('\n'.join([
    'Y shapes:',
    f'  • Y train: {y_train.shape}',
    f'  • Y validation: {y_val.shape}',
    f'  • Y test: {y_test.shape}',
]))

Y shapes:
  • Y train: (25209, 60)
  • Y validation: (2821, 60)
  • Y test: (3119, 60)


---

## 2.1. Training with bag-of-words embeddings:

In [13]:
# Directory for cached classifiers. The trailing slash matters: model file names
# are appended to this value by plain string concatenation.
save_models_path = module_path + '/models/logreg/'

In [14]:
# Every post of the full dataset as one space-separated document string.
X_data = list(map(' '.join, data.text_markdown))

In [15]:
# Turn each split's token lists into space-separated document strings.
X_train, X_val, X_test = (
    [' '.join(tokens) for tokens in part.text_markdown]
    for part in (data_train, data_val, data_test)
)

# Bag-of-words counts; the texts are already tokenised, so splitting on whitespace is enough.
X_Vec = CountVectorizer(tokenizer=lambda doc: doc.split())

# The vocabulary is learned from the training texts only and reused for all splits.
X_Vec.fit(X_train)
X_train, X_val, X_test = (
    X_Vec.transform(docs) for docs in (X_train, X_val, X_test)
)

In [16]:
# All three matrices must share the vocabulary size learned on the training split.
print('\n'.join([
    "X BoW's shapes:",
    f'   • X train shape: {X_train.shape}',
    f'   • X val shape: {X_val.shape}',
    f'   • X test shape: {X_test.shape}',
]))

X BoW's shapes:
   • X train shape: (25209, 5899)
   • X val shape: (2821, 5899)
   • X test shape: (3119, 5899)


In [17]:
""" 
# Searching for best model params; too long;

from sklearn.model_selection import GridSearchCV

LogReg_cfg = {'estimator__C':[1e3, 1e5, 1e7, 1e8],
              'estimator__penalty': ['elasticnet', 'l1', 'l2'],
              'estimator__dual': [False],
              'estimator__class_weight': [None, tag_distr_formated],
              'estimator__solver': ['lbfgs', 'liblinear', 'newton-cg'],
              'estimator__random_state': [42]}

clf_ovr = OneVsRestClassifier(estimator=LogisticRegression(),
                              n_jobs=-1)

GSCV_clf = GridSearchCV(estimator=clf_ovr, param_grid=LogReg_cfg)

GSCV_clf.fit(X_train, y_train)

GSCV_clf.best_params_ 

""";

In [18]:
# Hyper-parameters applied to every per-tag binary classifier.
# NOTE(review): `tag_distr_formated` is keyed by tag column index, while each
# one-vs-rest sub-problem only has the labels 0 and 1 — presumably just the
# entries for keys 0 and 1 take effect; confirm this is the intended weighting.
LogReg_cfg = dict(
    C=5e7,
    penalty='l2',
    dual=False,
    class_weight=tag_distr_formated,
    solver='liblinear',
    random_state=42,
)

# One independent logistic regression per tag column, fitted in parallel.
clf_ovr = OneVsRestClassifier(
    estimator=LogisticRegression(**LogReg_cfg),
    n_jobs=-1,
)

In [19]:
bow_model_file = save_models_path + 'bow.joblib'

if os.path.isfile(bow_model_file):
    # A cached model is reused as is, even if LogReg_cfg or the features have
    # changed since it was saved — delete the file to force retraining.
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    clf_ovr = load(bow_model_file)
else:
    # Create the target directory before training so that a missing folder
    # cannot make `dump` fail after the slow fit has already finished.
    os.makedirs(save_models_path, exist_ok=True)
    clf_ovr.fit(X_train, y_train)
    dump(clf_ovr, bow_model_file)

In [20]:
y_pred_val = clf_ovr.predict(X_val)

# Decode every predicted indicator row back into tag names and store them next
# to the true tags in a fresh copy of the validation frame.
df_val = data_val.assign(predicted_tags=Vec.inverse_transform(y_pred_val))

In [21]:
print('Metrics for Bag-of-Words:')
# zero_division=0 scores tags that were never predicted as 0 without emitting an
# undefined-metric warning, matching the reports of the other models.
print(classification_report(y_val, y_pred_val, zero_division=0))

Metrics for Bag-of-Words:
              precision    recall  f1-score   support

           0       0.33      0.28      0.30        25
           1       0.26      0.22      0.24        45
           2       0.48      0.30      0.37        40
           3       0.23      0.26      0.24        27
           4       0.46      0.44      0.45        27
           5       0.09      0.13      0.10        68
           6       0.43      0.33      0.37        40
           7       0.23      0.27      0.25        59
           8       0.10      0.10      0.10        42
           9       0.24      0.32      0.28       113
          10       0.25      0.26      0.26        50
          11       0.04      0.05      0.05        85
          12       0.06      0.10      0.08        72
          13       0.51      0.52      0.51        58
          14       0.02      0.02      0.02        52
          15       0.03      0.04      0.03        28
          16       0.13      0.16      0.14       135
 

  _warn_prf(average, modifier, msg_start, len(result))


Calculate `recall@k`:

In [22]:
# recallk is not defined in this notebook (presumably provided by ipynb_func);
# it receives the true and the predicted tag columns of the validation frame.
bow_recallk = recallk(df_val['tags'], df_val['predicted_tags'])
print(f'Recall@k for Bag-of-Words: {bow_recallk:.4f}')

Recall@k for Bag-of-Words: 0.4194


## 2.2. Training with TF-IDF embeddings:

In [23]:
# Rebuild the raw documents: the X_* names currently hold the previous feature matrices.
X_train, X_val, X_test = (
    [' '.join(tokens) for tokens in part.text_markdown]
    for part in (data_train, data_val, data_test)
)

# TF-IDF weights over whitespace-separated tokens.
Tfidf_Vec = TfidfVectorizer(tokenizer=lambda doc: doc.split())

# Vocabulary and IDF statistics come from the training texts only.
Tfidf_Vec.fit(X_train)
X_train, X_val, X_test = (
    Tfidf_Vec.transform(docs) for docs in (X_train, X_val, X_test)
)



In [24]:
# All three matrices must share the vocabulary size learned on the training split.
print('\n'.join([
    "X TF-IDF's shapes:",
    f'   • X train shape: {X_train.shape}',
    f'   • X val shape: {X_val.shape}',
    f'   • X test shape: {X_test.shape}',
]))

X TF-IDF's shapes:
   • X train shape: (25209, 5899)
   • X val shape: (2821, 5899)
   • X test shape: (3119, 5899)


In [25]:
# Hyper-parameters applied to every per-tag binary classifier.
# NOTE(review): `class_weight` is keyed by tag column index, but each one-vs-rest
# sub-problem only has the labels 0 and 1 — confirm the intended weighting.
LogReg_cfg = dict(
    C=5e7,
    penalty='l2',
    dual=False,
    class_weight=tag_distr_formated,
    solver='liblinear',
    random_state=42,
)

# One independent logistic regression per tag column, fitted in parallel.
clf_ovr = OneVsRestClassifier(
    estimator=LogisticRegression(**LogReg_cfg),
    n_jobs=-1,
)

In [26]:
tfidf_model_file = save_models_path + 'tf_idf.joblib'

if os.path.isfile(tfidf_model_file):
    # A cached model is reused as is, even if LogReg_cfg or the features have
    # changed since it was saved — delete the file to force retraining.
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    clf_ovr = load(tfidf_model_file)
else:
    # Create the target directory before training so that a missing folder
    # cannot make `dump` fail after the slow fit has already finished.
    os.makedirs(save_models_path, exist_ok=True)
    clf_ovr.fit(X_train, y_train)
    dump(clf_ovr, tfidf_model_file)

In [27]:
y_pred_val = clf_ovr.predict(X_val)

# Decode every predicted indicator row back into tag names and store them next
# to the true tags in a fresh copy of the validation frame.
df_val = data_val.assign(predicted_tags=Vec.inverse_transform(y_pred_val))

In [28]:
# Per-tag precision / recall / F1 on the validation split; row i is tag column i.
print(
    'Metrics for TF-IDF:',
    classification_report(y_val, y_pred_val, zero_division=0),
    sep='\n',
)

Metrics for TF-IDF:
              precision    recall  f1-score   support

           0       0.43      0.24      0.31        25
           1       0.32      0.24      0.28        45
           2       0.68      0.33      0.44        40
           3       0.41      0.26      0.32        27
           4       0.55      0.44      0.49        27
           5       0.12      0.13      0.13        68
           6       0.52      0.28      0.36        40
           7       0.28      0.25      0.27        59
           8       0.09      0.05      0.06        42
           9       0.31      0.35      0.33       113
          10       0.33      0.20      0.25        50
          11       0.03      0.05      0.04        85
          12       0.10      0.14      0.12        72
          13       0.63      0.47      0.53        58
          14       0.03      0.02      0.02        52
          15       0.00      0.00      0.00        28
          16       0.13      0.17      0.15       135
       

Calculate `recall@k`:

In [29]:
# recallk is not defined in this notebook (presumably provided by ipynb_func);
# it receives the true and the predicted tag columns of the validation frame.
tf_idf_recallk = recallk(df_val['tags'], df_val['predicted_tags'])
print(f'Recall@k on TF-IDF: {tf_idf_recallk:.4f}')

Recall@k on TF-IDF: 0.4133


## 2.3. Training on TF-IDF with N-grams:

In [30]:
# Rebuild the raw documents: the X_* names currently hold the previous feature matrices.
X_train, X_val, X_test = (
    [' '.join(tokens) for tokens in part.text_markdown]
    for part in (data_train, data_val, data_test)
)

# TF-IDF over unigrams and bigrams of whitespace-separated tokens.
Tfidf_Vec = TfidfVectorizer(tokenizer=lambda doc: doc.split(), ngram_range=(1, 2))

# Vocabulary and IDF statistics come from the training texts only.
Tfidf_Vec.fit(X_train)
X_train, X_val, X_test = (
    Tfidf_Vec.transform(docs) for docs in (X_train, X_val, X_test)
)



In [31]:
# All three matrices must share the vocabulary size learned on the training split.
print('\n'.join([
    "X TF-IDF with n-grams's shapes:",
    f'   • X train shape: {X_train.shape}',
    f'   • X val shape: {X_val.shape}',
    f'   • X test shape: {X_test.shape}',
]))

X TF-IDF with n-grams's shapes:
   • X train shape: (25209, 1654803)
   • X val shape: (2821, 1654803)
   • X test shape: (3119, 1654803)


In [32]:
# Hyper-parameters applied to every per-tag binary classifier.
# NOTE(review): `class_weight` is keyed by tag column index, but each one-vs-rest
# sub-problem only has the labels 0 and 1 — confirm the intended weighting.
LogReg_cfg = dict(
    C=1e8,
    penalty='l2',
    dual=False,
    class_weight=tag_distr_formated,
    solver='lbfgs',
    random_state=42,
)

# One independent logistic regression per tag column, fitted in parallel.
clf_ovr = OneVsRestClassifier(
    estimator=LogisticRegression(**LogReg_cfg),
    n_jobs=-1,
)

In [33]:
ngram_model_file = save_models_path + 'tf_idf_ngrams.joblib'

if os.path.isfile(ngram_model_file):
    # A cached model is reused as is, even if LogReg_cfg or the features have
    # changed since it was saved — delete the file to force retraining.
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    clf_ovr = load(ngram_model_file)
else:
    # Create the target directory before training so that a missing folder
    # cannot make `dump` fail after the slow fit has already finished.
    os.makedirs(save_models_path, exist_ok=True)
    clf_ovr.fit(X_train, y_train)
    dump(clf_ovr, ngram_model_file)

In [34]:
y_pred_val = clf_ovr.predict(X_val)

# Decode every predicted indicator row back into tag names and store them next
# to the true tags in a fresh copy of the validation frame.
df_val = data_val.assign(predicted_tags=Vec.inverse_transform(y_pred_val))

In [35]:
# Per-tag precision / recall / F1 on the validation split; row i is tag column i.
print(
    'Metrics for TF-IDF with 1-2 n-grams:',
    classification_report(y_val, y_pred_val, zero_division=0),
    sep='\n',
)

Metrics for TF-IDF with 1-2 n-grams:
              precision    recall  f1-score   support

           0       0.47      0.28      0.35        25
           1       0.36      0.27      0.31        45
           2       0.58      0.28      0.37        40
           3       0.71      0.19      0.29        27
           4       0.60      0.44      0.51        27
           5       0.39      0.10      0.16        68
           6       0.45      0.25      0.32        40
           7       0.50      0.19      0.27        59
           8       0.11      0.02      0.04        42
           9       0.47      0.32      0.38       113
          10       0.46      0.32      0.38        50
          11       0.17      0.01      0.02        85
          12       0.22      0.06      0.09        72
          13       0.68      0.52      0.59        58
          14       0.25      0.02      0.04        52
          15       0.00      0.00      0.00        28
          16       0.34      0.10      0.16 

Calculate `recall@k`:

In [36]:
# recallk is not defined in this notebook (presumably provided by ipynb_func);
# it receives the true and the predicted tag columns of the validation frame.
n_gram_tf_idf_recallk = recallk(df_val['tags'], df_val['predicted_tags'])
print(f'Recall@k for TF-IDF with 1-2 n-grams: {n_gram_tf_idf_recallk:.4f}')

Recall@k for TF-IDF with 1-2 n-grams: 0.4016


## 2.4. Training on rubert-tiny-v2 embeddings:

In [37]:
# Folder holding the precomputed rubert-tiny-v2 sentence embeddings.
emb_paths = module_path + '/data/embeddings/rubert-tiny-v2/'

# Only one parquet file is read, but the loader expects a list of paths.
emb_pth = []
emb_pth.append(emb_paths + 'texts.parquet')
emb = merge_dataset(emb_pth)

In [38]:
# Preview the first two embedding rows.
emb.iloc[:2]

Unnamed: 0,id,embedding
0,2936217,"[0.3411005, -0.16877297, -0.3599054, 0.011505239, -0.19693527, 0.16206133, -0.62560713, -0.38459125, -0.08364315, -0.17384137, -0.2905479, 0.8844394, -0.07451144, 1.9678769, 0...."
1,6991412,"[0.3696494, 0.06409113, -0.62138826, -0.8906186, 0.08984075, 0.27482352, -0.31647494, -0.778525, -0.6068895, 0.42193377, -0.05778958, 0.017193496, 0.14765958, 0.35776424, -0.16..."


In [39]:
# Keep only the embeddings of posts that are present in the cleared dataset.
emb = emb[emb['id'].isin(data['id'])]

# Select each split's embeddings by post id, in the row order of the matching data
# split. The targets (y_train / y_val / y_test) are built row by row from
# data_train / data_val / data_test, so features and targets must share that
# order; a plain `isin` filter keeps the order of `emb` instead and is only
# correct when both tables happen to be sorted identically.
# `.loc` also raises a KeyError if any post has no embedding.
_emb_by_id = emb.set_index('id')

emb_train = _emb_by_id.loc[data_train['id']].reset_index()
emb_val = _emb_by_id.loc[data_val['id']].reset_index()
emb_test = _emb_by_id.loc[data_test['id']].reset_index()

# Lengths differ only if an id occurs more than once in the embeddings table.
assert len(emb_train) == len(data_train), "Something went wrong!"
assert len(emb_val) == len(data_val), "Something went wrong!"
assert len(emb_test) == len(data_test), "Something went wrong!"

In [40]:
# Materialise the embedding column of every split as a plain list of vectors.
X_train_emb = list(emb_train.embedding)
X_val_emb = list(emb_val.embedding)
X_test_emb = list(emb_test.embedding)

In [41]:
# Every split should have one fixed-size embedding vector per post.
print('\n'.join([
    "X embeddings shapes:",
    f'   • X train shape: {np.shape(X_train_emb)}',
    f'   • X val shape: {np.shape(X_val_emb)}',
    f'   • X test shape: {np.shape(X_test_emb)}',
]))

X embeddings shapes:
   • X train shape: (25209, 312)
   • X val shape: (2821, 312)
   • X test shape: (3119, 312)


In [42]:
# Hyper-parameters applied to every per-tag binary classifier.
# NOTE(review): `class_weight` is keyed by tag column index, but each one-vs-rest
# sub-problem only has the labels 0 and 1 — confirm the intended weighting.
LogReg_cfg = dict(
    C=1e8,
    penalty='l2',
    dual=False,
    class_weight=tag_distr_formated,
    solver='lbfgs',
    random_state=42,
)

# One independent logistic regression per tag column, fitted in parallel.
clf_ovr = OneVsRestClassifier(
    estimator=LogisticRegression(**LogReg_cfg),
    n_jobs=-1,
)

In [43]:
rubert_model_file = save_models_path + 'rubert.joblib'

if os.path.isfile(rubert_model_file):
    # A cached model is reused as is, even if LogReg_cfg or the features have
    # changed since it was saved — delete the file to force retraining.
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    clf_ovr = load(rubert_model_file)
else:
    # Create the target directory before training so that a missing folder
    # cannot make `dump` fail after the slow fit has already finished.
    os.makedirs(save_models_path, exist_ok=True)
    clf_ovr.fit(X_train_emb, y_train)
    dump(clf_ovr, rubert_model_file)

In [44]:
y_pred_val = clf_ovr.predict(X_val_emb)

# Decode every predicted indicator row back into tag names and store them next
# to the true tags in a fresh copy of the validation frame.
df_val = data_val.assign(predicted_tags=Vec.inverse_transform(y_pred_val))

In [45]:
# Per-tag precision / recall / F1 on the validation split; row i is tag column i.
print(
    'Metrics for rubert embeddings:',
    classification_report(y_val, y_pred_val, zero_division=0),
    sep='\n',
)

Metrics for rubert embeddings:
              precision    recall  f1-score   support

           0       0.31      0.32      0.31        25
           1       0.39      0.31      0.35        45
           2       0.58      0.17      0.27        40
           3       0.47      0.33      0.39        27
           4       0.35      0.30      0.32        27
           5       0.16      0.04      0.07        68
           6       0.29      0.15      0.20        40
           7       0.36      0.20      0.26        59
           8       0.35      0.17      0.23        42
           9       0.42      0.34      0.37       113
          10       0.30      0.22      0.25        50
          11       0.10      0.01      0.02        85
          12       0.25      0.01      0.03        72
          13       0.63      0.59      0.61        58
          14       0.00      0.00      0.00        52
          15       0.00      0.00      0.00        28
          16       0.25      0.03      0.05       

Calculate `recall@k`:

In [46]:
# recallk is not defined in this notebook (presumably provided by ipynb_func);
# it receives the true and the predicted tag columns of the validation frame.
rubert_recallk = recallk(df_val['tags'], df_val['predicted_tags'])
print(f'Recall@k for model with rubert embeddings: {rubert_recallk:.4f}')

Recall@k for model with rubert embeddings: 0.3761


---
# 3. Results

Training with the different text representations gave the following validation scores:

In [47]:
# Side-by-side validation recall@k of the four models.
# The format spec was ':3f', which sets a minimum field WIDTH of 3 and leaves the
# precision at its default of 6 digits; '.4f' gives the four decimals that the
# per-model cells print.
print("Recall@k's for models:\n")
print(f'Recall@k for Bag-of-Words: {bow_recallk:.4f}')
print(f'Recall@k for TF-IDF: {tf_idf_recallk:.4f}')
print(f'Recall@k for TF-IDF with uni- and bi- grams: {n_gram_tf_idf_recallk:.4f}')
print(f'Recall@k for rubert embeddings: {rubert_recallk:.4f}')

Recall@k's for models:

Recall@k for Bag-of-Words: 0.419355
Recall@k for TF-IDF: 0.413329
Recall@k for TF-IDF with uni- and bi- grams: 0.401631
Recall@k for rubert embeddings: 0.376108
