# Logistic Regression for topic classification
---

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import os
import json

In [2]:
module_path = os.path.abspath(os.path.join('..\..')) # Path to root folder
if module_path not in sys.path:
    sys.path.append(module_path + "/scripts") # define scripts path

from ipynb_func import *

Data loader:

In [3]:
NUM = 10 # Number of data parquets to use
assert NUM >= 1 and NUM <= 10, "NUM value must be in range [1, 10]"

# Making list of roots to merge processed raw data 
#paths = [module_path + f"/data/pikabu/tag_processed/raw_data/{i}_tag_processed.parquet" for i in range(NUM)] 

# Making list of roots to merge processed filtered data
#paths = [module_path + f"/data/pikabu/tag_processed/filtered_data/{i}_tag_processed.parquet" for i in range(NUM)] 

# Making list of roots to merge processed cleared data
paths = [module_path + f"/data/pikabu/splited_data/cleared_texts.parquet"] 

data = merge_dataset(paths)

In [4]:
data.head(3)

Unnamed: 0,id,text_markdown,tags
15,6991359,"[добрый, сутки, господин, дама, подсказывать, ...","[игры, поиск]"
37,7004423,"[ехать, девчонка, школа, оставаться, свободный...",[юмор]
52,6991603,"[стадо, стадо, гигантский, случаться, стадо, у...",[мат]


In [5]:
with open(module_path + f"/data/pikabu/splited_data/indexes.json") as f:
    id_splits = f.read()

id_splits = json.loads(id_splits)

data_train = data[data['id'].isin(id_splits['train'])]
data_test = data[data['id'].isin(id_splits['test'])]
data_val = data[data['id'].isin(id_splits['val'])]

---

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier

In [7]:
X_data = [' '.join(txt) for txt in data.text_markdown]
X_train = [' '.join(txt) for txt in data_train.text_markdown]
X_val = [' '.join(txt) for txt in data_val.text_markdown]
X_test = [' '.join(txt) for txt in data_test.text_markdown]

In [8]:
Tfidf_Vec = TfidfVectorizer(tokenizer = lambda x: x.split())

Tfidf_Vec.fit(X_data)
X_train = Tfidf_Vec.transform(X_train)
X_test = Tfidf_Vec.transform(X_test)
X_val = Tfidf_Vec.transform(X_val)



In [9]:
Vec = CountVectorizer(tokenizer=lambda x: x.split(','), binary=True)

df = data.copy()
df.tags = [','.join(i) for i in df.tags]

df_train = data_train.copy()
df_train.tags = [','.join(i) for i in df_train.tags]

df_val = data_val.copy()
df_val.tags = [','.join(i) for i in df_val.tags]

df_test = data_test.copy()
df_test.tags = [','.join(i) for i in df_test.tags]

y_data = Vec.fit(df['tags'])
y_train = Vec.transform(df_train['tags'])
y_val = Vec.transform(df_val['tags'])
y_test = Vec.transform(df_test['tags'])

In [10]:
print(f'Number of tags: {len(getworddict(getwordlist(data.tags)))}')
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'y_val shape: {y_val.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

Number of tags: 60
X_train shape: (25209, 5899)
y_train shape: (25209, 60)
X_val shape: (2821, 5899)
y_val shape: (2821, 60)
X_test shape: (3119, 5899)
y_test shape: (3119, 60)


In [11]:
clf = OneVsRestClassifier(LogisticRegression(C=1e3, 
                                             random_state=42))

In [12]:
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [13]:
y_pred_val = clf.predict(X_val)

df_val = data_val.copy()
df_val['predicted_tags'] = Vec.inverse_transform(y_pred_val)

In [18]:
df_val.head(10)

Unnamed: 0,id,text_markdown,tags,predicted_tags
421,6992880,"[популярный, пк, игра, создавать, устройство, ...",[помогите найти],[]
432,6992917,"[профессия, оказываться, сопровождать, образов...",[психология],[]
578,6994231,"[предыдущий, пост, бригада, график, переписыва...","[юмор, реальная история из жизни]",[мат]
591,6992488,"[широкий, хотеться, обращать, внимание, инциде...",[помощь],[новости]
807,7021892,"[история, скоро, девушка, упасть, дерево, паре...","[рассказ, фантастика, мат]","[авторский рассказ, история, рассказ, фантастика]"
830,7159946,"[момент, кончаться, поздравление, лень, истори...",[новый год],"[помощь, реальная история из жизни]"
849,7079530,"[отменять, понедельник, хватить, военный, танк...",[россия],"[коронавирус, стихи]"
868,7125770,"[воспитание, родитель, учить, сын, девочка, об...",[дети],[общество]
905,7047027,"[успешно, запускать, космос, партия, спутник, ...",[новости],[]
973,7164278,"[одноразовый, неделька, забывать, разбегаться,...","[стихи, общество]",[стихи]


In [15]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.50      0.24      0.32        25
           1       0.37      0.22      0.28        45
           2       0.65      0.28      0.39        40
           3       0.50      0.22      0.31        27
           4       0.58      0.41      0.48        27
           5       0.15      0.12      0.13        68
           6       0.56      0.25      0.34        40
           7       0.36      0.25      0.30        59
           8       0.07      0.02      0.04        42
           9       0.33      0.30      0.31       113
          10       0.38      0.18      0.24        50
          11       0.03      0.04      0.03        85
          12       0.10      0.10      0.10        72
          13       0.66      0.47      0.55        58
          14       0.06      0.02      0.03        52
          15       0.00      0.00      0.00        28
          16       0.15      0.14      0.14       135
          17       0.78    

  _warn_prf(average, modifier, msg_start, len(result))
