In [1]:
import os.path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
import time
from datetime import datetime as dt
from string import punctuation

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# TODO make as parameters
MAX_DF = 0.8
MIN_DF = 200
N_JOBS = -1
INPUT_DIR = "/content/drive/MyDrive/project_data/full_data"
OUTPUT_DIR = "/content/drive/MyDrive/project_data/full_data"


In [8]:
def read_dataset(input_dir, subset):
    """
    Reads and drops NAs
    :param input_dir: where the (train.csv, test.csv, valid.csv, ...) are located
    :param string subset: the name of the subset to read (train, test, valid, ...)
    :return: pd.DataFrame
    """
    df = pd.read_csv(os.path.join(input_dir, f"{subset}.csv"), header=0)
    initial_shape = df.shape[0]
    df = df.dropna(how="any")
    curr_shape = df.shape[0]
    print(f"There were {curr_shape - initial_shape} empty examples dropped.",
          f"{subset.upper()} set has {curr_shape} rows.")
    return df
(train, valid, test) = (read_dataset(INPUT_DIR, subset) for subset in ["train", "valid", "test"])

There were -36145 empty examples dropped. TRAIN set has 532921 rows.
There were -6002 empty examples dropped. VALID set has 88842 rows.
There were -6196 empty examples dropped. TEST set has 88649 rows.


In [None]:
class Timeit(object):
    def __init__(self, description):
        self.description = description

    def __enter__(self):
        self.start_time = time.time()

    def __exit__(self, type, value, traceback):
        ex_time = time.time() - self.start_time
        print(self.description, ex_time, "seconds")
        return True

In [None]:
logreg = Pipeline(steps=[("vectorizer", TfidfVectorizer(max_df=MAX_DF,
                                                            min_df=MIN_DF,
                                                            stop_words=russian_stopwords,
                                                            token_pattern=u'(?ui)\\b\\w*[а-я]+\\w*\\b')),
                         ("log_reg", LogisticRegression(n_jobs=N_JOBS,
                                                           solver="saga",
                                                           multi_class="multinomial",
                                                           random_state=100500))
                             ],
                  verbose=True)

# train the pipeline
logreg.fit(train["description"], train["targetcat"])

There were -36145 empty examples dropped. TRAIN set has 532921 rows.
There were -6002 empty examples dropped. VALID set has 88842 rows.
There were -6196 empty examples dropped. TEST set has 88649 rows.
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 1.5min
[Pipeline] .......... (step 2 of 2) Processing log_reg, total=267.7min


In [None]:
logreg.named_steps

{'log_reg': LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=100500,
                    solver='saga'),
 'vectorizer': TfidfVectorizer(max_df=0.8, min_df=200,
                 stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...],
                 token_pattern='(?ui)\\b\\w*[а-я]+\\w*\\b')}

In [None]:
suffix = 'test_run'
output_dir = os.path.join(OUTPUT_DIR, suffix)

vectorizer = logreg.named_steps["vectorizer"]
joblib.dump(vectorizer, os.path.join(output_dir, "tf_idf_vectorizer"), compress=3)

['/content/drive/MyDrive/project_data/full_data/test_run/tf_idf_vectorizer']

In [None]:
log_reg = logreg.named_steps["log_reg"]
joblib.dump(log_reg, os.path.join(output_dir, "model_log_reg.joblib"), compress=3)

['/content/drive/MyDrive/project_data/full_data/test_run/model_log_reg.joblib']

In [9]:
log_red_loaded = joblib.load('/content/drive/MyDrive/project_data/full_data/test_run/model_log_reg.joblib')
vectorizer_loaded = joblib.load('/content/drive/MyDrive/project_data/full_data/test_run/tf_idf_vectorizer')
logreg = Pipeline(steps=[("vectorizer", vectorizer_loaded),
                             ("log_reg", log_red_loaded)
                             ],
                      verbose=True)
# evaluation on validation
val_preds = logreg.predict(valid["description"])
val_f1_scores_by_category = f1_score(valid["targetcat"], val_preds, average=None)
val_f1_score_macro = np.mean(val_f1_scores_by_category)
print(f"F1 Macro on validation is: {val_f1_score_macro}")

F1 Macro on validation is: 0.5017989551079045


In [10]:
for score in val_f1_scores_by_category:
  print(score)


0.8571428571428571
0.9047619047619048
0.0
0.0
0.0
0.8444444444444444
0.33333333333333337
0.0
0.4615384615384615
0.25
0.0
0.0
0.7794117647058824
0.962962962962963
0.0
0.8292682926829269
0.6
0.0
0.786206896551724
0.7878787878787877
0.6423357664233577
0.0
0.5
0.9076923076923077
0.4
0.7499999999999999
0.15384615384615385
0.9090909090909091
0.0
0.782608695652174
0.8
0.0
0.9387755102040817
0.9411764705882353
0.923076923076923
0.8484848484848485
1.0
0.0
0.0
0.910958904109589
0.888888888888889
0.6222222222222222
0.0
0.9466666666666667
0.7920792079207921
0.8807339449541285
0.8701298701298701
0.0
0.888888888888889
0.8933333333333333
0.7692307692307692
0.0
1.0
0.8837209302325583
0.6666666666666667
0.0
0.6086956521739131
0.6666666666666666
0.0
0.5714285714285715
0.7384615384615385
0.0
0.4
0.0
0.8749999999999999
0.6538461538461539
0.0
0.0
0.6265060240963854
0.1111111111111111
0.7999999999999999
0.8042328042328041
0.8571428571428571
0.7027027027027029
0.6363636363636364
0.0
0.0
0.6666666666666666
0.

In [11]:
len(val_f1_scores_by_category)

2459

In [13]:
for feature in vectorizer_loaded.get_feature_names_out():
  print(feature)

[1;30;43mПоказано результат, скорочений до останніх рядків (5000).[0m
солнечного
солнечной
солнечном
солнечному
солнечную
солнечные
солнечный
солнечным
солнечными
солнечных
солнца
солнце
солнцем
солод
солода
соль
солью
соляная
соляной
сомнений
сомнения
сон
сонця
сонячних
сообщает
сообщение
сообщений
сообщения
сообщениях
сооружений
сооружения
соответственно
соответствие
соответствии
соответствия
соответствовать
соответствует
соответствуют
соответствующая
соответствующего
соответствующее
соответствующей
соответствующие
соответствующий
соответствующим
соответствующими
соответствующих
соответствующую
соотношение
соотношением
соотношении
соотношению
соотношения
соперников
сопла
сопло
соприкасается
соприкосновения
сопровождается
сопровождать
сопровождение
сопровождением
сопротивление
сопротивлением
сопротивления
сопротивляемость
сопряжение
сопряжения
сопутствующих
сорбат
соревнований
соревнования
соревнованиях
сорняков
сорт
сорта
сортировки
сортов
соска
соскальзывания
соскальзывать
соски
с

In [14]:
len(vectorizer_loaded.get_feature_names_out())

24945

In [18]:
type(val_f1_scores_by_category)
val_f1_scores_by_category_without_nulls = val_f1_scores_by_category[val_f1_scores_by_category != 0]
np.mean(val_f1_scores_by_category_without_nulls)

0.7034912375201466