In [None]:
import os.path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
import time
from datetime import datetime as dt
from string import punctuation

In [None]:
from google.colab import drive

# Mount Google Drive at the path the data directories below are rooted in.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from psutil import virtual_memory

# Total RAM reported by psutil, converted from bytes to (decimal) gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20
      else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import nltk
from nltk.corpus import stopwords
# The stopwords corpus has to be downloaded before stopwords.words() is called.
nltk.download("stopwords")
# Russian stop-word list; passed to TfidfVectorizer(stop_words=...) further down.
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# TODO make as parameters
# Passed to TfidfVectorizer(max_df=...) when the pipeline is built.
MAX_DF = 0.8
# Passed to TfidfVectorizer(min_df=...) when the pipeline is built.
MIN_DF = 200
# Passed to LogisticRegression(n_jobs=...).
N_JOBS = -1
# Directory read_dataset() loads train.csv / valid.csv / test.csv from.
INPUT_DIR = "/content/drive/MyDrive/project_data/all_data_cats_with_gt50"
# Base directory for saved artifacts; a run-specific sub-directory is appended later.
OUTPUT_DIR = "/content/drive/MyDrive/project_data/all_data_cats_with_gt50"


In [None]:
def read_dataset(input_dir, subset):
    """
    Read one subset CSV and drop every row that contains a missing value.

    Prints how many rows were dropped and how many remain.

    :param input_dir: where the (train.csv, test.csv, valid.csv, ...) are located
    :param string subset: the name of the subset to read (train, test, valid, ...)
    :return: pd.DataFrame with the rows that have no NA in any column
    """
    df = pd.read_csv(os.path.join(input_dir, f"{subset}.csv"), header=0)
    initial_rows = df.shape[0]
    df = df.dropna(how="any")
    remaining_rows = df.shape[0]
    # Rows before minus rows after, so the reported count is non-negative.
    dropped_rows = initial_rows - remaining_rows
    print(f"There were {dropped_rows} empty examples dropped.",
          f"{subset.upper()} set has {remaining_rows} rows.")
    return df
# Load the three splits from INPUT_DIR, in the order they are unpacked.
train, valid, test = [read_dataset(INPUT_DIR, split_name)
                      for split_name in ("train", "valid", "test")]

There were -35550 empty examples dropped. TRAIN set has 519202 rows.
There were -5878 empty examples dropped. VALID set has 86581 rows.
There were -5905 empty examples dropped. TEST set has 86554 rows.


In [None]:
class Timeit(object):
    """
    Context manager that prints how long the wrapped block took.

    On exit it prints ``description``, the elapsed wall-clock time measured
    with ``time.time()``, and the word "seconds". Exceptions raised inside
    the ``with`` block are reported the same way and then propagate.
    """

    def __init__(self, description):
        """
        :param description: text printed in front of the elapsed time
        """
        self.description = description

    def __enter__(self):
        self.start_time = time.time()
        # Return the instance so `with Timeit(...) as t` binds something usable.
        return self

    def __exit__(self, type, value, traceback):
        ex_time = time.time() - self.start_time
        print(self.description, ex_time, "seconds")
        # A truthy return value would swallow any exception raised in the
        # timed block; return False so failures are not hidden.
        return False

In [None]:
# TF-IDF features. The token pattern only keeps tokens that contain at least
# one character in the range а-я (the inline flags make it case-insensitive).
tfidf_step = TfidfVectorizer(
    max_df=MAX_DF,
    min_df=MIN_DF,
    stop_words=russian_stopwords,
    token_pattern=u'(?ui)\\b\\w*[а-я]+\\w*\\b',
)

# Multinomial logistic regression fitted with the SAGA solver.
classifier_step = LogisticRegression(
    n_jobs=N_JOBS,
    solver="saga",
    multi_class="multinomial",
    random_state=100500,
)

logreg = Pipeline(
    steps=[("vectorizer", tfidf_step), ("log_reg", classifier_step)],
    verbose=True,
)

# train the pipeline
logreg.fit(train["description"], train["targetcat"])

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total= 1.4min
[Pipeline] .......... (step 2 of 2) Processing log_reg, total=167.8min


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.8, min_df=200,
                                 stop_words=['и', 'в', 'во', 'не', 'что', 'он',
                                             'на', 'я', 'с', 'со', 'как', 'а',
                                             'то', 'все', 'она', 'так', 'его',
                                             'но', 'да', 'ты', 'к', 'у', 'же',
                                             'вы', 'за', 'бы', 'по', 'только',
                                             'ее', 'мне', ...],
                                 token_pattern='(?ui)\\b\\w*[а-я]+\\w*\\b')),
                ('log_reg',
                 LogisticRegression(multi_class='multinomial', n_jobs=-1,
                                    random_state=100500, solver='saga'))],
         verbose=True)

In [None]:
logreg.named_steps

{'log_reg': LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=100500,
                    solver='saga'),
 'vectorizer': TfidfVectorizer(max_df=0.8, min_df=200,
                 stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...],
                 token_pattern='(?ui)\\b\\w*[а-я]+\\w*\\b')}

In [None]:
suffix = 'test_run'
output_dir = os.path.join(OUTPUT_DIR, suffix)
# joblib.dump does not create missing directories, so make sure the run
# directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

# Persist the fitted TF-IDF step of the pipeline on its own.
vectorizer = logreg.named_steps["vectorizer"]
joblib.dump(vectorizer, os.path.join(output_dir, "tf_idf_vectorizer"), compress=3)

['/content/drive/MyDrive/project_data/all_data_cats_with_gt50/test_run/tf_idf_vectorizer']

In [None]:
# Persist the fitted classifier step of the pipeline into output_dir.
log_reg = logreg.named_steps["log_reg"]
model_path = os.path.join(output_dir, "model_log_reg.joblib")
joblib.dump(log_reg, model_path, compress=3)

['/content/drive/MyDrive/project_data/all_data_cats_with_gt50/test_run/model_log_reg.joblib']

In [None]:
# Rebuild the pipeline from the saved artifacts. The location is derived from
# OUTPUT_DIR (same 'test_run' sub-directory the artifacts were dumped to)
# instead of a hard-coded absolute path, so it follows the config cell.
saved_model_dir = os.path.join(OUTPUT_DIR, "test_run")
log_red_loaded = joblib.load(os.path.join(saved_model_dir, "model_log_reg.joblib"))
vectorizer_loaded = joblib.load(os.path.join(saved_model_dir, "tf_idf_vectorizer"))
logreg = Pipeline(steps=[("vectorizer", vectorizer_loaded),
                         ("log_reg", log_red_loaded)
                         ],
                  verbose=True)
# evaluation on validation
val_preds = logreg.predict(valid["description"])
# average=None yields one F1 value per category; their plain mean is the macro F1.
val_f1_scores_by_category = f1_score(valid["targetcat"], val_preds, average=None)
val_f1_score_macro = np.mean(val_f1_scores_by_category)
print(f"F1 Macro on validation is: {val_f1_score_macro}")

F1 Macro on validation is: 0.6780332451862325


In [None]:
# f1_score(..., average=None) returns one score per label, ordered by the
# sorted set of labels that occur in y_true or y_pred. Pair the scores with
# exactly those labels; zipping with val_preds (one entry per validation row)
# would attach unrelated category names to the scores.
val_labels = np.union1d(valid["targetcat"], val_preds)
# NOTE(review): this rebinds `test`, which previously held the test split;
# the name is kept because the following cell reads `test`.
test = pd.DataFrame({'cat': val_labels, 'score': val_f1_scores_by_category})
len(test[test.score < 0.5])

334

In [None]:
# Rows of the score table whose F1 is below 0.5.
test.loc[test["score"] < 0.5]


Unnamed: 0,cat,score
4,3D и VR очки,0.125000
16,"Пены, герметики, клеи",0.000000
24,Ветпрепараты,0.000000
34,Парикмахерские тележки,0.434783
35,Комбинированные ключи,0.363636
...,...,...
1650,Бра и настенно-потолочные светильники,0.000000
1655,Ватные палочки,0.465517
1659,Косметика для бритья,0.000000
1660,Стартовый набор для маникюра,0.416667


In [None]:
# Number of per-category F1 scores.
val_f1_scores_by_category.shape[0]

1671

In [None]:
# Show the per-category F1 array returned by f1_score(..., average=None).
val_f1_scores_by_category

array([0.84545455, 0.91304348, 0.97674419, ..., 0.83018868, 0.62686567,
       0.83443709])

In [None]:
# Number of features produced by the loaded TF-IDF vectorizer.
feature_names = vectorizer_loaded.get_feature_names_out()
len(feature_names)

24454

In [None]:
# Mean F1 over the categories whose score is not exactly zero.
val_f1_scores_by_category_without_nulls = val_f1_scores_by_category[val_f1_scores_by_category != 0]
np.mean(val_f1_scores_by_category_without_nulls)

0.717084527029237