# Загрузка нужных библиотек

In [None]:
!pip install scikit-learn==0.23.2



In [None]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC

# Подготовка данных

In [None]:
# Загружаем исходные данные из паркет-файла и выполняем небольшую предобработку
def replace_re(text, regexp, inplace):
    """Replace every match of a compiled pattern inside ``text``.

    Args:
        text: The string to process.
        regexp: A compiled regular expression (any object exposing ``sub``).
        inplace: The replacement handed to ``regexp.sub``. Despite the name,
            nothing is modified in place; a new string is returned.

    Returns:
        The value of ``regexp.sub(inplace, text)``.
    """
    substituted = regexp.sub(inplace, text)
    return substituted

# Patterns used for normalisation: the characters '!' and '?', and runs of
# two or more consecutive spaces.
regex_punc = re.compile('[%s]'%re.escape('!?'))
regex_seq_spaces = re.compile(' {2,}')


def _normalize_item_name(name):
    """Normalise a single item name.

    Steps, in order: replace '!' and '?' with a space, strip leading/trailing
    whitespace, collapse runs of spaces into one space, lower-case the result.
    """
    stripped = regex_punc.sub(' ', name).strip()
    return regex_seq_spaces.sub(' ', stripped).lower()


# Read only the two columns that are used, normalise the names in one pass,
# then drop rows labelled -1 and keep the first row for each distinct name.
data = pd.read_parquet("data/data_fusion_train.parquet", columns=['item_name', 'category_id'])
data['item_name'] = data['item_name'].apply(_normalize_item_name)
data = data[data.category_id != -1].drop_duplicates('item_name')

In [None]:
# Load the pseudo-labelled data.
# No preprocessing is applied here because the pseudo-labelled item_name
# values are already preprocessed.
# The frame must be bound to `df_pseudo`: the following lines and later cells
# read that name (the original bound it to `df_pseudo1`, causing a NameError).
df_pseudo = pd.read_csv("data/pseudo.csv", lineterminator='\n')
df_pseudo.columns = ['item_name', 'category_id']
# Rows without a category label cannot be used for training.
df_pseudo = df_pseudo.dropna(subset=['category_id'])

In [None]:
# Stack the labelled and pseudo-labelled rows, drop rows with no category,
# and keep the first row for every distinct item name.
label_columns = ['item_name', 'category_id']
data_total = (
    pd.concat([data[label_columns], df_pseudo[label_columns]], ignore_index=True)
    .dropna(subset=['category_id'])
    .drop_duplicates('item_name')
)
data_total.shape

# Кросс-валидация модели

In [None]:
# Character n-gram (within word boundaries) TF-IDF features over all item names.
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 6),
    max_features=400000,
    sublinear_tf=True,
)
# fit_transform learns the vocabulary/IDF and vectorises the same texts in one call.
X_train = tfidf.fit_transform(data_total.item_name)

In [None]:
# Per-category row counts; value_counts() sorts descending, so the first
# entry is the size of the largest category.
category_counts = data_total.category_id.value_counts()
max_count = category_counts.iloc[0]
# Weight of a category = (largest count / its count) ** 0.4.
class_w = (max_count / category_counts)**0.4
class_w

In [None]:
# Linear SVM with the per-category weights computed above.
clf = LinearSVC(C=0.5, class_weight=class_w.to_dict(), random_state=9)
# 5-fold cross-validation scored with the weighted F1 measure, using all cores.
scores = cross_val_score(
    clf,
    X_train,
    data_total.category_id,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print(scores)
# Summary statistics over the fold scores.
for label, statistic in (('mean: ', np.mean), ('std: ', np.std)):
    print(label, statistic(scores))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.7min finished


[0.87094666 0.90554257 0.96364821 0.97657373 0.98136195]
mean:  0.9396146210654601
std:  0.043732419172669054


LinearSVC(C=0.5,
          class_weight={0: 1.498715525612857, 1: 8.468849077048153,
                        2: 2.6179179733392917, 3: 7.548295590444579,
                        4: 5.523328588812379, 6: 11.275617281682273,
                        7: 6.6188402208575114, 9: 9.32900991578163,
                        11: 12.599981256166478, 12: 5.574750190630157,
                        13: 8.847972266014663, 19: 10.142815375005775,
                        20: 8.937274516112916, 24: 8.847972266014663,
                        26: 17.498016771742176, 27: 8.847972266014663,
                        29: 4.818011677497765, 30: 3.979348353002764,
                        31: 14.061914796550246, 35: 16.625774943101856,
                        36: 5.218174762168363, 37: 7.151728394616794,
                        38: 1.3717583354294762, 39: 10.771227850759322,
                        40: 3.148421693130888, 41: 9.706203837376517,
                        42: 7.704766057781605, 43: 4.137885719845835,
  

In [None]:
# Fit the classifier on the full TF-IDF matrix and the labels of data_total.
clf.fit(X_train, data_total['category_id'])

# Анализ качества классификации

In [None]:
# NOTE(review): this cell rebinds `X_train`, `class_w`, `tfidf` and `clf`.
# The `tfidf` and `clf` objects pickled at the end of the notebook are therefore
# the ones fitted here (on the 80% split plus pseudo labels), not the ones
# fitted on `data_total` earlier — confirm that this is intended.

# Hold out 20% of the labelled data, stratified by category, for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data[['item_name']], data['category_id'],
    stratify=data['category_id'], test_size=0.2, random_state=0, shuffle=True)

# Pseudo-labelled rows are appended to the training part only.
X_train = pd.concat([X_train, df_pseudo[['item_name']]])
y_train = pd.concat([y_train, df_pseudo['category_id']])

# y_train is a Series, so its values are counted directly; it has no
# `category_id` attribute (the original `y_train.category_id` raised AttributeError).
train_counts = y_train.value_counts()
max_count = train_counts.values[0]
class_w = (max_count / train_counts)**0.4

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# The vectoriser is fitted on the training texts only and then applied to both parts.
tfidf = TfidfVectorizer(max_features=400000, analyzer='char_wb', ngram_range=(2, 6), sublinear_tf=True)
tfidf.fit(X_train.item_name)
X_train = tfidf.transform(X_train.item_name)
X_test = tfidf.transform(X_test.item_name)

clf = LinearSVC(random_state=9, C=0.5, class_weight=class_w.to_dict())
clf.fit(X_train, y_train)
# Weighted F1 on the training part and on the held-out part.
print(f'f1_score_train: {f1_score(y_train, clf.predict(X_train), average="weighted")}')
print(f'f1_score_test: {f1_score(y_test, clf.predict(X_test), average="weighted")}')


(145815, 1) (145815,)
(9579, 1) (9579,)
f1_score_train: 0.9932623300770694
f1_score_test: 0.8500957328412497


In [None]:
# Per-category precision/recall/F1 on the held-out part.
# zero_division=0 reports 0.0 for metrics that are undefined (e.g. a category
# that is never predicted) without emitting the warning the default setting raises.
print(classification_report(y_test, clf.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       469
           1       1.00      1.00      1.00         6
           2       0.95      0.87      0.91        62
           3       0.95      0.86      0.90        22
           4       0.74      0.64      0.69        45
           6       0.82      0.69      0.75        13
           7       0.96      1.00      0.98        44
           9       0.91      0.95      0.93        21
          11       0.64      0.78      0.70         9
          12       0.81      0.81      0.81        36
          13       0.80      0.57      0.67         7
          19       0.86      0.40      0.55        15
          20       1.00      0.75      0.86        12
          24       0.67      0.46      0.55        13
          26       0.25      0.25      0.25         4
          27       0.78      0.88      0.82         8
          29       0.73      0.92      0.81        26
          30       0.76    

  _warn_prf(average, modifier, msg_start, len(result))


# Формирование файлов для сабмита

In [None]:
# Create the output directory; exist_ok=True makes the cell safe to re-run
# when the directory is already there.
os.makedirs('submit_dir', exist_ok=True)

In [None]:
# Serialise the fitted vectoriser and classifier for the submission.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('submit_dir/tfidf', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open('submit_dir/clf_task1', 'wb') as clf_file:
    pickle.dump(clf, clf_file)