In [1]:
import json
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import hamming_loss
from sklearn.base import clone
from sklearn.externals import joblib
from mif.tfidf import MifStemmer


# Load the book catalogue, index it by book id and keep only the columns
# used for modelling. The file handle is needed only for the read itself.
with open('books.json', encoding='utf8') as f:
    books = pd.read_json(f)

books.index = books['id']
# Explicit copy: the column selection would otherwise be a slice of the raw
# frame, and assigning into it triggers SettingWithCopy behaviour.
books = books[['title', 'text', 'category', 'tags']].copy()
# Assign the cleaned column back instead of calling
# `books['category'].replace(..., inplace=True)`: an in-place call on a
# selected column may act on a temporary copy and leave `books` untouched
# (and never updates the frame under pandas Copy-on-Write).
# The replacement folds the 'Бизнес  ' label (two trailing spaces) into 'Бизнес'.
books['category'] = (
    books['category']
    .replace('Бизнес  ', 'Бизнес')
    .astype('category')
)

books.head(3)

Unnamed: 0_level_0,title,text,category,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15854,На пределе,На пределе\nНеделя без жалости к себе\n\n7-дне...,Cаморазвитие,"[Лидерство, Мечты и цели, Мотивация, Продуктив..."
16753,Магия утра,Магия утра\nКак первый час дня определяет ваш ...,Cаморазвитие,"[Мечты и цели, Мотивация, Энергия, Осознанност..."
14953,Scrum,Scrum\nРеволюционный метод управления проектам...,Бизнес,"[Мечты и цели, Тайм-менеджмент, Управление люд..."


# Features

In [2]:
# Notebook-wide TF-IDF vectoriser: MifStemmer supplies the tokens and
# min_df=0.003 sets the minimum document frequency for a term to be kept.
vec = TfidfVectorizer(
    tokenizer=MifStemmer(),
    min_df=0.003,
)


def get_X(text):
    """Fit the notebook-level `vec` on `text` and return dense TF-IDF features.

    This *fits* `vec`, so every call replaces its learned vocabulary.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorise. When it is a pandas Series, its index labels
        the rows of the result; otherwise a default RangeIndex is used.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term.
    """
    data = vec.fit_transform(text)
    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; use whichever this installation provides.
    if hasattr(vec, 'get_feature_names_out'):
        columns = vec.get_feature_names_out()
    else:
        columns = vec.get_feature_names()
    # Label rows from the input itself rather than from the global `books`,
    # so the function also works for text that is not the full books column.
    # (isinstance, not getattr: lists and strings have an unrelated .index method.)
    index = text.index if isinstance(text, pd.Series) else None
    # toarray() yields a plain ndarray; todense() would give the legacy np.matrix.
    return pd.DataFrame(data.toarray(), columns=columns, index=index)

    
# Dense TF-IDF feature matrix built from the text of every book.
X = get_X(text=books['text'])
X.head(5)

Unnamed: 0_level_0,agil,airlines,amazon,appl,award,bank,bbc,boeing,business,ce,...,японск,ярк,ярост,ярч,ясн,ясност,яхт,ячейк,ящериц,ящик
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16753,0.0,0.0,0.035329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.049492,0.0,0.0,0.0,0.0
14953,0.0,0.0,0.040166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.099042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.041891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train and predict

In [3]:
def model_train_cv_start(model, X, y, n_splits=10, proba=True, n_jobs=4, cache_dir='cache'):
    """Run shuffled K-fold cross-validation for `model`, caching each fold on disk.

    Each fold is looked up in
    ``<cache_dir>/<ModelClass>-<n_splits>-<proba>-<fold number>.pkl``.
    When the file is missing, the fold is computed with `model_train_cv`
    and the result is stored under that name.

    NOTE(review): the cache key holds only the model's class name, `n_splits`
    and `proba`. Changing hyper-parameters, `X` or `y` does NOT invalidate
    cached folds -- clear `cache_dir` after such changes.
    Cache files are pickles; only load caches this notebook produced.

    Parameters
    ----------
    model : estimator
        Passed unchanged to `model_train_cv` for every fold.
    X, y : pandas objects
        Features and targets; `X` is what KFold splits.
    n_splits : int
        Number of folds (shuffled with the fixed seed 1234).
    proba : bool
        Forwarded to `model_train_cv`; also part of the cache key.
    n_jobs : int
        Currently unused; kept so existing calls stay valid.
    cache_dir : str
        Directory holding the per-fold pickle files; created when missing.

    Returns
    -------
    pandas.DataFrame
        The per-fold results concatenated in fold order.
    """
    import os  # local import: the notebook's import cell does not provide it

    key = '-'.join([model.__class__.__name__, str(n_splits), str(proba)])

    # Without the directory, the first uncached fold would fail inside
    # joblib.dump with a FileNotFoundError raised from the except handler.
    os.makedirs(cache_dir, exist_ok=True)

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1234).split(X)

    results = []
    for n, fold in enumerate(folds, start=1):
        cache_file = os.path.join(cache_dir, '{}-{}.pkl'.format(key, n))
        try:
            result = joblib.load(cache_file)
        except FileNotFoundError:
            result = model_train_cv(fold, model, X, y, proba)
            joblib.dump(result, cache_file)
        results.append(result)

    return pd.concat(results)


def model_train_cv(fold, model, X, y, proba):
    """Fit a clone of `model` on one fold and score that fold's held-out rows.

    `fold` is a ``(train_positions, test_positions)`` pair applied with
    ``.iloc``. The result is indexed like the held-out rows of `X`. It holds
    the `predict_proba` output (default integer column labels), or, when
    `proba` equals False, a single 'predict' column with the `predict` output
    converted to Python lists/scalars.
    """
    train_pos, test_pos = fold
    X_fit, X_held = X.iloc[train_pos, :], X.iloc[test_pos, :]
    y_fit = y.iloc[train_pos]

    # Work on a clone so the caller's estimator is never fitted by this fold.
    estimator = clone(model)
    estimator.fit(X_fit, y_fit)

    # Explicit comparison: only values equal to False select label output.
    if proba != False:
        return pd.DataFrame(estimator.predict_proba(X_held), index=X_held.index)

    predicted = pd.DataFrame(index=X_held.index)
    predicted['predict'] = estimator.predict(X_held).tolist()
    return predicted

# Category

In [4]:
# Fit the category classifier on every book, then tabulate its coefficients
# with one row per category and one column per TF-IDF term.
y = books['category']
category_model = linear_model.LogisticRegression(
    C=100,
    class_weight='balanced',
    random_state=1234,
)
category_model.fit(X, y)
category_coef = pd.DataFrame(
    data=category_model.coef_,
    index=y.cat.categories,
    columns=X.columns,
)
category_coef.head(5)

Unnamed: 0,agil,airlines,amazon,appl,award,bank,bbc,boeing,business,ce,...,японск,ярк,ярост,ярч,ясн,ясност,яхт,ячейк,ящериц,ящик
Cаморазвитие,-0.099232,-0.210232,0.465629,-1.088295,-0.231085,-0.085692,-0.029635,-0.179828,0.087085,-1.446444,...,0.103992,-1.339578,0.388088,0.925427,1.181647,-0.134662,-0.257374,0.225736,0.527411,-0.45279
Бизнес,0.580078,0.080147,-1.215236,4.09841,0.17245,0.13658,-0.257114,-0.073274,1.469505,1.480205,...,0.900925,-0.555056,0.092269,-0.277934,0.615391,0.300077,-0.097826,0.349395,-0.198318,1.040701
Детские книги,-0.044993,-0.013142,-0.837667,-0.86556,0.481984,-0.068817,-0.070898,-0.435178,-0.207422,-0.197768,...,0.158707,0.268703,-0.127696,0.060587,-0.319488,-0.296147,-0.19004,-0.340739,0.438944,0.160904
Здоровый образ жизни,-0.190189,-0.044472,-0.302217,-0.392528,-0.057664,-0.04043,-0.086291,-0.118104,-0.109991,-0.173815,...,-0.365012,-0.229291,-0.184706,-0.225426,-0.194886,-0.493181,-0.07785,-0.051447,-0.338199,-0.049299
Маркетинг,-0.182592,0.290201,0.445814,0.467239,-0.081688,0.059466,-0.049565,-0.101757,-0.406755,0.498275,...,-0.508176,-0.509338,-0.059536,0.226271,-0.184505,-0.12999,-0.052626,-0.078393,-0.080074,-0.170574


In [5]:
# Leave-one-out cross-validation: n_splits equals the number of rows, so
# every fold holds out a single book.
# The target is read from `books` directly rather than from the notebook-level
# `y`, which a later cell rebinds to the tag indicator matrix; re-running this
# cell out of order would otherwise train on the wrong target.
category_target = books['category']
n_books = X.shape[0]
predicts = model_train_cv_start(category_model, X, category_target, proba=False, n_splits=n_books)
probas = model_train_cv_start(category_model, X, category_target, proba=True, n_splits=n_books)
# NOTE(review): assumes the probability columns come back in the same order
# as the sorted category labels -- confirm against the estimator's classes_.
probas.columns = category_target.cat.categories

# Assemble title, true category, predicted category and per-category
# probabilities; assignment and join both align on the book id index.
category_predict = books[['title', 'category']].copy()
category_predict['predict'] = predicts['predict']
category_predict = category_predict.join(probas)

category_predict.head(5)

Unnamed: 0_level_0,title,category,predict,Cаморазвитие,Бизнес,Детские книги,Здоровый образ жизни,Маркетинг,Научпоп,Расширяющие кругозор,Творчество
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15854,На пределе,Cаморазвитие,Cаморазвитие,0.95573,0.016618,0.002994,0.013092,0.001948,0.000886,0.0067,0.002033
16753,Магия утра,Cаморазвитие,Cаморазвитие,0.961739,0.005448,0.000413,0.006449,0.001383,0.000397,0.004124,0.020045
14953,Scrum,Бизнес,Бизнес,0.017981,0.96424,0.002229,0.003099,0.001083,0.002621,0.00642,0.002326
16808,Между надо и хочу,Творчество,Творчество,0.02103,0.007404,0.000926,0.001195,0.001593,0.00066,0.006136,0.961055
15537,Без жалости к себе,Cаморазвитие,Cаморазвитие,0.947702,0.028485,0.001049,0.008349,0.001029,0.003096,0.008867,0.001422


In [6]:
# `DataFrame.ix` was removed in pandas 1.0. Select the probability columns by
# their category labels with `.loc`, which also avoids hard-coding the name of
# the first category column.
category_labels = books['category'].cat.categories
category_proba = category_predict.loc[:, category_labels]

print('Accuracy: {}'.format(accuracy_score(category_predict['category'], category_predict['predict'])))
print('Log Loss: {}'.format(log_loss(category_predict['category'], category_proba)))

Accuracy: 0.8051771117166212
Log Loss: 0.5699728318439758


# Tags

In [7]:
# Binarise each book's tag list into an indicator matrix (one column per tag),
# fit one balanced logistic regression per tag, and tabulate the coefficients.
tags = MultiLabelBinarizer()
tag_matrix = tags.fit_transform(books['tags'])
y = pd.DataFrame(tag_matrix, index=X.index, columns=tags.classes_)

tag_estimator = linear_model.LogisticRegression(
    C=1.0,
    class_weight='balanced',
    random_state=1234,
)
tags_model = OneVsRestClassifier(tag_estimator, n_jobs=4)
tags_model.fit(X, y.values)
tags_model_coef = pd.DataFrame(
    data=tags_model.coef_,
    index=tags.classes_,
    columns=X.columns,
)
tags_model_coef.head(5)

Unnamed: 0,agil,airlines,amazon,appl,award,bank,bbc,boeing,business,ce,...,японск,ярк,ярост,ярч,ясн,ясност,яхт,ячейк,ящериц,ящик
Agile,1.023907,-0.006674,0.056871,-0.051033,-0.006744,-0.004237,-0.006065,-0.007999,-0.027087,-0.032959,...,-0.023017,-0.099,-0.008346,-0.00769,-0.046792,-0.022927,-0.006595,-0.006956,-0.014088,-0.009649
Apple,-0.006282,-0.005403,-0.068665,0.96238,-0.005515,-0.004171,-0.004928,-0.008765,-0.02481,0.582852,...,-0.020532,0.120028,-0.014607,-0.007386,-0.038521,-0.017965,-0.006204,-0.012771,-0.013731,-0.008804
Email-маркетинг,-0.004473,-0.003719,-0.049191,-0.030186,-0.004086,-0.004729,-0.003142,-0.005262,-0.015756,-0.018373,...,-0.01492,-0.067849,-0.004824,-0.004951,-0.027977,-0.013221,-0.00463,-0.004159,-0.008962,-0.005912
HR,-0.031831,-0.033592,-0.018343,-0.144989,-0.015843,-0.012557,-0.017374,-0.027875,0.055149,-0.155766,...,-0.076154,-0.194251,-0.022358,-0.020772,0.489892,0.10421,-0.015696,-0.019246,-0.031973,-0.026602
Handmade,-0.006592,-0.006244,-0.091107,-0.053683,-0.007011,-0.005088,-0.006101,-0.008754,-0.028281,-0.033665,...,-0.022382,0.890508,-0.009657,-0.010678,-0.050016,-0.027592,-0.009471,-0.008205,-0.017379,-0.014699


In [8]:
# Leave-one-out cross-validation of the tag model (one fold per book):
# first the predicted indicator rows, then the per-tag probabilities.
loo_splits = X.shape[0]
predicts = model_train_cv_start(tags_model, X, y, n_splits=loo_splits, proba=False)
probas = model_train_cv_start(tags_model, X, y, n_splits=loo_splits, proba=True)

In [9]:
def convert_tags(tags):
    """Return the tags as one alphabetically sorted, comma-separated string.

    Parameters
    ----------
    tags : iterable of str
        Tag names; a list, tuple or any other iterable is accepted.

    Returns
    -------
    str
        The sorted tags joined with ', ' (an empty string for no tags).
    """
    # sorted() builds a new list. Sorting in place would reorder the caller's
    # list, and DataFrame.copy() does not copy list objects stored in cells,
    # so the lists in the source frame would be mutated too.
    return ', '.join(sorted(tags))


def _decode_tag_row(indicator_row):
    """Turn one predicted indicator row into a sorted, comma-separated tag string."""
    # inverse_transform works on a 2-D matrix, so wrap the single row and
    # unpack the single result.
    (labels,) = tags.inverse_transform(np.array([indicator_row]))
    return convert_tags(list(labels))


# True and predicted tags side by side as readable strings, followed by the
# per-tag probabilities; assignment and join both align on the book id index.
tags_predict = books[['title', 'tags']].copy()
tags_predict['tags'] = tags_predict['tags'].apply(convert_tags)
tags_predict['predict'] = predicts['predict'].apply(_decode_tag_row)

probas.columns = tags.classes_
tags_predict = tags_predict.join(probas)

tags_predict.head(5)

Unnamed: 0_level_0,title,tags,predict,Agile,Apple,Email-маркетинг,HR,Handmade,PR,SMM,...,Фитнес,Фондовый рынок,Фотография,Художественные,Ценообразование,Чевостик,Школьникам,Экономика,Эмоциональный интеллект,Энергия
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15854,На пределе,"Лидерство, Мечты и цели, Мотивация, Продуктивн...","Лидерство, Мечты и цели, Мотивация, Подарочные...",0.041943,0.038147,0.029559,0.133578,0.04611,0.0264,0.047265,...,0.148124,0.052631,0.058232,0.051235,0.029854,0.045684,0.135518,0.098341,0.115681,0.2954
16753,Магия утра,"Гармония, Мечты и цели, Мотивация, Осознанност...","Книги в подарок друзьям, Мечты и цели, Мотивац...",0.038627,0.036719,0.031952,0.101192,0.064383,0.024582,0.042325,...,0.084331,0.043618,0.048034,0.034871,0.025513,0.030724,0.093681,0.06699,0.201382,0.389901
14953,Scrum,"Agile, Бизнес-процессы, Мечты и цели, Руководи...","Agile, Бизнес-процессы, Настольные книги профи...",0.759239,0.041107,0.028174,0.205623,0.049613,0.032082,0.083852,...,0.076091,0.06266,0.053782,0.050297,0.038241,0.043311,0.131849,0.124836,0.075668,0.191346
16808,Между надо и хочу,"Вдохновение, Гармония, Друзья и окружение, Кни...","Вдохновение, Гармония, Друзья и окружение, Кни...",0.032474,0.035678,0.02434,0.110746,0.059548,0.027929,0.039823,...,0.053922,0.036339,0.058013,0.037063,0.025484,0.03956,0.143888,0.072851,0.15882,0.218131
15537,Без жалости к себе,"Книги в подарок друзьям, Книги в подарок колле...","Мечты и цели, Мотивация, Развивай себя сам",0.044671,0.04244,0.031662,0.152324,0.050451,0.032996,0.040901,...,0.091582,0.05394,0.065519,0.050466,0.030175,0.039428,0.122829,0.113785,0.112121,0.208545


In [10]:
# Rebuild indicator matrices from the comma-separated strings in `tags_predict`
# so true and predicted tags are compared row by row.
# NOTE(review): splitting on ', ' assumes no tag name contains that separator -- confirm.
true_tag_lists = tags_predict['tags'].apply(lambda joined: joined.split(', '))
# An empty prediction string means "no tags"; ''.split(', ') would give [''] instead.
pred_tag_lists = tags_predict['predict'].apply(
    lambda joined: [] if joined == '' else joined.split(', ')
)
y_true = tags.transform(true_tag_lists)
y_pred = tags.transform(pred_tag_lists)

print('Hamming Loss: {0}'.format(hamming_loss(y_true, y_pred)))

Hamming Loss: 0.025978775276064823


# Export

In [11]:
# Bundle the fitted transformers, both models, their cross-validated
# predictions and coefficient tables, and persist everything in one pickle.
transformers = {'words': vec, 'tags': tags}
category_bundle = {
    'model': category_model,
    'predict': category_predict,
    'coef': category_coef,
}
tags_bundle = {
    'model': tags_model,
    'predict': tags_predict,
    'coef': tags_model_coef,
}
models = {
    'transform': transformers,
    'category': category_bundle,
    'tags': tags_bundle,
}

joblib.dump(models, 'models.pkl')

['models.pkl']