In [None]:
import dask.bag as db
import json

# Archive that contains both JSONL splits of the ranking dataset.
folder = 'CL_Cup IT_Data_Scince_секция_кейс_VK_датасет.zip'

# Step 1: open each archive member as a dask bag of text lines.
test_lines = db.read_text(
    'zip://ranking_test.jsonl',
    storage_options={'fo': folder},
    encoding='Windows-1251',
)
train_lines = db.read_text(
    'zip://ranking_train.jsonl',
    storage_options={'fo': folder},
    encoding='Windows-1251',
)

# Step 2: parse every line with json.loads.
test_data = test_lines.map(json.loads)
train_data = train_lines.map(json.loads)

In [None]:
# Convert both bags to dataframes and compute them (test first, then train).
test_df, train_df = (
    bag.to_dataframe().compute() for bag in (test_data, train_data)
)

In [None]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

# Fetch the NLTK resources one by one.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)
# The final download stays a bare expression so its return value is still
# shown as the cell output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# One row per (post, comment) pair: explode the `comments` list, pull the
# `text` and `score` entries of each comment dict into their own columns,
# then drop the now-redundant dict column.
exploded_comments = train_df.explode(column='comments')
train = exploded_comments.assign(
    comment=exploded_comments['comments'].map(lambda entry: entry['text']),
    score=exploded_comments['comments'].map(lambda entry: entry['score']),
).drop(labels=['comments'], axis=1)
train.head()

Unnamed: 0,text,comment,score
0,How many summer Y Combinator fundees decided n...,Going back to school is not identical with giv...,0
0,How many summer Y Combinator fundees decided n...,There will invariably be those who don't see t...,1
0,How many summer Y Combinator fundees decided n...,For me school is a way to be connected to what...,2
0,How many summer Y Combinator fundees decided n...,I guess it really depends on how hungry you ar...,3
0,How many summer Y Combinator fundees decided n...,I know pollground decided to go back to school...,4


In [None]:
# Load the stemmed comments and posts
stemmed_posts = pd.read_csv('./stemmed_posts.csv', index_col=0)
stemmed_comments = pd.read_csv('./stemmed_comments.csv', index_col=0)

# Turn each stored string back into a list of tokens: remove single quotes,
# strip the trailing ']' and leading '[', remove spaces, split on commas.
# (presumably the CSV cells are stringified Python lists — verify against the
# notebook that produced them)
for frame, column in ((stemmed_posts, 'text'), (stemmed_comments, 'comment')):
    frame[column] = (
        frame[column]
        .str.replace("'", '')
        .str.rstrip(']')
        .str.lstrip('[')
        .str.replace(' ', '')
        .str.split(',')
    )

In [None]:
# Load the lemmatized comments and posts
lemmatized_posts = pd.read_csv('./posts_lemmatized.csv')
lemmatized_comments = pd.read_csv('./comments_lemmatized.csv')

# Turn each stored string back into a list of tokens: remove single quotes,
# strip the trailing ']' and leading '[', remove spaces, split on commas.
# (presumably the CSV cells are stringified Python lists — verify against the
# notebook that produced them)
for frame, column in ((lemmatized_posts, 'text'), (lemmatized_comments, 'comment')):
    frame[column] = (
        frame[column]
        .str.replace("'", '')
        .str.rstrip(']')
        .str.lstrip('[')
        .str.replace(' ', '')
        .str.split(',')
    )

In [None]:
# Build the column that joins the lemmatized post title and the lemmatized comment.
lemmatized_full_text = (
    lemmatized_posts['text'].str.join(' ')
    + ' ||| '
    + lemmatized_comments['comment'].str.join(' ')
)

# `train` still carries the duplicated post index produced by `explode`
# (0, 0, 0, 0, 0, 1, ...), while the lemmatized frames are read with a default
# 0..n-1 index. Assigning the Series directly aligns on index labels, which
# gives every comment of post k the text stored in row k. The values are
# therefore attached by position after the index has been reset.
# NOTE(review): assumes the lemmatized CSVs hold one row per exploded comment,
# in the same order as `train` — confirm against the notebook that wrote them.
assert len(lemmatized_full_text) == len(train), (
    'lemmatized texts and exploded train rows differ in length'
)

# Reset the index for more convenient indexing when the X and Y arrays are built.
train.reset_index(inplace=True)
train['full_text'] = lemmatized_full_text.values

In [None]:
# Build the column that joins the stemmed post title and the stemmed comment.
stemmed_title_part = stemmed_posts['text'].str.join(' ')
stemmed_comment_part = stemmed_comments['comment'].str.join(' ')
train['full_text_stem'] = stemmed_title_part + ' ||| ' + stemmed_comment_part

In [None]:
%%file svc_mp.py
from sklearn.svm import SVC

def perform_svc(xy_zipped):
    """Fit a linear-kernel SVC on one sample and predict on that same sample.

    Parameters
    ----------
    xy_zipped : sequence
        ``xy_zipped[0]`` is the feature matrix and ``xy_zipped[1]`` the labels.

    Returns
    -------
    tuple
        ``(y_pred_train, y_true)``: the predictions for the rows the model was
        fitted on, and the labels exactly as they were passed in.
    """
    features, labels = xy_zipped[0], xy_zipped[1]
    # `probability=True` is intentionally not set: only `predict` is used here,
    # and enabling probability estimates makes `fit` run an additional internal
    # cross-validation whose result would never be read.
    model = SVC(kernel='linear')
    model.fit(features, labels)
    y_pred_train = model.predict(features)
    return y_pred_train, labels

Overwriting svc_mp.py


In [None]:
%%file logreg_mp.py
from sklearn.linear_model import LogisticRegression

def perform_logreg(xy_zipped):
    """Fit a multinomial logistic regression on one sample and predict on it.

    ``xy_zipped[0]`` is used as the feature matrix and ``xy_zipped[1]`` as the
    labels. Returns the pair ``(predictions_on_the_training_rows, labels)``.
    """
    features = xy_zipped[0]
    targets = xy_zipped[1]
    classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    classifier.fit(features, targets)
    return classifier.predict(features), targets

Writing logreg_mp.py


In [None]:
%%file xgb_mp.py
import xgboost as xgb

def perform_xgb(xy_zipped):
    """Fit an XGBoost classifier on one sample and predict on that same sample.

    ``xy_zipped[0]`` is used as the feature matrix and ``xy_zipped[1]`` as the
    labels. Returns the pair ``(predictions_on_the_training_rows, labels)``.
    """
    features = xy_zipped[0]
    targets = xy_zipped[1]
    booster_settings = {'n_estimators': 1000, 'max_depth': 8, 'eta': 0.5}
    classifier = xgb.XGBClassifier(**booster_settings)
    classifier.fit(features, targets)
    return classifier.predict(features), targets

Overwriting xgb_mp.py


In [None]:
%%file tree_mp.py
from sklearn.tree import DecisionTreeClassifier

def perform_tree(xy_zipped):
    """Fit a depth-limited decision tree on one sample and predict on it.

    Parameters
    ----------
    xy_zipped : sequence
        ``xy_zipped[0]`` is the feature matrix and ``xy_zipped[1]`` the labels.

    Returns
    -------
    tuple
        ``(y_pred_train, y_true)``: the predictions for the rows the model was
        fitted on, and the labels exactly as they were passed in.
    """
    features, labels = xy_zipped[0], xy_zipped[1]
    model = DecisionTreeClassifier(
        criterion='gini', max_depth=8,
        min_samples_leaf=5,
        # Fixed seed so that ties between equally good splits are broken the
        # same way on every run and the reported metrics are reproducible.
        random_state=0,
    )
    model.fit(features, labels)
    y_pred_train = model.predict(features)
    return y_pred_train, labels

Writing tree_mp.py


In [None]:
# Build the samples for X (fulltext_tfidfs) and Y (scores_for_model).
N_SAMPLES = 8        # number of random subsamples drawn from `train`
SAMPLE_SIZE = 12000  # rows per subsample
SAMPLE_SEED = 42     # base seed; subsample i is drawn with SAMPLE_SEED + i

# Seeding every draw keeps the subsamples, and therefore all metrics computed
# from them, identical across kernel restarts.
posts_and_comments = [
    train['full_text'].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED + sample_no)
    for sample_no in range(N_SAMPLES)
]
# The drawn row labels are kept so that other columns can be sampled identically.
indices = [sample.index for sample in posts_and_comments]
# A separate TF-IDF vocabulary is fitted on each subsample.
fulltext_tfidfs = [
    scipy.sparse.csr_matrix(
        TfidfVectorizer(decode_error='ignore', analyzer='word').fit_transform(sample)
    )
    for sample in posts_and_comments
]
scores_for_model = [train['score'][idx] for idx in indices]

In [None]:
# Build the stemmed X samples (fulltextstem_tfidfs) from the previously drawn indices.
posts_and_comments_stemmed = [
    train['full_text_stem'][sample_index] for sample_index in indices
]
# A separate TF-IDF vocabulary is fitted on each stemmed subsample.
fulltextstem_tfidfs = [
    scipy.sparse.csr_matrix(
        TfidfVectorizer(decode_error='ignore', analyzer='word').fit_transform(stemmed_sample)
    )
    for stemmed_sample in posts_and_comments_stemmed
]

In [None]:
%%time

# SVC on the lemmatized samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from svc_mp import perform_svc

svc_tasks = list(zip(fulltext_tfidfs, scores_for_model))
svc_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=svc_workers) as pool:
    res_svc = pool.map(perform_svc, svc_tasks)

CPU times: total: 141 ms
Wall time: 8min 44s


In [None]:
%%time

# LogisticRegression on the lemmatized samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from logreg_mp import perform_logreg

logreg_tasks = list(zip(fulltext_tfidfs, scores_for_model))
logreg_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=logreg_workers) as pool:
    res_logreg = pool.map(perform_logreg, logreg_tasks)

CPU times: total: 93.8 ms
Wall time: 8.44 s


In [None]:
%%time

# XGBClassifier on the lemmatized samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from xgb_mp import perform_xgb

xgb_tasks = list(zip(fulltext_tfidfs, scores_for_model))
xgb_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=xgb_workers) as pool:
    res_xgb = pool.map(perform_xgb, xgb_tasks)

CPU times: total: 93.8 ms
Wall time: 14min 23s


In [None]:
%%time

# DecisionTreeClassifier on the lemmatized samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from tree_mp import perform_tree

tree_tasks = list(zip(fulltext_tfidfs, scores_for_model))
tree_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=tree_workers) as pool:
    res_tree = pool.map(perform_tree, tree_tasks)

CPU times: total: 141 ms
Wall time: 2.47 s


In [None]:
%%time

# SVC on the stemmed samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from svc_mp import perform_svc

svc_stemmed_tasks = list(zip(fulltextstem_tfidfs, scores_for_model))
svc_stemmed_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=svc_stemmed_workers) as pool:
    res_svc_for_stemmed = pool.map(perform_svc, svc_stemmed_tasks)

CPU times: total: 156 ms
Wall time: 7min 47s


In [None]:
%%time

# LogisticRegression on the stemmed samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from logreg_mp import perform_logreg

logreg_stemmed_tasks = list(zip(fulltextstem_tfidfs, scores_for_model))
logreg_stemmed_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=logreg_stemmed_workers) as pool:
    res_logreg_for_stemmed = pool.map(perform_logreg, logreg_stemmed_tasks)

CPU times: total: 109 ms
Wall time: 9.06 s


In [None]:
%%time

# XGBClassifier on the stemmed samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from xgb_mp import perform_xgb

xgb_stemmed_tasks = list(zip(fulltextstem_tfidfs, scores_for_model))
xgb_stemmed_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=xgb_stemmed_workers) as pool:
    res_xgb_for_stemmed = pool.map(perform_xgb, xgb_stemmed_tasks)

CPU times: total: 141 ms
Wall time: 12min 20s


In [None]:
%%time

# DecisionTreeClassifier on the stemmed samples: one task per sample, pool sized to the CPU count.
import multiprocessing
from tree_mp import perform_tree

tree_stemmed_tasks = list(zip(fulltextstem_tfidfs, scores_for_model))
tree_stemmed_workers = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=tree_stemmed_workers) as pool:
    res_tree_for_stemmed = pool.map(perform_tree, tree_stemmed_tasks)

CPU times: total: 125 ms
Wall time: 2.6 s


In [None]:
from sklearn.metrics import ndcg_score, f1_score
from scipy import stats

def model_pivot_table(mp_results):
    """Summarise the per-sample metrics of one model as a table.

    Parameters
    ----------
    mp_results : iterable of pairs
        One ``(y_pred, y_true)`` pair per sample, in the order returned by the
        ``perform_*`` workers.

    Returns
    -------
    pandas.DataFrame
        Columns ``'f1 score'`` (weighted F1) and ``'NDCG score'``. There is one
        row per sample, followed by a row with the harmonic mean and a row
        with the standard deviation of each column.

    Raises
    ------
    ValueError
        If ``mp_results`` contains no samples.
    """
    columns = ['f1 score', 'NDCG score']
    # A single pass over the results, so any iterable (not only a list) works.
    per_sample_rows = [
        (
            f1_score(pred_true[1], pred_true[0], average='weighted'),
            # ndcg_score expects 2-D input, hence the wrapping lists: the whole
            # sample is scored as one ranking.
            ndcg_score([pred_true[1]], [pred_true[0]]),
        )
        for pred_true in mp_results
    ]
    if not per_sample_rows:
        raise ValueError('mp_results must contain at least one (y_pred, y_true) pair')

    metrics_pivottable = pd.DataFrame(per_sample_rows, columns=columns)
    statistics = pd.DataFrame(
        [
            stats.hmean(metrics_pivottable.values, axis=0),
            metrics_pivottable.std().values,
        ],
        columns=columns,
    )
    summary = pd.concat([metrics_pivottable, statistics])
    # Row labels follow the actual number of samples instead of a fixed count.
    sample_labels = [f'Сэмпл {i+1}' for i in range(len(per_sample_rows))]
    summary.index = pd.Index(sample_labels + ['среднегармонич.', 'станд. отклон.'])
    return summary

In [None]:
# Summary table for SVC: lemmatized inputs first, stemmed inputs second.
svc_tables = [
    model_pivot_table(results) for results in (res_svc, res_svc_for_stemmed)
]
pd.concat(svc_tables, axis=1)

Unnamed: 0,f1 score,NDCG score,f1 score.1,NDCG score.1
Сэмпл 1,0.746992,0.976348,0.759408,0.978275
Сэмпл 2,0.747972,0.976505,0.753679,0.977408
Сэмпл 3,0.744444,0.974748,0.761177,0.977585
Сэмпл 4,0.748595,0.976165,0.759982,0.977768
Сэмпл 5,0.74977,0.975759,0.75969,0.975461
Сэмпл 6,0.750069,0.975141,0.765203,0.977979
Сэмпл 7,0.740322,0.974562,0.763951,0.978386
Сэмпл 8,0.746692,0.976128,0.760636,0.977539
среднегармонич.,0.746845,0.975669,0.760452,0.97755
станд. отклон.,0.003199,0.000754,0.003446,0.000914


In [None]:
# Summary table for LogisticRegression: lemmatized inputs first, stemmed inputs second.
logreg_tables = [
    model_pivot_table(results) for results in (res_logreg, res_logreg_for_stemmed)
]
pd.concat(logreg_tables, axis=1)

Unnamed: 0,f1 score,NDCG score,f1 score.1,NDCG score.1
Сэмпл 1,0.772179,0.978343,0.761955,0.976718
Сэмпл 2,0.776082,0.979185,0.761208,0.976862
Сэмпл 3,0.770046,0.976987,0.768592,0.976975
Сэмпл 4,0.773532,0.978169,0.766696,0.97607
Сэмпл 5,0.773169,0.977747,0.765316,0.974743
Сэмпл 6,0.774824,0.978171,0.765421,0.975884
Сэмпл 7,0.773147,0.977378,0.765883,0.976874
Сэмпл 8,0.772832,0.978143,0.765422,0.976159
среднегармонич.,0.773223,0.978015,0.765055,0.976285
станд. отклон.,0.00178,0.000663,0.002409,0.000751


In [None]:
# Summary table for XGBClassifier: lemmatized inputs first, stemmed inputs second.
xgb_tables = [
    model_pivot_table(results) for results in (res_xgb, res_xgb_for_stemmed)
]
pd.concat(xgb_tables, axis=1)

Unnamed: 0,f1 score,NDCG score,f1 score.1,NDCG score.1
Сэмпл 1,0.947581,0.995174,0.999667,0.999929
Сэмпл 2,0.94643,0.99451,1.0,1.0
Сэмпл 3,0.944729,0.994445,0.999583,0.999905
Сэмпл 4,0.94767,0.994799,0.999917,0.999992
Сэмпл 5,0.950585,0.995004,0.999917,0.999955
Сэмпл 6,0.946918,0.994736,1.0,1.0
Сэмпл 7,0.947588,0.995322,0.99975,0.99998
Сэмпл 8,0.947169,0.995222,1.0,1.0
среднегармонич.,0.947331,0.994901,0.999854,0.99997
станд. отклон.,0.001631,0.00033,0.000165,3.7e-05


In [None]:
# Summary table for DecisionTreeClassifier: lemmatized inputs first, stemmed inputs second.
tree_tables = [
    model_pivot_table(results) for results in (res_tree, res_tree_for_stemmed)
]
pd.concat(tree_tables, axis=1)

Unnamed: 0,f1 score,NDCG score,f1 score.1,NDCG score.1
Сэмпл 1,0.075993,0.926014,0.239987,0.938084
Сэмпл 2,0.099862,0.928198,0.229866,0.935087
Сэмпл 3,0.080512,0.927673,0.226904,0.932296
Сэмпл 4,0.085855,0.92339,0.192769,0.93138
Сэмпл 5,0.081137,0.922561,0.213044,0.93159
Сэмпл 6,0.086369,0.922849,0.204642,0.933182
Сэмпл 7,0.082978,0.929281,0.192358,0.930941
Сэмпл 8,0.108727,0.927863,0.233476,0.932472
среднегармонич.,0.086587,0.925972,0.215204,0.933124
станд. отклон.,0.011015,0.002684,0.018613,0.002382
