In [93]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

In [214]:
with open('data/ranking_train.jsonl', 'r') as f:
    target = [json.loads(line) for line in f]

rows = []
for d in target:
    text = d['text']
    comments = [c['text'] for c in d['comments']]
    score = [c['score'] for c in d['comments']]
    rows.append({'text': text, 'comments': comments, 'scores':score})

df = pd.DataFrame(rows)
df = df.explode('comments')
unknown = []
for d in target:
    for c in d['comments']:
        unknown.append(c['score'])
df['scores'] = unknown
df.reset_index(drop=True, inplace = True)

In [215]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [216]:
df = df.sort_values('text').reset_index(drop=True)

Обучаем только на комментариях

In [217]:
X = df['comments']
Y = df['scores']

In [218]:
X.shape

(440535,)

Не перемешиваем, так как комментарии и так перемешаны в разрезе текста. Но в то же время берем тестовые тексты из центра распределения, как средние.

In [221]:
X_test = X[240000:340000] 

In [227]:
X_train = X[:240000].append(X[340000:], ignore_index=True)

  X_train = X[:240000].append(X[340000:], ignore_index=True)


In [232]:
Y_test = Y[240000:340000] 

In [233]:
Y_train = Y[:240000].append(Y[340000:], ignore_index=True)

  Y_train = Y[:240000].append(Y[340000:], ignore_index=True)


In [256]:
vectorizer = CountVectorizer(lowercase=True, token_pattern=r'\b\w+\b', stop_words={'english'})

In [257]:
X_train_vect = vectorizer.fit_transform(X_train)

In [258]:
X_test_vect = vectorizer.transform(X_test)

In [259]:
clf = MultinomialNB()
clf.fit(X_train_vect, Y_train)

In [260]:
y_pred = clf.predict(X_test_vect)

In [261]:
y_pred_ndcg = y_pred.reshape((-1,5))

In [262]:
Y_test_ndcg = Y_test.to_numpy().reshape((-1,5))

In [263]:
accuracy_score(Y_test, y_pred)

0.22984

In [264]:
from sklearn.metrics import ndcg_score
ndcg_score(Y_test_ndcg, y_pred_ndcg)

0.8339723724106887

С использованием TF-IDF:

In [245]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}
pipeline = Pipeline([
    ('vect', CountVectorizer(lowercase=True, token_pattern=r'\b\w+\b', stop_words={'english'})),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
clf.fit(X_train, Y_train)

In [247]:
y_pred = clf.predict(X_test)

In [248]:
y_pred_ndcg = y_pred.reshape((-1,5))

In [249]:
Y_test_ndcg = Y_test.to_numpy().reshape((-1,5))

In [250]:
accuracy_score(Y_test, y_pred)

0.23556

In [254]:
from sklearn.metrics import ndcg_score
ndcg_score(Y_test_ndcg, y_pred_ndcg)

0.8371380815822569