In [190]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import random
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import nltk
import pymorphy2
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [32]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [33]:
morph = pymorphy2.MorphAnalyzer()

In [335]:
with open('data/ranking_train.jsonl', 'r') as f:
    target = [json.loads(line) for line in f]

rows = []
for d in target:
    text = d['text']
    comments = [c['text'] for c in d['comments']]
    score = [c['score'] for c in d['comments']]
    rows.append({'text': text, 'comments': comments, 'scores':score})

df = pd.DataFrame(rows)
df = df.explode('comments')
unknown = []
for d in target:
    for c in d['comments']:
        unknown.append(c['score'])
df['scores'] = unknown
df.reset_index(drop=True, inplace = True)

In [355]:
batch_1 = df[:1000]

In [262]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Комменты

In [194]:
batch_1['comments'] = batch_1['comments'].replace("[0-9!#()$\,\'\-\.*+/:;<=>?@[\]^_`{|}\"]+", ' ', regex=True)
batch_1['comments'] = batch_1['comments'].replace(r'\s+', ' ', regex=True)
batch_1['comments'] = batch_1['comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
batch_1['comments'] = batch_1['comments'].apply(lambda x: ' '.join([morph.parse(word)[0]. normal_form for word in x.split()]))

In [195]:
tokenized = batch_1['comments'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation= True)))

In [196]:
padded = np.array([i + [0]*(512-len(i)) for i in tokenized.values])

In [197]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1000, 512)

In [198]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [199]:
features = last_hidden_states[0][:,0,:].numpy()

In [356]:
features = np.load('com_features_vec.npy')

In [357]:
labels = batch_1['scores']

In [414]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, shaffle=False)

In [402]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
parameters = {'n_neighbors': np.arange(1, 200), 'weights': ['uniform', 'distance'], 
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters, cv=5)
clf.fit(train_features, train_labels)

In [403]:
clf.score(test_features, test_labels)

0.316

In [397]:
clf_ndcg = clf.predict(test_features).reshape((-1,5))

In [400]:
from sklearn.metrics import ndcg_score
ndcg_score(test_labels.to_numpy().reshape((-1,5)), clf_ndcg)

0.8680642549515073

In [415]:
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier()
tree.fit(train_features, train_labels)

In [416]:
tree.score(test_features, test_labels)

0.28

In [406]:
tree_ndcg = clf.predict(test_features).reshape((-1,5))

In [408]:
from sklearn.metrics import ndcg_score
ndcg_score(test_labels.to_numpy().reshape((-1,5)), tree_ndcg)

0.8721337176903627

In [417]:
classifier_log = LogisticRegression(max_iter=10000)

classifier_log.fit(train_features, train_labels)

In [418]:
classifier_log.score(test_features, test_labels)

0.24

In [412]:
lg_ndcg = classifier_log.predict(test_features).reshape((-1,5))

In [413]:
from sklearn.metrics import ndcg_score
ndcg_score(test_labels.to_numpy().reshape((-1,5)), lg_ndcg)

0.8378078898467355