In [20]:
import torch
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel, BertForPreTraining, BertForMaskedLM
import torch.nn as nn
from tqdm import tqdm, tqdm_notebook
import os
import random
import re

RUBERT_PATH = '/Users/ts/Downloads/ru_conversational_cased_L-12_H-768_A-12_pt'
modelpath = os.path.join(RUBERT_PATH,'pytorch_model.bin')

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [21]:
#!pip install transformers #huggingface

In [22]:
os.path.isfile(os.path.join(RUBERT_PATH,'pytorch_model.bin'))

True

In [23]:
tokenizer = BertTokenizer.from_pretrained(RUBERT_PATH, do_lower_case=False)
config = BertConfig.from_json_file(os.path.join(RUBERT_PATH,'bert_config.json'))
model = BertForPreTraining.from_pretrained(modelpath, config=config)
model.eval()
from torch import load
di = load(modelpath)

In [24]:
def hw_bert(sentence): 
    tokenized_text = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    segments_ids[0] = 0
    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
 
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)
    array_of_thensors = []

    for i in range(len(tokenized_text)):
        array_of_thensors.append((predictions[0][0][i].detach().numpy()))

    mean = np.mean(array_of_thensors, axis = 0)

    return mean

In [25]:
texts_df = pd.read_csv('texts_train.txt', sep="\t", header=None)
texts_df.columns = ["text"]
texts_df.head()

Unnamed: 0,text
0,"Сериал очень люблю, но Академия и Земля вызыва..."
1,"думал, что будет лучше идея очень интересна - ..."
2,с творчеством Головачева я познакомился посред...
3,"то-то я и в большое неудовольствие прочитал ""А..."
4,как мне показалось местами сильно смахивает на...


In [26]:
hw_bert(texts_df['text'][0])

array([-7.7523403, -7.0822864, -8.46571  , ..., -9.678036 , -9.688264 ,
       -9.683479 ], dtype=float32)

In [27]:
#scores data
if (os.path.isfile('collected_data.csv')):
    scores_df = pd.read_csv('collected_data.csv', dtype='float64')
else:
    scores_df = pd.read_csv('scores_train.txt', sep="\t", header=None, dtype='float64')
    scores_df.columns = ["tonality"]
    vector_means = [get_means(sentence) for sentence in texts_df["text"].tolist()]
    scores_df['vector_means'] = vector_means

scores_df.head()

Unnamed: 0,tonality,vector_means
0,6.0,-290.200745
1,7.0,-183.830475
2,10.0,-440.676788
3,5.0,-231.999832
4,6.0,-362.513824


In [28]:
scores_df.to_csv('collected_data.csv', index = False, header=True)

In [29]:
tone_levels = np.array(scores_df['tonality'])
features = np.array(scores_df['vector_means']).reshape(-1, 1)

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(features, tone_levels, test_size = 0.2, random_state = random.seed(SEED))

In [31]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [65]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test .shape)

Training Features Shape: (16000, 1)
Training Labels Shape: (16000,)
Testing Features Shape: (4000, 1)
Testing Labels Shape: (4000,)


In [43]:
def get_mae(pred, test):
    errors = np.mean(abs(y_test - y_pred))
    print('Mean Absolute Error:', errors)
    return errors

In [44]:
errors = get_mae(y_pred, y_test)

Mean Absolute Error: 2.29325


In [61]:
def get_accuracy(maerrs, test_data):
    mape = 100 * (maerrs / test_data)
    accuracy = 100 - np.mean(mape)
    print('Accuracy score - ', round(accuracy, 2), '%.')

In [62]:
print('##############Random Forest#################')
get_accuracy(errors, y_test)

##############Random Forest#################
Accuracy score -  63.94 %.


In [66]:
from sklearn import metrics
y_pred = [int(item) for item in y_pred]
print('classification_report: ')
print(metrics.classification_report(y_test, y_pred))

classification_report: 
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        80
         2.0       0.00      0.00      0.00        52
         3.0       0.00      0.00      0.00       117
         4.0       0.00      0.00      0.00       162
         5.0       0.00      0.00      0.00       271
         6.0       0.05      0.05      0.05       227
         7.0       0.08      0.46      0.14       334
         8.0       0.16      0.45      0.24       680
         9.0       0.00      0.00      0.00       967
        10.0       0.00      0.00      0.00      1110

    accuracy                           0.12      4000
   macro avg       0.03      0.10      0.04      4000
weighted avg       0.04      0.12      0.06      4000

