In [72]:
import langdetect
import sys
import json
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [5]:
!cat jsons/*.json | jq -c '.data | .comments[] | .commentdatajson | {mark,text,dignity,shortcomings}' | jq -s > everything.json

In [9]:
with open('everything.json') as f:
    content = json.load(f)

In [13]:
uk = []
for c in content:
    try:
        if langdetect.detect(c['text']) == 'uk':
            uk.append(c)
    except:
        pass

In [14]:
with open('uk.json', 'w') as f:
    json.dump(uk, f)

In [16]:
len(uk),len(content)

(5056, 24301)

In [74]:
df = pd.DataFrame(data=uk)
df = df[~df['text'].str.endswith('?') & ~df['mark'].isnull()]
df['text'] = df['text']+df['dignity']+df['shortcomings']
df.head()

Unnamed: 0,mark,text,dignity,shortcomings
0,0.0,"Хто вже користується, скажіть,будьласка, чи є ...",,
1,5.0,Апарат брав не тут. Смарт супер. Екран насичен...,"Батарея, екран, вигляд.","Датчик приближення, нахил селфі камери."
3,5.0,За цю вартість відмінний смартфон. Недоліків н...,"Ціна, екран, сенсор",На даний момент ще не виявлено
4,0.0,Скажіть будь ласка рамка телефону по кругу мет...,,
7,0.0,Тьфу-Тьфу-Тьфу............. .,,


In [75]:
df['mark'].value_counts()

5.0    1292
0.0    1286
4.0     415
1.0     146
3.0     143
2.0      80
Name: mark, dtype: int64

In [76]:
df['target'] = df['mark'].apply({5.0: 'pos', 4.0: 'meh', 3.0: 'meh', 2.0: 'neg', 1.0: 'neg', 0.0: 'meh'}.get)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.3, random_state=42)

In [78]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train.append(X_test))
train_features_vectorized = vectorizer.transform(X_train)
print("Total number of features: ", len(vectorizer.get_feature_names()))

Total number of features:  18593


In [79]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=42, solver="sag", multi_class="multinomial",
                         max_iter=1000, verbose=1)
lrc.fit(train_features_vectorized, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=1,
                   warm_start=False)

In [80]:
test_features_vectorized = vectorizer.transform(X_test)
test_features_vectorized

<1009x18593 sparse matrix of type '<class 'numpy.int64'>'
	with 30513 stored elements in Compressed Sparse Row format>

In [73]:
predicted = lrc.predict(test_features_vectorized)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         meh       0.28      0.21      0.24       157
         neg       0.68      0.69      0.68       470
         pos       0.60      0.65      0.62       382

    accuracy                           0.60      1009
   macro avg       0.52      0.52      0.52      1009
weighted avg       0.59      0.60      0.59      1009

