In [2]:
import json
import nltk
#nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [3]:
n = 10000 # кол-во обрабатываемых отзывов
all_texts = [] # выделяем текст отзывов
all_overalls = np.zeros((n, 1)) #выделяем оценки
# считываем данные из файла
j = 0
with open('Beauty_5.json', 'r') as f:
    for line in f:
        if j == n:
            break
        s = json.loads(line)
        all_texts.append(s["reviewText"])
        all_overalls[j] = s["overall"]
        j = j + 1

In [4]:
# бинаризуем оценки
binarized_overalls = (all_overalls > 3).astype(int) 

In [5]:
# векторизация необработанных текстов
# TF-IDF

tfidf_vectorizer = TfidfVectorizer()
untreated_words = tfidf_vectorizer.fit_transform(all_texts)

# Show as a pandas DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
df1 = pd.DataFrame(untreated_words.toarray(), columns = feature_names)

In [6]:
df1

Unnamed: 0,00,000,01,03,05,06,0639v,07,08,09,...,zits,zo,zoloft,zone,zones,zoya,zoyas,zumba,zuz,zuzu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Цели предобработки: сделать набор слов удобным для изучения, таким, чтобы машине было проще понять, как сопоставить слова и оценки. Для этого нужно выбрать не только самые важные по смыслу слова, но и представить их в такой форме, чтобы сократить расход памяти и время анализа.\
Что мы для этого можем сделать?\
Во-первых, разбить все на слова, а дальше работать с ними.\
1) привести все буквы к нижнему регистру (понятно, что регистр нам не важен)\
2) удалить знаки пунктуации (они не дают никакой информации, для нас это просто мусор)\
3) удалить числа и слова, содержащие цифры (могут относиться к чему угодно не по теме, думаю ими можно пренебречь)\
4) удалить слова с нижним подчеркиванием (их сложно свести к обычным и они встречаются редко)\
5) удалить стоп-слова\
6) удалить слова, которые встречаются слишком редко или слишком часто (первые будут только увеличивать размер таблиц, не принося особой пользы, а вторые обычно не несут смысловой нагрузки)\
7) провести стемминг (различные формы слова не так значительны как его основа). \ Сначала я хотела проводить лемматизацию, но она оставляет такие слова как "worked", "working", "works", и я посчитала, что такие формы можно игнорировать.
Пункты 1,5-7 можно объединить с tf-idf векторайзером.\
*Еще я заметила там слова с большим количеством повторяющихся букв, но не знаю что с ними сделать. По идее они тоже мешаются.

In [7]:
words = [] # массив массивов слов из каждого предложения после предобработки
for i in range(len(all_texts)):
    wordLists = re.sub(r"_", "", all_texts[i]) # удаление нижнего подчёркивания
    wordList = re.sub("[^\w]", " ",  wordLists).split() # удаление пунктуации
    # удаление слов, содержащих цифры
    for j in range(len(wordList)):
        for k in range(len(wordList[j])):
            if wordList[j][k].isdigit():
                wordList[j] = ''
                break
    for j in range(len(wordList)):
        #wordList[j] = WordNetLemmatizer().lemmatize(wordList[j], wordnet.VERB) # лемматизация
        wordList[j] = PorterStemmer().stem(wordList[j]) # стемминг
    words.append(wordList)

In [27]:
# создадим массив предложений обработанных слов
sentences = []
for i in range(len(words)):
    s = ''
    for j in range(len(words[i])):
        s = s + words[i][j] + ' '
    sentences.append(s)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 0.01, max_df = 0.99)
treated_words = tfidf_vectorizer.fit_transform(sentences)

# Show as a pandas DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
df2 = pd.DataFrame(treated_words.toarray(), columns = feature_names)
#df2["class"] = binarized_overalls

In [28]:
df2

Unnamed: 0,abl,absolut,absorb,acid,acn,actual,ad,add,addit,afford,...,wonder,work,worri,wors,worth,wouldn,wrinkl,wrong,ye,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200441,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# разобьем полученные выборки на обучающие и тестовые 
# по идее надо с помощью кросс-валидации типа KFold, но я не знаю как это сделать
# поэтому использовала просто train_test_split и обычное такое соотношение 3/7
X = df2.drop
y = binarized_overalls
from sklearn.model_selection import train_test_split
(X_train, 
 X_test, 
 y_train, y_test) = train_test_split(X, y, 
                                     test_size=0.3, 
                                     random_state=0)

# будем использовать модель логистическую регрессию
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# параметры подберем с помощью грид серча
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10]}

# cv определяет стратегию кросс-валидации, по идее это значит что тут 3 фолда
optimizer = GridSearchCV(model, param_grid, cv = 3)

optimizer.fit(X_train, y_train.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.05, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5,
                               10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
#средняя точность
print(optimizer.best_score_)

0.8294286868989609


In [21]:
# проверка на тесте
model = optimizer.best_estimator_
prediction = model.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, prediction)
print(score)

0.8476666666666667


In [22]:
# теперь сделаем то же самое для необработанных слов
X = df1
y = binarized_overalls
from sklearn.model_selection import train_test_split
(X_train, 
 X_test, 
 y_train, y_test) = train_test_split(X, y, 
                                     test_size=0.3, 
                                     random_state=0)

# будем использовать модель логистическую регрессию
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# параметры подберем с помощью грид серча
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10]}

# cv определяет стратегию кросс-валидации, по идее это значит что тут 3 фолда
optimizer = GridSearchCV(model, param_grid, cv = 3)

optimizer.fit(X_train, y_train.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.05, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5,
                               10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [23]:
# средняя точность
print(optimizer.best_score_)

0.8427156872575626


In [24]:
# проверка на тесте
model = optimizer.best_estimator_
prediction = model.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, prediction)
print(score)

0.8623333333333333
