### Импортируем необходимые инструменты

In [2]:
import json
import nltk
#nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

### Считываем данные

In [3]:
n = 10000  # maximum number of reviews to process
all_texts = []  # review texts, one string per review
overall_values = []  # ratings, in the same order as all_texts

# Each line of the file is parsed as a separate JSON object.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open('Beauty_5.json', 'r', encoding='utf-8') as f:
    for index, line in enumerate(f):
        if index == n:
            break
        record = json.loads(line)
        all_texts.append(record["reviewText"])
        overall_values.append(record["overall"])

# Column vector with one row per review actually read. Building it from the
# collected values (instead of pre-allocating n zeros) avoids zero-filled
# ratings when the file holds fewer than n reviews.
all_overalls = np.array(overall_values, dtype=float).reshape(-1, 1)

In [4]:
# Binarize the ratings: 1 where the rating is above 3, otherwise 0.
binarized_overalls = np.greater(all_overalls, 3).astype(int)

### Векторизация необработанных текстов

In [5]:
# Vectorize the unprocessed review texts with TF-IDF (default settings).
tfidf_vectorizer = TfidfVectorizer()
untreated_words = tfidf_vectorizer.fit_transform(all_texts)

# Show as a pandas DataFrame.
# get_feature_names() is gone from current scikit-learn releases; use its
# replacement when available and fall back to the old name otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
df1 = pd.DataFrame(untreated_words.toarray(), columns=feature_names)

In [6]:
# Display the TF-IDF table built from the unprocessed texts.
df1

Unnamed: 0,00,000,01,03,05,06,0639v,07,08,09,...,zits,zo,zoloft,zone,zones,zoya,zoyas,zumba,zuz,zuzu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Предобработка текстов

Цели предобработки: сделать набор слов удобным для изучения, таким, чтобы машине было проще понять, как сопоставить слова и оценки. Для этого нужно выбрать не только самые важные по смыслу слова, но и представить их в такой форме, чтобы сократить расход памяти и время анализа.\
Что мы для этого можем сделать?\
Во-первых, разбить все на слова, а дальше работать с ними.\
1) привести все буквы к нижнему регистру (понятно, что регистр нам не важен)\
2) удалить знаки пунктуации (они не дают никакой информации, для нас это просто мусор)\
3) удалить числа и слова, содержащие цифры (могут относиться к чему угодно не по теме, думаю ими можно пренебречь)\
4) удалить слова с нижним подчеркиванием (их сложно свести к обычным и они встречаются редко)\
5) удалить стоп-слова\
6) удалить слова, которые встречаются слишком редко или слишком часто (первые будут только увеличивать размер таблиц, не принося особой пользы, а вторые обычно не несут смысловой нагрузки)\
7) провести стемминг (различные формы слова не так значимы, как его основа). Сначала я хотела проводить лемматизацию, но она оставляет такие слова, как "worked", "working", "works", и я посчитала, что такие формы можно игнорировать.\
Пункты 1,5-7 можно объединить с tf-idf векторайзером.\
\* Ещё я заметила слова с большим количеством повторяющихся букв, но не знаю, что с ними сделать. По идее, они тоже мешают.

In [7]:
# A token is a maximal run of word characters. Underscores are stripped from
# the text first, so they never appear inside a token.
token_pattern = re.compile(r"\w+")
# One shared stemmer instead of constructing a new one for every word.
stemmer = PorterStemmer()

words = []  # one list of stemmed tokens per review, in the order of all_texts
for text in all_texts:
    tokens = token_pattern.findall(text.replace("_", ""))
    # Tokens containing any digit are dropped outright rather than being
    # replaced by empty strings; the rest are stemmed.
    words.append([
        stemmer.stem(token)
        for token in tokens
        if not any(ch.isdigit() for ch in token)
    ])

### Векторизация обработанных текстов

In [8]:
# Rebuild one space-separated string per review from the processed tokens.
sentences = [' '.join(tokens) for tokens in words]

# TF-IDF with English stop words removed; terms that occur in fewer than 1%
# or in more than 99% of the documents are dropped.
# NOTE(review): the vectorizer is fitted on all reviews, including the rows
# that are later sliced off as the test set.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01, max_df=0.99)
treated_words = tfidf_vectorizer.fit_transform(sentences)

# Show as a pandas DataFrame.
# get_feature_names() is gone from current scikit-learn releases; use its
# replacement when available and fall back to the old name otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
df2 = pd.DataFrame(treated_words.toarray(), columns=feature_names)

In [9]:
# Display the TF-IDF table built from the processed texts.
df2

Unnamed: 0,abl,absolut,absorb,acid,acn,actual,ad,add,addit,afford,...,wonder,work,worri,wors,worth,wouldn,wrinkl,wrong,ye,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200441,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Разбиение на выборки

Разобьем полученные выборки на обучающие и тестовые в соотношении 7:3.
Затем обе обучающие (для обработанных и для необработанных текстов) разобьем на 5 примерно равных частей (фолдов). Далее будем проводить 5-fold кросс-валидацию.

In [10]:
from sklearn.model_selection import KFold

# 7:3 train/test split by row position. The boundary is derived from the
# number of rows (integer arithmetic) instead of a hard-coded 7000, so it
# stays correct if a different number of reviews is loaded.
n_train = untreated_words.shape[0] * 7 // 10

# unprocessed texts
untreated_words_train = untreated_words[:n_train]
untreated_words_test = untreated_words[n_train:]

# processed texts
treated_words_train = treated_words[:n_train]
treated_words_test = treated_words[n_train:]

# labels
overalls_train = binarized_overalls[:n_train]
overalls_test = binarized_overalls[n_train:]

# 5-fold cross-validation on the training part. No shuffling is requested,
# so rows keep their original order when assigned to folds.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(untreated_words_train):
    print("TRAIN_1:", train_index, "TEST_1:", test_index)

for train_index, test_index in kf.split(treated_words_train):
    print("TRAIN_2:", train_index, "TEST_2:", test_index)

TRAIN_1: [1400 1401 1402 ... 6997 6998 6999] TEST_1: [   0    1    2 ... 1397 1398 1399]
TRAIN_1: [   0    1    2 ... 6997 6998 6999] TEST_1: [1400 1401 1402 ... 2797 2798 2799]
TRAIN_1: [   0    1    2 ... 6997 6998 6999] TEST_1: [2800 2801 2802 ... 4197 4198 4199]
TRAIN_1: [   0    1    2 ... 6997 6998 6999] TEST_1: [4200 4201 4202 ... 5597 5598 5599]
TRAIN_1: [   0    1    2 ... 5597 5598 5599] TEST_1: [5600 5601 5602 ... 6997 6998 6999]
TRAIN_2: [1400 1401 1402 ... 6997 6998 6999] TEST_2: [   0    1    2 ... 1397 1398 1399]
TRAIN_2: [   0    1    2 ... 6997 6998 6999] TEST_2: [1400 1401 1402 ... 2797 2798 2799]
TRAIN_2: [   0    1    2 ... 6997 6998 6999] TEST_2: [2800 2801 2802 ... 4197 4198 4199]
TRAIN_2: [   0    1    2 ... 6997 6998 6999] TEST_2: [4200 4201 4202 ... 5597 5598 5599]
TRAIN_2: [   0    1    2 ... 5597 5598 5599] TEST_2: [5600 5601 5602 ... 6997 6998 6999]


### Описание моделей

Создадим набор параметров моделей, которые мы будем тестировать. Нужные параметры: \
1. Penalty: none/l2 - регуляризация
2. C: 0.1, 0.5, 1, 1.5, 2, 5 - коэффициент регуляризации (в sklearn это величина, обратная силе регуляризации: чем меньше C, тем сильнее регуляризация)
3. Class_weight: none/balanced - веса классов
4. Solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’} - оптимизатор

А также создадим два массива для оценок с кросс-валидации - scores и scores2 (68 * 7) - первые 5 столбцов - сами оценки, 6-й - мат. ожидание, 7-й - дисперсия.

Как получается 68: мы перемножаем количества значений всех параметров, 2*6*2*5=120, и вычитаем варианты, где для регуляризации 'none' перебираются разные коэффициенты, — это элементы с индексами с 10 по 59 (50 штук). При выполнении программы выяснилось, что 'liblinear' не сочетается с Penalty 'none', поэтому убираем и эти сочетания (ещё 2). Остаётся 68.

In [33]:
import itertools

pd.options.display.max_rows = 120

# Candidate values, in order: penalty, C, class_weight, solver.
# NOTE(review): recent scikit-learn releases expect penalty=None instead of
# the string 'none' - confirm against the installed version before upgrading.
p = [
    ['none', 'l2'],
    [0.1, 0.5, 1, 1.5, 2, 5],
    [None, 'balanced'],  # None means "no class weighting"
    ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
]

# Keep every 'l2' combination. For penalty 'none' keep a single C value (the
# first one, so each unpenalized setup appears once) and skip 'liblinear',
# which the notebook found incompatible with penalty 'none'.
# The filter is explicit instead of popping by position; the order of the
# remaining 68 combinations is that of itertools.product.
params = [
    combo
    for combo in itertools.product(*p)
    if combo[0] == 'l2' or (combo[1] == p[1][0] and combo[3] != 'liblinear')
]

model_params = pd.DataFrame(params, columns=('Penalty', 'C', 'Class_weight', 'Solver'))
model_params

Unnamed: 0,Penalty,C,Class_weight,Solver
0,none,0.1,,newton-cg
1,none,0.1,,lbfgs
2,none,0.1,,sag
3,none,0.1,,saga
4,none,0.1,balanced,newton-cg
5,none,0.1,balanced,lbfgs
6,none,0.1,balanced,sag
7,none,0.1,balanced,saga
8,l2,0.1,,newton-cg
9,l2,0.1,,lbfgs


### Обучение моделей на необработанных текстах

In [35]:
import warnings
# NOTE: this silences every warning raised from here on.
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

l = len(params)
# One row per configuration: five fold accuracies, then their mean and variance.
s = np.zeros((l, 7))

# The labels are stored as a column vector; flatten them once so the
# estimator receives 1-D targets.
y_all = overalls_train.ravel()

for i, (penalty, c_value, class_weight, solver) in enumerate(params):
    model = LogisticRegression(penalty=penalty,
                               C=c_value,
                               # an empty string in the grid stands for
                               # "no class weighting"; pass None instead
                               class_weight=class_weight or None,
                               solver=solver)

    for k, (train_index, test_index) in enumerate(kf.split(untreated_words_train)):
        x_train, x_test = untreated_words_train[train_index], untreated_words_train[test_index]
        y_train, y_test = y_all[train_index], y_all[test_index]

        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        s[i][k] = accuracy_score(y_test, prediction)
    s[i][5] = np.mean(s[i][:5])  # mean accuracy over the folds
    s[i][6] = np.var(s[i][:5])  # variance of the accuracy over the folds
    
    

Unnamed: 0,s1,s2,s3,s4,s5,Es,Ds
0,0.827857,0.795714,0.832143,0.818571,0.810714,0.817,0.000168
1,0.833571,0.8,0.84,0.816429,0.825,0.823,0.000195
2,0.837857,0.815714,0.842143,0.834286,0.826429,0.831286,8.7e-05
3,0.841429,0.815714,0.842857,0.836429,0.832143,0.833714,9.5e-05
4,0.827857,0.792857,0.827857,0.813571,0.81,0.814429,0.000169
5,0.835,0.8,0.837857,0.835,0.825714,0.826714,0.000195
6,0.835714,0.81,0.83,0.822857,0.819286,0.823571,7.8e-05
7,0.836429,0.817857,0.829286,0.828571,0.819286,0.826286,4.7e-05
8,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884
9,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884


In [38]:
# Allow up to 70 rows to be displayed.
pd.options.display.max_rows = 70
# Fold accuracies (s1-s5) with their mean (Es) and variance (Ds),
# one row per parameter combination.
scores = pd.DataFrame(data=s, columns=['s1', 's2', 's3', 's4', 's5', 'Es', 'Ds'])
scores.sort_values('Es')

Unnamed: 0,s1,s2,s3,s4,s5,Es,Ds
8,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884
9,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884
11,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884
12,0.77,0.716429,0.789286,0.800714,0.786429,0.772571,0.000884
10,0.77,0.716429,0.789286,0.800714,0.787143,0.772714,0.000888
17,0.788571,0.782857,0.807143,0.785714,0.810714,0.795,0.000134
14,0.789286,0.782857,0.806429,0.786429,0.81,0.795,0.000122
13,0.789286,0.782857,0.806429,0.786429,0.81,0.795,0.000122
16,0.789286,0.782857,0.806429,0.786429,0.81,0.795,0.000122
15,0.787857,0.784286,0.807143,0.786429,0.81,0.795143,0.000122


###  Выбор модели для необработанных текстов
Самый большой показатель в среднем у модели 62, и из топ-5 отсортированных по среднему у нее наименьшая дисперсия - выбираем ее.

In [37]:
# Index (into params) of the configuration chosen for the unprocessed texts.
number = 62
penalty, c_value, class_weight, solver = params[number]
model = LogisticRegression(penalty=penalty,
                           C=c_value,
                           # an empty string in the grid stands for
                           # "no class weighting"; pass None instead
                           class_weight=class_weight or None,
                           solver=solver)
# Train on the whole training part and evaluate once on the held-out part.
# Labels are flattened because they are stored as column vectors.
model.fit(untreated_words_train, overalls_train.ravel())
prediction = model.predict(untreated_words_test)
score = accuracy_score(overalls_test.ravel(), prediction)
print(score)

0.845


### Небольшой анализ моделей
1) Можно заметить, что худший результат дали модели, у которых коэффициент C равен 0.1–0.5. Делаем вывод, что этот коэффициент должен быть больше. \
2) В целом модели без регуляризации работают хуже, и чем больше коэффициент C (в sklearn это величина, обратная силе регуляризации), тем лучше модель справляется. \
3) Значение параметра class weight = balanced ухудшает результат

### Обучение моделей на обработанных текстах

In [43]:
# One row per configuration: five fold accuracies, then their mean and variance.
s2 = np.zeros((len(params), 7))

# The labels are stored as a column vector; flatten them once so the
# estimator receives 1-D targets.
y_all = overalls_train.ravel()

for i, (penalty, c_value, class_weight, solver) in enumerate(params):
    model = LogisticRegression(penalty=penalty,
                               C=c_value,
                               # an empty string in the grid stands for
                               # "no class weighting"; pass None instead
                               class_weight=class_weight or None,
                               solver=solver)

    for k, (train_index, test_index) in enumerate(kf.split(treated_words_train)):
        x_train, x_test = treated_words_train[train_index], treated_words_train[test_index]
        y_train, y_test = y_all[train_index], y_all[test_index]

        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        s2[i][k] = accuracy_score(y_test, prediction)
    # The statistics must come from s2 itself; they were previously taken
    # from s, the scores of the models trained on unprocessed texts.
    s2[i][5] = np.mean(s2[i][:5])  # mean accuracy over the folds
    s2[i][6] = np.var(s2[i][:5])  # variance of the accuracy over the folds
        

In [44]:
# Fold accuracies (s1-s5) with their mean (Es) and variance (Ds) for the
# models trained on processed texts, one row per parameter combination.
scores2 = pd.DataFrame(data=s2, columns=['s1', 's2', 's3', 's4', 's5', 'Es', 'Ds'])
scores2.sort_values('Es')

Unnamed: 0,s1,s2,s3,s4,s5,Es,Ds
15,0.787143,0.765714,0.779286,0.755714,0.779286,0.773429,0.000126
16,0.787143,0.765714,0.779286,0.755714,0.78,0.773571,0.000128
14,0.787143,0.765714,0.779286,0.755714,0.78,0.773571,0.000128
13,0.787143,0.765714,0.779286,0.755714,0.78,0.773571,0.000128
17,0.787857,0.765714,0.779286,0.755714,0.779286,0.773714,0.000132
4,0.789286,0.773571,0.778571,0.767143,0.764286,0.774571,7.9e-05
6,0.789286,0.773571,0.778571,0.767143,0.764286,0.774571,7.9e-05
7,0.79,0.773571,0.778571,0.767857,0.764286,0.774714,7.7e-05
5,0.79,0.772857,0.778571,0.767857,0.765,0.774857,7.9e-05
12,0.776429,0.717143,0.790714,0.804286,0.792857,0.776286,0.000953


#####  Выбор модели для обработанных текстов
Самый большой показатель в среднем у моделей 58, 59, 61, и из топ-5 отсортированных по среднему у 59 наименьшая дисперсия - выбираем ее.

In [45]:
# Index (into params) of the configuration chosen for the processed texts.
number2 = 59
penalty, c_value, class_weight, solver = params[number2]
model = LogisticRegression(penalty=penalty,
                           C=c_value,
                           # an empty string in the grid stands for
                           # "no class weighting"; pass None instead
                           class_weight=class_weight or None,
                           solver=solver)
# Train on the whole training part and evaluate once on the held-out part.
# Labels are flattened because they are stored as column vectors.
model.fit(treated_words_train, overalls_train.ravel())
prediction = model.predict(treated_words_test)
score = accuracy_score(overalls_test.ravel(), prediction)
print(score)

0.8236666666666667


### Выводы
1) Предобработка данных ухудшила результат
2) Для обработанных текстов нельзя однозначно сказать, что чем больше коэффициент регуляризации, тем лучше результат, но в целом с регуляризацией лучше, чем без нее.