# Часть 2. Решение задачи анализа тональности текста методами машинного обучения на основе языка программирования Python

In [1]:
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk

In [2]:
file_dir = r'C:\Users\User\!_data_science\course\school\2019_11_07\2019_11_07_intro_ml\notebooks\data\movie_reviews'

# Загрузка
movie = load_files(file_dir, shuffle=True)

In [3]:
movie.data[0][:500]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

In [4]:
# Целевой признак
movie.target_names

['neg', 'pos']

### Векторизация текста

In [5]:
cv = CountVectorizer(tokenizer=nltk.word_tokenize)

In [6]:
%%time
cv_data = cv.fit_transform(movie.data)

Wall time: 11.3 s


In [7]:
#Выдача словаря
#cv.vocabulary_

In [8]:
cv_data.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
# Выполняем TF-IDF преобразование над посчитанным количеством слов - https://ru.wikipedia.org/wiki/TF-IDF

In [10]:
ft = TfidfTransformer()

In [11]:
data_tfidf = ft.fit_transform(cv_data)

In [12]:
data_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
data_tfidf.shape

(2000, 46462)

# Построение модели

In [14]:
%%time
X_train, X_test, y_train, y_test = train_test_split(data_tfidf, movie.target, test_size = 0.20, random_state = 1)

Wall time: 0 ns


В качестве модели будем использовать [логистическую регрессию](https://ru.wikipedia.org/wiki/%D0%9B%D0%BE%D0%B3%D0%B8%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B0%D1%8F_%D1%80%D0%B5%D0%B3%D1%80%D0%B5%D1%81%D1%81%D0%B8%D1%8F)

In [15]:
lr = LogisticRegression()

In [16]:
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
y_hat = lr.predict(X_test)

In [18]:
acc = accuracy_score(y_test, y_hat)

In [19]:
acc

0.8375

# Точность предсказания составляет около 83%.