## Установка и импорт библиотек

In [None]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.3 MB/s 
[?25hCollecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 8.3 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13723 sha256=53adaf4723aa39268921b4f2ee13cc7141a98be09c7b72f91f2536dfd2c23187
  Stored in directory: /root/.cache/pip/wheels/72/b0/3f/1d95f96ff986c7dfffe46ce2be4062f38ebd04b506c77c81b9
Successfully built docopt
Installing collected p

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
from collections import Counter
import re

nltk.download('punkt')
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Сбор данных

In [84]:
import requests
from bs4 import BeautifulSoup
import time

In [85]:
url_bad = 'https://www.kinopoisk.ru/film/263531/reviews/?status=bad&ord=rating&rnd=1664480428&perpage=100'
url_good = 'https://www.kinopoisk.ru/film/263531/reviews/?status=good&ord=rating&rnd=1664480443&perpage=100'

r_bad = requests.get(url_bad)

time.sleep(60)

r_good = requests.get(url_good)

In [86]:
def get_reviews(response):
    soup = BeautifulSoup(response.text, 'lxml')
    tags = soup.find_all('div', class_='reviewItem userReview')

    assert len(tags) > 0
    reviews = []
    for tag in tags:
        reviews.append(tag.find('div', class_='brand_words').find('span', class_="_reachbanner_").text)
    return reviews

In [100]:
bad_reviews = get_reviews(r_bad)
good_reviews = get_reviews(r_good)

In [91]:
len(bad_reviews), len(good_reviews)

(87, 100)

In [101]:
with open('marvel_bad_reviews.txt', 'w') as f:
    for review in bad_reviews:
        f.write(review)

with open('marvel_good_reviews.txt', 'w') as f:
    for review in good_reviews:
        f.write(review)

## Делим на train/test подвыборки

In [92]:
from sklearn.model_selection import train_test_split

good_reviews = good_reviews[:87]

bad_reviews_train, bad_reviews_test = train_test_split(bad_reviews, test_size=0.2, random_state=42)
good_reviews_train, good_reviews_test = train_test_split(good_reviews, test_size=0.2, random_state=42)

## Основная часть

In [94]:
morph = pymorphy2.MorphAnalyzer()


def preprocess(reviews, filter=True, threshold=2):
    """
    принимает отзывы и возвращает множество начальных форм слов, 
    которые входят в эти отзывы
    """
    reviews_str = ' '.join(reviews)
    reviews_str = reviews_str.lower()

    cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    reviews_str = re.sub(cleaner, '', reviews_str)
    reviews_str = re.sub('\n', ' ', reviews_str)
    reviews_str = re.sub('\r', '', reviews_str)

    tokens = word_tokenize(reviews_str)
    tokens = [t for t in tokens if t not in russian_stopwords]
    
    lemmas = [morph.parse(t)[0].normal_form for t in tokens]
    if filter:
        counter = Counter(lemmas)
        lemmas = [l for l in lemmas if counter[l] >= threshold]
    return set(lemmas)

In [95]:
good_lemmas = preprocess(good_reviews_train)
bad_lemmas = preprocess(bad_reviews_train)

In [96]:
only_good_words = good_lemmas - bad_lemmas
only_bad_words = bad_lemmas - good_lemmas

In [97]:
def predict(review):
    """
    возвращает '+' если предсказывает, что положительный
    '-' иначе
    """
    bad_counter, good_counter = 0, 0
    lemmas = preprocess([review])
    for lemma in lemmas:
        if lemma in only_good_words:
            good_counter += 1
        elif lemma in only_bad_words:
            bad_counter += 1
    if good_counter > bad_counter:
        return '+'
    return '-'

In [98]:
def calculate_accuracy(good_test, bad_test):
    correct = 0
    for review in good_test:
        if predict(review) == '+':
            correct += 1
    for review in bad_test:
        if predict(review) == '-':
            correct += 1
    return float(correct) / (len(good_test) + len(bad_test))

In [99]:
calculate_accuracy(good_reviews_test, bad_reviews_test)

0.6666666666666666

*последний пункт: мулька номер один это добыть больше данных богу данных....*