In [21]:
import re

import joblib
import nltk
import numpy as np
import pandas as pd
import pymorphy2
from nltk.corpus import stopwords
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tinctura/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('task/dataset/train.tsv', sep='\t')

In [4]:
stop_words = stopwords.words("russian")
stop_words.append("изза")

morph = pymorphy2.MorphAnalyzer()


def remove_punctuations(text: str) -> str:
    """Remove punctuation from given text"""
    return re.sub('\[[^]]*\]', '', text)


def remove_characters(text: str) -> str:
    """Remove characters from given text"""
    return re.sub('[^а-яА-Я]', ' ', text)


def remove_stopwords(text: str) -> str:
    """Remove stop words from given text"""
    result = []
    for token in text.split():
        if token not in stop_words:
            result.append(token)
    return " ".join(result)


def normalize_text(text: str) -> str:
    """Normalize each word in given text"""
    normalized = []
    for element in text.split():
        normalized.append(morph.parse(element)[0].normal_form)
    return " ".join(normalized)


def clear_text(text: str) -> str:
    """Cleans given text"""
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_characters(text)
    text = remove_stopwords(text)
    text = normalize_text(text)
    return text


data["title_normalized"] = data["title"].apply(clear_text)
data.head(2)

Unnamed: 0,title,is_fake,title_normalized
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1,москвич владимир клутина приша литр сч том вме...
1,Агент Кокорина назвал езду по встречке житейск...,0,агент кокорин назвать езда встречок житейский ...


In [5]:
text = data['title_normalized']
targets = data['is_fake']

In [6]:
%%time
word_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000)

word_vectorizer.fit(text)
train_word_features = word_vectorizer.transform(text)

CPU times: user 229 ms, sys: 22.2 ms, total: 251 ms
Wall time: 258 ms


In [15]:
clf = MultinomialNB(alpha=0.9)
clf.fit(train_word_features, targets)

MultinomialNB(alpha=0.9)

In [16]:
cv_score = np.mean(cross_val_score(clf, train_word_features, targets, cv=5, scoring='roc_auc'))
cv_score

0.9272261536332529

In [20]:
test_obj = "Робот Фёдор пожаловался, что никто не поздравил его с Днём космонавтики"
vectorized_test_obj = word_vectorizer.transform([test_obj])
clf.predict_proba(vectorized_test_obj)

array([[0.39465699, 0.60534301]])

In [40]:
with open('fake_news_vectorizer_dump.pkl', 'wb') as output_file:
    joblib.dump(word_vectorizer, output_file)

In [41]:
with open('fake_news_model_dump.pkl', 'wb') as output_file:
    joblib.dump(clf, output_file)