# Разработка классификатора новостей

**Нужно:**
* выбрать какой-либо новостной ресурс, где к новостям привязаны категории или метки (например http://lenta.ru, http://fontanka.ru, http://gazeta.ru)
* загрузить новости по некоторому набору (5-10) категорий за пару лет
* обучить классификатор на эти новостях
* продемонстрировать его работу, разработав простеший web-интерфейс (вариант - telegram-бот), куда пользователь вводит текст новости и на выходе получает наиболее вероятную категорию. В качестве фреймворка проще всего взять [Flask](http://flask.pocoo.org) (см. примеры) .

In [319]:
# Экземпляр классификатора в пикле, получение в последней ячейке

import lxml.html
import lxml.etree
import requests
import pymorphy2
import re
import math
import operator
import numpy as np
import pickle
from tqdm import tqdm_notebook

In [25]:
# По календарю получаю список URL ведущих на новости этого дня

def parse_category_by_url(url):
    response = requests.get(url)
    if (response.status_code == 200):
        res = []
        tree = lxml.html.fromstring(response.text)
        links = tree.xpath('//div[contains(@class, "big_center ml30 fll")]//td[contains(@class, "dom")]')
        for link in links:
            try:
                formed_link = form_url_by_suburl(link.getchildren()[0].get("href"))                
                res.append(formed_link)
            except:
                pass
        return res
        
    return []

In [26]:
# По странице с новостями дня получаю список URL ведущих на новость

def parse_news_on_page(url):
    response = requests.get(url)
    if (response.status_code == 200):
        res = []
        tree = lxml.html.fromstring(response.text)
        links = tree.xpath('//div[contains(@class, "calendar-list")]//div[contains(@class, "calendar-item-title")]')
        for link in links:
            try:
                formed_link = form_url_by_suburl(link.getchildren()[0].get("href"))
                if len(formed_link)>0:
                    res.append(formed_link)
            except:
                pass
        return res
    return []

In [27]:
# Из относительных ссылок формирую абсолютную

def form_url_by_suburl(suburl):
    if suburl[0] == '/':
        return 'http://www.fontanka.ru' + suburl
    return ''

In [28]:
# Получаю текст и заголовок новости

def get_content(url):
    response = requests.get(url)
    if (response.status_code == 200):        
        analyzer = pymorphy2.MorphAnalyzer()
        #words_from_query = {analyzer.normal_forms(x)[0] for x in query.split(' ')}
        tree = lxml.html.fromstring(response.text)
        text = tree.xpath('//div[contains(@class, "article_fulltext")]')
        arttitle = tree.xpath('//h1[contains(@class, "article_title")]')
        try:
            title = arttitle[0].text_content()
            #print(text[0].text_content())
            res = ''.join(text[0].text_content().replace('\t', ' ').replace('\n', ' ').splitlines())
        except:
            title = ''
            res = ['']
    return (title, res)

In [37]:
#Обработка и сохранение новости в файл

def save_cat_in_file(url, filename):
    day_links = parse_category_by_url(url)
    f = open(filename, 'w')
    for day_link in tqdm(day_links):
        news_links = parse_news_on_page(day_link)
        for news_link in news_links:
            (title, text) = get_content(news_link)
            try:
                f.write(f"{title}\t{text}\n")
            except:
                pass
    f.close()

In [90]:
# Нормализуем все слова в файлах

def normilize_files():
    files = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
    analyzer = pymorphy2.MorphAnalyzer()
    for file in tqdm_notebook(files):
        with open('categories/' + file + '.txt', 'r') as f:
            file_list = []
            for line in tqdm_notebook(f.readlines()):
                line_list = [analyzer.normal_forms(x)[0] for x in line.split(' ')]                
                file_list.append(' '.join(line_list))
        with open('categories/' + file + '_normal.txt', 'w') as f:
            f.writelines(file_list)
            
normilize_files()

A Jupyter Widget

A Jupyter Widget




Exception in thread Thread-16:
Traceback (most recent call last):
  File "C:\Users\Veotani\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\Veotani\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\Veotani\Anaconda3\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget




In [296]:
#убираем знаки пунктуации

def delete_all_punct():
    files = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
    analyzer = pymorphy2.MorphAnalyzer()
    for file in tqdm_notebook(files):
        with open('categories/' + file + '_normal.txt', 'r') as f:
            lines = []
            for line in f.readlines():
                lines.append(re.sub(pattern='[^a-zA-Z]', string=line.replace(',', ' ').replace('.', ' ').replace('-', ' '), repl=' '))
        with open('categories/' + file + '_normal_nopun.txt', 'w') as f:
            f.writelines(lines)
delete_all_punct()

A Jupyter Widget




In [298]:
def get_rid_of_stop_words(dict_words):
    with open('stopwords.txt', 'r') as f:
        for line in f.readlines():
            if line in dict_words:
                del dict_words[line]
    return dict_words

In [297]:
# Get all the words in our collection

def get_all_words(cat='notcat'):
    res = dict()
    files = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
    if (cat in files):
        with open('categories/' + cat + '_normal_nopun.txt', 'r') as f:
            for line in f.readlines():
                for word in line.split(' '):
                    if word not in res:
                        res[word] = 1
                    else:
                        res[word] = res[word] + 1
    else:
        for file in files:
            with open('categories/' + file + '_normal_nopun.txt', 'r') as f:
                for line in f.readlines():
                    for word in line.split(' '):
                        if word not in res:
                            res[word] = 1
                        else:
                            res[word] = res[word] + 1
    return res

In [299]:
def count_docs_in_cat(category):
    f = open('categories/' + category + '_normal.txt', 'r')
    return len(f.readlines())

In [300]:
def count_all_docs():
    files = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
    res = 0
    for file in files:
        res = res + count_docs_in_cat(file)
    return res

In [301]:
def count_uniq_words():
    return len(get_all_words())

In [302]:
def count_words_in_cat(cat):
    res = 0
    with open('categories/' + cat + '_normal.txt', 'r') as f:
        for line in f.readlines():
             res = res + len(line.split(' '))
    return res

In [303]:
def get_parameters():
    d = dict()
    categories = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
    for cat in categories:
        d[cat] = (count_docs_in_cat(cat), count_words_in_cat(cat))
    return d

In [304]:
class Clasterizator:
    def __init__(self):
        self.count_docs = count_all_docs()
        self.uniq = count_uniq_words()
        self.params = get_parameters()
        self.categories = [
            'buisness',
            'politic',
            'realty',
            'sport',
            'technology',
            'tourism',
            'video'
        ]
        self.tf_cat = self.init_tf_cat()
    def get_cat_of_doc(self, doc):
        d = dict()
        analyzer = pymorphy2.MorphAnalyzer()
        for cat in self.categories:
            words_normilized = [analyzer.normal_forms(x) for x in doc.split(' ')]
            tmp = 0
            for word in words_normilized:
                try:
                    tf_in_cat = self.tf_cat[cat][word]
                except:
                    tf_in_cat = 0
                tmp = tmp + math.log((tf_in_cat + 1)/(self.uniq + self.params[cat][1]))
            d[cat] = math.log(self.params[cat][0]/self.count_docs) + tmp
        print(d)
        return max(d, key=d.get)

        
    def init_tf_cat(self):
        res = dict()
        for cat in self.categories:
            res[cat] = get_all_words(cat)
        return res

In [305]:
cstr = Clasterizator()

In [311]:
cstr.get_cat_of_doc('судьба')

{'buisness': -15.054910981351558, 'politic': -14.97266037465744, 'realty': -15.134650547845972, 'sport': -14.952076946263457, 'technology': -15.16423472806951, 'tourism': -15.136207182692202, 'video': -14.750921110238195}


'video'

In [320]:
with open('cstr.pickle', 'wb') as f:
    pickle.dump(cstr, f)

In [324]:
import telebot

with open('cstr.pickle', 'rb') as f:
     cstr = pickle.load(f)
#token = токен
bot = telebot.TeleBot(token)
cstr = Clasterizator()

@bot.message_handler(content_types=["text"])
def repeat_all_messages(message): # Название функции не играет никакой роли, в принципе
    bot.send_message(message.chat.id, cstr.get_cat_of_doc(message.text))
    print(message.text)

if __name__ == '__main__':
     bot.polling(none_stop=True)

KeyboardInterrupt: 