# Разработка классификатора новостей

**Нужно:**
* выбрать какой-либо новостной ресурс, где к новостям привязаны категории или метки (например http://lenta.ru, http://fontanka.ru, http://gazeta.ru)
* загрузить новости по некоторому набору (5-10) категорий за пару лет
* обучить классификатор на этих новостях
* продемонстрировать его работу, разработав простейший web-интерфейс (вариант - telegram-бот), куда пользователь вводит текст новости и на выходе получает наиболее вероятную категорию. В качестве фреймворка проще всего взять [Flask](http://flask.pocoo.org) (см. примеры).

In [6]:
# -*- coding: utf-8 -*-

import lxml.html
import lxml.etree
import requests

import pymorphy2
import re
import math
import operator
import numpy as np
import pickle
from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split



# Парсинг
** Выбран ресурс fontanka. Берутся новости за 2 года (не больше 2000 новостей на одну тему). **

In [89]:
# По календарю получаю список URL ведущих на новости этого дня

def parse_category_by_url(url):
    """Collect absolute URLs of the per-day pages linked from a calendar page.

    Downloads ``url`` and, for every calendar cell matched by the XPath
    below, takes the ``href`` of the cell's first child element and converts
    it with ``form_url_by_suburl``.

    Args:
        url: Address of the calendar page to download.

    Returns:
        A list of absolute URLs. The list is empty when the response status
        is not 200 or when no usable link was found.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return []

    tree = lxml.html.fromstring(response.text)
    cells = tree.xpath('//div[contains(@class, "big_center ml30 fll")]//td[contains(@class, "dom")]')
    res = []
    for cell in cells:
        try:
            formed_link = form_url_by_suburl(cell.getchildren()[0].get("href"))
        except (IndexError, TypeError, AttributeError):
            # The cell has no child element, or the child carries no usable
            # href (missing or empty) -- nothing to follow for this day.
            continue
        # form_url_by_suburl returns '' for links that are not site-relative;
        # skip them (as parse_news_on_page does) so callers never request ''.
        if formed_link:
            res.append(formed_link)
    return res

In [90]:
# По странице с новостями дня получаю список URL ведущих на новость

def parse_news_on_page(url):
    """Collect absolute URLs of the news articles listed on a day page.

    Downloads ``url`` and, for every item title matched by the XPath below,
    takes the ``href`` of the title's first child element and converts it
    with ``form_url_by_suburl``.

    Args:
        url: Address of the page that lists the news of one day.

    Returns:
        A list of non-empty absolute URLs. The list is empty when the
        response status is not 200 or when no usable link was found.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return []

    tree = lxml.html.fromstring(response.text)
    titles = tree.xpath('//div[contains(@class, "calendar-list")]//div[contains(@class, "calendar-item-title")]')
    res = []
    for title_node in titles:
        try:
            formed_link = form_url_by_suburl(title_node.getchildren()[0].get("href"))
        except (IndexError, TypeError, AttributeError):
            # The title has no child element, or the child carries no usable
            # href (missing or empty) -- skip this item.
            continue
        # '' means the link was not site-relative; it is not followed.
        if formed_link:
            res.append(formed_link)
    return res

In [91]:
# Из относительных ссылок формирую абсолютную

def form_url_by_suburl(suburl):
    """Turn a site-relative link into an absolute fontanka.ru URL.

    Args:
        suburl: Link text; its first character decides the result.

    Returns:
        ``'http://www.fontanka.ru' + suburl`` when ``suburl`` starts with
        ``'/'``, otherwise an empty string.

    Raises:
        IndexError: If ``suburl`` is an empty string.
        TypeError: If ``suburl`` is ``None``.
    """
    # Indexing (not startswith) is deliberate: empty or missing links raise.
    is_site_relative = suburl[0] == '/'
    return 'http://www.fontanka.ru' + suburl if is_site_relative else ''

In [102]:
# Получаю текст и заголовок новости

def get_content(url):
    """Download a news article and return its title and body text.

    The body text has tabs and newlines replaced by spaces and any remaining
    line-boundary characters removed, so it fits on one line.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A ``(title, text)`` tuple of strings. Both elements are empty strings
        when the response status is not 200 or when the page lacks the
        expected title or body element.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return ('', '')

    tree = lxml.html.fromstring(response.text)
    body_nodes = tree.xpath('//div[contains(@class, "article_fulltext")]')
    title_nodes = tree.xpath('//h1[contains(@class, "article_title")]')
    if not body_nodes or not title_nodes:
        # Not an article page in the expected layout: report "no content"
        # with the same types as a successful result.
        return ('', '')

    title = title_nodes[0].text_content()
    raw_text = body_nodes[0].text_content()
    text = ''.join(raw_text.replace('\t', ' ').replace('\n', ' ').splitlines())
    return (title, text)

In [93]:
# Обработка и сохранение новости в файл

def save_cat_in_file(url, filename):
    """Download every article reachable from a calendar page into a file.

    Each article is written as one ``title<TAB>text`` line. An existing
    ``filename`` is overwritten.

    Args:
        url: Calendar page whose day links are crawled.
        filename: Path of the output file.
    """
    day_links = parse_category_by_url(url)
    # The context manager closes the file even if a download raises midway.
    with open(filename, 'w') as f:
        # tqdm_notebook is the progress bar this file actually imports.
        for day_link in tqdm_notebook(day_links):
            for news_link in parse_news_on_page(day_link):
                title, text = get_content(news_link)
                try:
                    f.write(f"{title}\t{text}\n")
                except UnicodeError:
                    # Best effort: skip an article the file's encoding
                    # cannot represent instead of aborting the crawl.
                    continue

In [104]:
def _iter_news_links(year_options, max_years=2):
    """Lazily yield article URLs for the first ``max_years`` year options."""
    for year in year_options[:max_years]:
        days_links = parse_category_by_url(form_url_by_suburl(year.get("value")))
        for day_link in days_links:
            for new_link in parse_news_on_page(day_link):
                yield new_link


def get_big_data(already_processed=()):
    """Crawl the fontanka.ru archive and dump each category into ``<category>.txt``.

    The category list is read from the category selector of the 2015 archive
    page. For every category the first two entries of its year selector are
    crawled and each article is written as one ``title text`` line. Writing
    for a category stops once more than 2500 documents have been written.
    An existing ``<category>.txt`` is overwritten.

    Args:
        already_processed: Category names to skip, e.g. categories that were
            fully downloaded before an earlier run crashed. Defaults to
            skipping nothing (besides the aggregate ``fontanka`` category).
    """
    date_cat = re.compile(r'/(\w*)/')
    start_url = "http://www.fontanka.ru/fontanka/arc/2015/all.html"
    response = requests.get(start_url)
    if response.status_code != 200:
        return

    tree_categories = lxml.html.fromstring(response.text)
    categories_links = tree_categories.xpath('//div[contains(@class, "calendar-menu")]//select[contains(@name, "categoryselect")]//option')
    if len(categories_links) <= 1:
        return

    for category_link in tqdm_notebook(categories_links):
        value = category_link.get("value")
        if not value:
            # An option without a link cannot be crawled.
            continue
        url = form_url_by_suburl(value)
        url_parts = date_cat.findall(url)
        if len(url_parts) < 2:
            # Not a site-relative category link.
            continue
        # The first match is the empty segment of "//" after the scheme;
        # the second one is the first path segment, i.e. the category name.
        category = url_parts[1]
        if category == 'fontanka' or category in already_processed:
            continue

        with open(category + '.txt', 'w') as f:
            response_category = requests.get(url)
            tree_year = lxml.html.fromstring(response_category.text)
            years = tree_year.xpath('//div[contains(@class, "calendar-menu")]//select[contains(@name, "yearselect")]//option')
            if len(years) <= 1:
                continue

            documents_count = 0
            # The links are produced lazily, so leaving the loop also stops
            # the download of the remaining day pages.
            for new_link in _iter_news_links(years):
                if documents_count > 2500:
                    break
                title, text = get_content(new_link)
                try:
                    f.write((title + ' ' + text).replace('\n', ' ') + '\n')
                except (TypeError, UnicodeError):
                    # get_content may hand back a non-string body for a page
                    # it could not parse, and some text may not be encodable
                    # for the file; such documents are skipped, not counted.
                    continue
                documents_count += 1


get_big_data()

A Jupyter Widget




In [4]:
# Будем работать только с файлами где больше 500 документов

import os

# Keep only the files of the current directory whose name contains ".txt"
# and which hold more than 500 lines (one document per line).
valid_files = []
for name in os.listdir():
    if ".txt" not in name:
        continue
    with open(name, 'r') as handle:
        line_total = len(handle.readlines())
    if line_total > 500:
        valid_files.append(name)
print(valid_files)

['autop.txt', 'business.txt', 'cat_doc.txt', 'finances.txt', 'fontantv.txt', 'incd.txt', 'media.txt', 'politic.txt', 'realty.txt', 'society.txt', 'spb.txt', 'sport.txt', 'stroy.txt', 'technology.txt', 'turizm.txt', 'witness.txt', 'zhkh.txt']


In [5]:
# Убираем стоп слова из словаря

def get_rid_of_stop_words(dict_words):
    """Remove every stop word listed in ``stopwords.txt`` from ``dict_words``.

    Args:
        dict_words: Dictionary keyed by word. It is modified in place.

    Returns:
        The same ``dict_words`` object, without the stop-word keys.
    """
    with open('stopwords.txt', 'r') as f:
        for line in f:
            # split() without an argument drops the trailing newline and
            # collapses repeated whitespace, so the last word of every line
            # is matched as well.
            for word in line.split():
                dict_words.pop(word, None)
    return dict_words

In [1]:
"""
Выполнение обработки распарсенных файлов с новостями: все слова приводятся к начальной форме, убираются знаки и т.п.
"""

def normilize_file(infilename, outfilename):
    """Write a lemmatized copy of ``infilename`` to ``outfilename``.

    Every input line is split on runs of non-word characters, each token is
    replaced by the first normal form pymorphy2 returns for it, and the
    results are joined with single spaces. One output line is written per
    input line. An existing ``outfilename`` is overwritten.

    Args:
        infilename: Path of the raw text file.
        outfilename: Path of the file to write.
    """
    analyzer = pymorphy2.MorphAnalyzer()
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    splitter = re.compile(r'\W+')
    with open(infilename, 'r') as infile, open(outfilename, 'w') as outfile:
        # The lines are materialized so the progress bar receives a sized
        # sequence rather than a bare file iterator.
        for line in tqdm_notebook(infile.readlines()):
            norm_form = (
                analyzer.normal_forms(word)[0]
                for word in splitter.split(line)
                if word
            )
            outfile.write(' '.join(norm_form) + '\n')

def normilize_all_files():
    """Normalize every file named in the module-level ``valid_files`` list.

    Each file is read from its own path and the result is written under the
    same name inside ``normilized_categories/``.
    """
    for source_name in valid_files:
        normilize_file(source_name, 'normilized_categories/' + source_name)

In [20]:
# Обрабатываем тексты

import os

# Lemmatize every file found in no_normilize/ into a file of the same name
# in normilize/: tokens are split on non-word characters and replaced by
# their first pymorphy2 normal form, one output line per input line.
not_normilized_fs = os.listdir('no_normilize/')
analyzer = pymorphy2.MorphAnalyzer()
# Raw string: '\W' in a plain literal is an invalid escape sequence.
word_splitter = re.compile(r'\W+')
for file in not_normilized_fs:
    with open('no_normilize/' + file, 'r') as f, open('normilize/' + file, 'w') as o:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            words = (word for word in word_splitter.split(line) if word)
            norm_form = (analyzer.normal_forms(word)[0] for word in words)
            o.write(' '.join(norm_form) + '\n')

In [None]:
# разделяем выборки: 80% оставляем под обучение, 20% под тесты

import os

# Split every normalized category file into train and test parts: the first
# 80% of its lines go to train_data.txt and the rest to test_data.txt, each
# line prefixed with "<category>\t" (the file name without ".txt").
#
# Both output files are opened once, outside the loop: reopening them in
# 'w' mode for each category would truncate them and keep only the last
# category.
normalized_files = sorted(os.listdir('normilize/'))
with open('train_data.txt', 'w') as td, open('test_data.txt', 'w') as test:
    for file in normalized_files:
        with open('normilize/' + file, 'r') as f:
            lines = f.readlines()
        category = file.replace('.txt', '')
        train_limit = len(lines) * 0.8
        for index, line in enumerate(lines):
            if not line.endswith('\n'):
                # Keep one document per line even if the final newline of
                # the input file is missing.
                line += '\n'
            target = td if index < train_limit else test
            target.write(category + '\t' + line)

# Обучение
** Используются решения Scikit Learn. Выбранный классификатор - SVM (хочется познакомиться + подходящая область применения). **

In [3]:
# Обучение

def _load_labeled_docs(path):
    """Read ``label<TAB>document`` lines from ``path`` into two parallel lists."""
    docs = []
    labels = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split('\t')
            docs.append(fields[1])
            labels.append(fields[0])
    return docs, labels


def get_learned_svm():
    """Train a linear SVM on ``train_data.txt`` and score it on ``test_data.txt``.

    Both files hold one ``label<TAB>document`` pair per line. Documents are
    turned into TF-IDF vectors; the vectorizer is fitted on the training
    documents only. The accuracy on the test file is printed.

    Returns:
        A ``(svm, vocabulary)`` tuple: the fitted ``SVC`` and the
        ``vocabulary_`` mapping of the vectorizer fitted on the training set.
    """
    docs, y = _load_labeled_docs('train_data.txt')
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(docs)
    svm = SVC(kernel='linear', verbose=True)
    svm.fit(X, y)

    docs_test, y_test = _load_labeled_docs('test_data.txt')
    # transform(), not fit_transform(): the test documents must be projected
    # onto the features learned from the training set. Re-fitting here would
    # change the feature space the SVM was trained on and the vocabulary
    # returned below.
    X_test = vectorizer.transform(docs_test)
    print('На тестовых данных: ')
    print(svm.score(X_test, y_test))
    return svm, vectorizer.vocabulary_


get_learned_svm()

1621 business
4122 cat_doc
8964 finances
11463 fontantv
12380 incd
14881 media
17382 politic
19883 realty
21151 society
23652 spb
26153 sport
28654 stroy
30259 technology
31246 turizm
32170 witness
33839 zhkh


In [14]:
def initialize_tfidf():
    """Fit a TF-IDF vectorizer on the documents stored in ``cat_doc.txt``.

    Every line of the file is split on tabs and its second field is used as
    the document text.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    with open('cat_doc.txt', 'r') as source:
        corpus = [row.split('\t')[1] for row in source]
    tfidf = TfidfVectorizer(min_df=1)
    tfidf.fit_transform(corpus)
    return tfidf

In [126]:
# Classify a sample query: vectorize the single string and ask the SVM for
# its category.
# NOTE(review): `vectorizer_to_our` and `svm` are not defined in the visible
# cells -- presumably a TfidfVectorizer and the trained SVC; confirm.
# NOTE(review): fit_transform() fits the vectorizer on the query itself;
# transform() on an already fitted vectorizer may be what is intended here
# -- verify against how `vectorizer_to_our` is built.
vect = vectorizer_to_our.fit_transform(['popka'])
cat = svm.predict(vect)
# Bare expression: in a notebook cell this displays the first prediction.
cat[0]

'politic'