## Парсинг новостей

In [None]:
import datetime as DT
import pandas as pd
import requests
import csv
from urllib.request import urlopen
from pandas.io.json import json_normalize
import json

# Scrape RBC search results month by month and collect them into df_news.
start_date = DT.datetime(2010, 1, 1)
end_date = DT.datetime(2020, 1, 1)

# Month-start boundaries, formatted the way they are placed in the query string.
dates = pd.date_range(
    min(start_date, end_date),
    max(start_date, end_date),
    freq='MS').strftime('%m/%d/%Y').tolist()

# Shifted copy so that zip() below yields consecutive (start, end) pairs.
dates2 = dates[1:]

lst = []
k = 100  # page size sent as `limit`; also used as the offset step
for start, end in zip(dates, dates2):
    for i in range(0, 5000, k):
        url = "https://www.rbc.ru/v10/search/ajax/?project=quote&dateFrom={}&dateTo={}&offset={}&limit={}&query=%D0%A0%D0%91%D0%9A".format(start, end, i, k)
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            data = json.loads(response.read())
        items = data.get('items') or []
        if not items:
            # An empty page means this month is exhausted: stop paging
            # instead of sending the remaining requests for nothing.
            break
        # pd.json_normalize flattens the list of item dicts directly; the
        # pandas.io.json.json_normalize entry point is deprecated.
        lst.append(pd.json_normalize(items))

df_news = pd.concat(lst, ignore_index=True)

# Drop unneeded columns and duplicated headlines.
df_news = df_news[["anons", "category", "publish_date", "title"]]
df_news = df_news.drop_duplicates(['title'], ignore_index=True)

  df = json_normalize(df['items'].apply(json.dumps).apply(json.loads))


In [None]:
# Preview the first rows of the scraped news table.
df_news.head()

## Обработка и агрегация новостных заголовков

In [None]:
import nltk
from pymystem3 import Mystem
from collections import Counter
import re
import string
from nltk.corpus import stopwords
# tokenize_ru() calls nltk.word_tokenize, which needs the Punkt models in
# addition to the stop-word lists; newer NLTK releases look the models up
# under 'punkt_tab' instead of 'punkt', so request both.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)
pd.set_option('display.max_columns', 10000)

# Load the headlines, keep only the columns used below, drop exact duplicates.
data = pd.read_csv("/Users/yanasidikova/data.csv")
data = data[["date", "datetime", "title", "category"]]
data = data.drop_duplicates()

In [None]:
# Coerce every headline to str in one vectorised step. The previous loop
# iterated over an undefined name (`data2`) and used label-based `[i]`
# lookups, which miss the labels removed by drop_duplicates().
data["title"] = data["title"].astype(str)

In [None]:
def tokenize_ru(file_text):
    """Tokenize text and drop punctuation and Russian stop words.

    The text is split with ``nltk.word_tokenize``. A token is discarded when
    it occurs inside ``string.punctuation`` (a substring test against that
    string) or when it is in the NLTK Russian stop-word list extended with
    the extra entries below. Guillemets are stripped from the surviving
    tokens afterwards.

    Returns the remaining tokens as a list, in their original order.
    """
    extra_stop_words = ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', '–', 'к', 'на', '...', '``', "''", ",", "»", "«", "из-за", "свой", "num", "без", "ко"]
    blacklist = stopwords.words('russian') + extra_stop_words

    kept = []
    for token in nltk.word_tokenize(file_text):
        if token in string.punctuation or token in blacklist:
            continue
        # Quote marks are removed only after filtering, as before.
        kept.append(token.replace("«", "").replace("»", ""))
    return kept

# One shared Mystem instance, reused by every lemmatize_sentence() call.
m = Mystem()
def lemmatize_sentence(text):
    """Lemmatize `text` with Mystem and return the result as one string.

    The pieces returned by ``m.lemmatize`` are concatenated without a
    separator and surrounding whitespace is stripped.
    """
    joined = "".join(m.lemmatize(text))
    return joined.strip()


def preprocessing(df):
    """Normalise headlines and add their lemmatised form.

    Every run of digits in ``title`` is replaced with the placeholder
    ``NUM``; the lower-cased result is passed through ``lemmatize_sentence``
    and stored in a new ``text`` column.

    Parameters:
        df: DataFrame with a ``title`` column.

    Returns:
        A new DataFrame with the rewritten ``title`` and the added ``text``
        column. The input frame is not modified.
    """
    # Work on a copy: the old `df2 = df` alias silently rewrote the caller's
    # frame, and its chained `df2[col].iloc[i] = ...` writes are not
    # guaranteed to reach the frame at all (they do not under Copy-on-Write).
    df2 = df.copy()
    # astype(str) keeps non-string cells (e.g. NaN) from breaking the regex step.
    df2["title"] = df2["title"].astype(str).str.replace(r"\d+", "NUM", regex=True)
    df2["text"] = df2["title"].str.lower().map(lemmatize_sentence)
    return df2

def aggregation(df, var):
    """Concatenate and tokenize the lemmatised headlines of each group.

    Parameters:
        df: DataFrame with a ``text`` column and the grouping column `var`.
        var: name of the column to group by (the callers pass ``"date"`` or
            ``"datetime"``).

    Returns:
        DataFrame with one row per group and three columns: `var`,
        ``text2`` (token list produced by ``tokenize_ru`` from the
        space-joined group text) and ``numwords`` (length of that list).
    """
    grouped_lists = (
        df.groupby(var)["text"]
        .agg(lambda column: " ".join(column))
        .reset_index(name="text2")
    )
    # Whole-column assignment replaces the chained `.iloc[i] = ...` writes,
    # which are unreliable and leave `numwords` as an object column.
    grouped_lists["text2"] = grouped_lists["text2"].map(tokenize_ru)
    grouped_lists["numwords"] = grouped_lists["text2"].map(len)
    return grouped_lists


def counting_words(df):
    """Count how often each token occurs across all rows of ``text2``.

    Parameters:
        df: DataFrame whose ``text2`` column holds iterables of tokens.

    Returns:
        DataFrame with columns ``word`` and ``count``, one row per distinct
        token in first-seen order. The two columns are present even when
        there are no tokens, so the result can always be merged on ``word``.
    """
    counts = Counter()
    for tokens in df['text2']:
        counts.update(tokens)
    # Explicit column names keep the schema stable for empty input.
    return pd.DataFrame(list(counts.items()), columns=["word", "count"])

def index_counter(df, wordlist, t):
    """Compute the average word rate per row of an aggregated frame.

    For each row, the ``rate`` of every token in ``text2`` is looked up in
    `wordlist` (tokens that are not listed count as 0), the rates are summed
    and the sum is divided by the row's ``numwords``.

    Parameters:
        df: DataFrame with ``text2`` (token lists), ``numwords`` and the
            column named by `t`.
        wordlist: DataFrame with ``word`` and ``rate`` columns.
        t: name of the column in `df` that is copied into the result as ``t``.

    Returns:
        DataFrame with columns ``t`` and ``ind``, indexed like `df`. Rows
        whose ``numwords`` is 0 get NaN in ``ind``.
    """
    rates = dict(zip(wordlist["word"], wordlist["rate"]))
    scores = [
        sum(rates.get(word, 0) for word in tokens)
        for tokens in df['text2']
    ]
    # numwords may arrive as an object column; make it numeric and mask zeros
    # so that an empty row yields NaN instead of a division error.
    num = pd.to_numeric(df['numwords'], errors='coerce')
    index = pd.DataFrame({'t': df[t]})
    # Plain element-wise division; a per-group apply is not needed for this.
    index['ind'] = pd.Series(scores, index=df.index, dtype='float64') / num.where(num != 0)
    return index[['t', 'ind']]

In [None]:
# Run the digit replacement and lemmatisation over the loaded headlines.
data_cleaned = preprocessing(data)

In [None]:
# Category-specific subsets of the cleaned headlines.
politics = data_cleaned[data_cleaned.category == 'Политика']
finance = data_cleaned[(data_cleaned.category == 'Финансы') | (data_cleaned.category == 'Экономика')]
business = data_cleaned[data_cleaned.category == 'Бизнес']
# This statement was fused onto the end of the previous line, which made the
# cell a SyntaxError; it is kept as its own statement.
data = data[['date', 'title']]

In [None]:
# Aggregate each subset twice: grouped by "date" and grouped by "datetime".
_GROUP_KEYS = ("date", "datetime")
data_daily, data_hourly = [aggregation(data_cleaned, key) for key in _GROUP_KEYS]
pol_daily, pol_hourly = [aggregation(politics, key) for key in _GROUP_KEYS]
fin_daily, fin_hourly = [aggregation(finance, key) for key in _GROUP_KEYS]
bus_daily, bus_hourly = [aggregation(business, key) for key in _GROUP_KEYS]

## Создание словарей и расчет индексов тональности

In [None]:
# Word frequencies over all headlines and per category (daily aggregation).
wordlist = counting_words(data_daily)
wordlist_pol = counting_words(pol_daily)
wordlist_fin = counting_words(fin_daily)
wordlist_bus = counting_words(bus_daily)

# Titles listed in the two "positive" CSV files.
pos_d = pd.read_csv("/Users/yanasidikova/positive_news_daily.csv")
pos_h = pd.read_csv("/Users/yanasidikova/hrl_pos.csv")
pos_d = pos_d.drop_duplicates()
pos_h = pos_h.drop_duplicates()
pos_d = pos_d["title"].tolist()
# Was `pos_hourly["title"]`: that name is never defined, so this raised a
# NameError; the frame loaded above is `pos_h`.
pos_h = pos_h["title"].tolist()

# .copy() so the column assignments below write to independent frames
# rather than to filtered views of data_cleaned.
positive_daily = data_cleaned[data_cleaned['title'].isin(pos_d)].copy()
positive_hourly = data_cleaned[data_cleaned['title'].isin(pos_h)].copy()
positive_daily['text2'] = positive_daily['text']
positive_hourly['text2'] = positive_hourly['text']
pos_pol_daily = positive_daily[positive_daily.category == 'Политика']
pos_pol_hourly = positive_hourly[positive_hourly.category == 'Политика']
pos_fin_daily = positive_daily[(positive_daily.category == 'Финансы') | (positive_daily.category == 'Экономика')]
pos_fin_hourly = positive_hourly[(positive_hourly.category == 'Финансы') | (positive_hourly.category == 'Экономика')]
pos_bus_daily = positive_daily[positive_daily.category == 'Бизнес']
pos_bus_hourly = positive_hourly[positive_hourly.category == 'Бизнес']

# Aggregate the positive headlines the same way as the full data set.
positive_daily = aggregation(positive_daily, "date")
positive_hourly = aggregation(positive_hourly, "datetime")
pos_pol_daily = aggregation(pos_pol_daily, "date")
pos_pol_hourly = aggregation(pos_pol_hourly, "datetime")
pos_fin_daily = aggregation(pos_fin_daily, "date")
pos_fin_hourly = aggregation(pos_fin_hourly, "datetime")
pos_bus_daily = aggregation(pos_bus_daily, "date")
pos_bus_hourly = aggregation(pos_bus_hourly, "datetime")

# Word frequencies within the positive headlines.
pos_wordlist_d = counting_words(positive_daily)
pos_wordlist_h = counting_words(positive_hourly)
pos_wordlist_pol_d = counting_words(pos_pol_daily)
pos_wordlist_pol_h = counting_words(pos_pol_hourly)
pos_wordlist_fin_d = counting_words(pos_fin_daily)
pos_wordlist_fin_h = counting_words(pos_fin_hourly)
pos_wordlist_bus_d = counting_words(pos_bus_daily)
pos_wordlist_bus_h = counting_words(pos_bus_hourly)

In [None]:
def _rated_wordlist(all_counts, positive_counts, min_total, min_positive=10):
    """Merge overall and positive word counts and rate each word.

    Parameters:
        all_counts: DataFrame with ``word`` and ``count`` over all headlines.
        positive_counts: DataFrame with ``word`` and ``count`` over the
            positive headlines.
        min_total: minimum overall count (``count_x``) a word needs to be kept.
        min_positive: minimum positive count (``count_y``) a word needs to be
            kept.

    Returns:
        DataFrame with ``word``, ``count_x``, ``count_y`` and
        ``rate = count_y / count_x``, restricted to words that meet both
        thresholds.
    """
    # fillna(0) replaces the previous `.replace(np.nan, 0)`, which raised a
    # NameError because numpy is never imported.
    merged = pd.merge(all_counts, positive_counts, on='word', how='left').fillna(0)
    # Plain column division; a per-word groupby/apply is not needed.
    merged['rate'] = merged['count_y'] / merged['count_x']
    keep = (merged['count_x'] >= min_total) & (merged['count_y'] >= min_positive)
    return merged[keep].reset_index(drop=True)


# The old `for df in (...): df = df[...]` loops only rebound the loop
# variable, so the frequency thresholds were never applied; they are
# enforced inside the helper now.
wordlist_d = _rated_wordlist(wordlist, pos_wordlist_d, 400)
wordlist_h = _rated_wordlist(wordlist, pos_wordlist_h, 400)
wordlist_pol_d = _rated_wordlist(wordlist_pol, pos_wordlist_pol_d, 100)
wordlist_pol_h = _rated_wordlist(wordlist_pol, pos_wordlist_pol_h, 100)
wordlist_fin_d = _rated_wordlist(wordlist_fin, pos_wordlist_fin_d, 100)
wordlist_fin_h = _rated_wordlist(wordlist_fin, pos_wordlist_fin_h, 100)
wordlist_bus_d = _rated_wordlist(wordlist_bus, pos_wordlist_bus_d, 100)
wordlist_bus_h = _rated_wordlist(wordlist_bus, pos_wordlist_bus_h, 100)

In [None]:
def _combine_indices(overall, politics_idx, finance_idx, business_idx):
    """Join the overall index with the three category indices on ``t``.

    Each argument is a DataFrame with ``t`` and ``ind`` columns. The result
    keeps every row of `overall` and has the columns ``t``, ``ind``,
    ``ind_p``, ``ind_f`` and ``ind_b``; values missing after a join are
    filled with 0.
    """
    combined = overall[['t', 'ind']].copy()
    for part, suffix in ((politics_idx, '_p'), (finance_idx, '_f'), (business_idx, '_b')):
        # fillna(0) replaces `.replace(np.nan, 0)`, which raised a NameError
        # because numpy is never imported.
        combined = pd.merge(combined,
                            part[['t', 'ind']],
                            on='t',
                            how='left', suffixes=('', suffix)).fillna(0)
    return combined


# Indices over all headlines and per category, grouped by "date" ...
d_ind = index_counter(data_daily, wordlist_d, "date")
d_ind_p = index_counter(pol_daily, wordlist_pol_d, "date")
d_ind_f = index_counter(fin_daily, wordlist_fin_d, "date")
d_ind_b = index_counter(bus_daily, wordlist_bus_d, "date")
# ... and grouped by "datetime".
h_ind = index_counter(data_hourly, wordlist_h, "datetime")
h_ind_p = index_counter(pol_hourly, wordlist_pol_h, "datetime")
h_ind_f = index_counter(fin_hourly, wordlist_fin_h, "datetime")
h_ind_b = index_counter(bus_hourly, wordlist_bus_h, "datetime")

day = _combine_indices(d_ind, d_ind_p, d_ind_f, d_ind_b)
day.to_csv("daily4M.csv")

hour = _combine_indices(h_ind, h_ind_p, h_ind_f, h_ind_b)
hour.to_csv("hourly4M.csv")