In [141]:
%load_ext autoreload
%autoreload 1

import sys
sys.path.append('../src')

from data import news_articles, tone

%aimport data.news_articles
%aimport data.tone

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [142]:
import requests
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

# Fetch News Articles

In [13]:
# Candidate surnames passed to the article fetch below.
relevant_candidates = [
    'Klobuchar', 'Bloomberg', 'Buttigieg',
    'Biden', 'Warren', 'Sanders',
]

In [43]:
# Fetch article records for every name in relevant_candidates and load them
# into a DataFrame (one row per record).
# NOTE(review): presumably this queries an external news API - confirm in data.news_articles.
article_data = news_articles.fetch_for_candidates(relevant_candidates)
df_articles = pd.DataFrame.from_records(article_data)

In [209]:
# Persist the fetched article records. With the default arguments the
# DataFrame index is written as the first (unnamed) CSV column.
df_articles.to_csv('../data/raw/articles.csv')

# Fetch Full Content

In [56]:
# URLs of all fetched articles as a NumPy array; input for the full-text fetch.
article_urls = df_articles['url'].values

In [67]:
def track_full_text_fetch(urls):
    """Fetch the full text for each URL in order, printing a running counter.

    The 1-based position of each URL is printed just before that URL is
    fetched, so progress stays visible during a long run.

    Args:
        urls: Iterable of article URLs.

    Returns:
        A list holding one ``news_articles.fetch_full_text`` result per URL,
        in input order.
    """
    fetched = []
    for position, url in enumerate(urls, start=1):
        print(position)
        fetched.append(news_articles.fetch_full_text(url))
    return fetched

In [None]:
# Fetch the full text for every article URL; one counter line is printed per URL.
article_text_records = track_full_text_fetch(article_urls)

In [72]:
# Turn the fetched full-text records into a DataFrame (one row per record).
full_text = pd.DataFrame.from_records(article_text_records)

In [210]:
# Persist the full-text records in the same raw-data directory as articles.csv.
# The DataFrame index is written as the first (unnamed) CSV column.
full_text.to_csv('../data/raw/articles_full_text.csv')

# Combine

In [211]:
# Reload both raw CSVs from disk - presumably so the steps below can run
# without repeating the fetches above. Because the files were written with
# their index, each loaded frame carries an extra 'Unnamed: 0' column.
df_articles_loaded = pd.read_csv('../data/raw/articles.csv')
full_text_loaded = pd.read_csv('../data/raw/articles_full_text.csv')

In [258]:
# Place the two frames side by side. Rows are matched on the index (row
# position after the reload), not on the article URL.
articles_merged = pd.concat([df_articles_loaded, full_text_loaded], axis=1)
# Column names that occur in both frames would now appear twice; keep only
# the first occurrence of each name.
articles_merged = articles_merged.loc[:,~articles_merged.columns.duplicated()]

# Sentiment

In [223]:
# Work on a copy so adding sentiment columns leaves articles_merged unchanged.
articles_sent = articles_merged.copy()

In [79]:
# Score every article description and full text with VADER, keeping only the
# 'compound' entry of the returned score dict.
# The scores are read from articles_sent, the working copy created for this
# section, so the cell runs on a fresh kernel (no dependency on names left
# over from earlier sessions).
# NOTE(review): rows whose description/text is missing (NaN after the CSV
# reload) are passed to the analyzer as-is - confirm the inputs are all strings.
analyzer = SentimentIntensityAnalyzer()
desc_sent = articles_sent.description.map(lambda d: analyzer.polarity_scores(d)['compound'])
full_text_sent = articles_sent.text.map(lambda d: analyzer.polarity_scores(d)['compound'])

In [82]:
# Attach both sentiment series as new columns; assignment aligns on the index.
articles_sent['desc_sentiment'] = desc_sent
articles_sent['full_text_sentiment'] = full_text_sent

# Tone

In [283]:
# Work on a copy so adding tone columns leaves articles_sent unchanged.
articles_tone = articles_sent.copy()

## Fetch

In [None]:
# Run tone.get on each article description, one call per row.
# NOTE(review): presumably this calls an external tone-analysis service - confirm in data.tone.
desc_tone = articles_tone.description.map(tone.get)

In [None]:
# Run tone.get on each article's full text, one call per row.
# NOTE(review): presumably this calls an external tone-analysis service - confirm in data.tone.
full_text_tone = articles_tone.text.map(tone.get)

In [95]:
# Keep the unprocessed tone results next to the articles; the per-tone score
# columns and the sentence-level rows are derived from these columns later.
articles_tone['raw_desc_tone'] = desc_tone
articles_tone['raw_text_tone'] = full_text_tone

## Process

In [143]:
# For each tone, derive one score column from the raw description result and
# one from the raw full-text result.
for t in ['analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative']:
    articles_tone['desc_' + t] = articles_tone['raw_desc_tone'].map(
        lambda raw: tone.extract_score(raw, t)
    )
    articles_tone['full_text_' + t] = articles_tone['raw_text_tone'].map(
        lambda raw: tone.extract_score(raw, t)
    )

# Persist

In [269]:
# Persist the articles together with their sentiment and tone columns.
# The DataFrame index is written as the first (unnamed) CSV column.
articles_tone.to_csv('../data/interim/candidate_articles_enriched.csv')

# Sentence Data

In [270]:
def process_and_add_sentence_row(records, raw_row):
    """Append sentence-level tone rows for one article to ``records``.

    When the article's ``raw_text_tone`` has no (or an empty)
    ``sentences_tone`` entry, a single placeholder row holding only ``url``
    and ``query`` is appended. Otherwise one row is appended per sentence:
    ``url`` and ``query``, the sentence's own fields, and a ``<tone>_score``
    entry for each of the seven tones; the raw ``tones`` entry is removed.

    Args:
        records: List that is extended in place with the generated row dicts.
        raw_row: Row-like object supporting ``[]`` access to ``url``,
            ``query`` and ``raw_text_tone``.

    Returns:
        None. Results are delivered by mutating ``records``.
    """
    tone_names = ('analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative')
    sentences = raw_row['raw_text_tone'].get('sentences_tone')

    if sentences:
        for sentence in sentences:
            row = {'url': raw_row['url'], 'query': raw_row['query']}
            row.update(sentence)
            # Take the raw tone list out of the row; only the scores are kept.
            sentence_tones = row.pop('tones')
            for name in tone_names:
                row[name + '_score'] = tone.extract_score_from_tones(sentence_tones, name)
            records.append(row)
    else:
        # No sentence breakdown available: keep the article represented by one bare row.
        records.append({'url': raw_row['url'], 'query': raw_row['query']})

In [None]:
# Collect sentence-level rows for every article. DataFrame.apply is used here
# only for its side effect of appending to sentence_records; its return value
# is discarded. The list is reset at the top of the cell, so re-running the
# cell does not duplicate rows.
sentence_records = []
articles_tone.apply(lambda r: process_and_add_sentence_row(sentence_records, r), axis=1)
df_sentences = pd.DataFrame.from_records(sentence_records)

In [293]:
# Persist the sentence-level rows.
# NOTE(review): this derived table is written under data/raw rather than
# data/interim - confirm that location is intended.
df_sentences.to_csv('../data/raw/articles_sentences.csv')