In [1]:
#!pip install html5lib
#!pip install ipython-sql
#!pip install nltk
#!pip install pysentiment2

import requests
from   bs4 import BeautifulSoup
import html5lib
import time
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import dtale
import sqlalchemy
from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()
import pysentiment2 as ps
import nltk
import re
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import seaborn as sns

In [2]:
url = "https://www.bild.de/politik/international/bild-international/home-44225950.bild.html"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [3]:
bild = BeautifulSoup(content.content, 'html.parser')

headlines = bild.find_all('h3')

df_bild = pd.DataFrame([i.text for i in headlines], columns=["bild"]).replace("\n", "", regex=True).replace("\t", "", regex=True)
print (df_bild.size, df_bild.head())

25                                                 bild
0  INTERNATIONAL HOLOCAUST REMEMBRANCE DAY :  How...
1  BILD exclusive :  „Putin knows that Navalny is...
2  BILD EXCLUSIVE :  With this letter, the vaccin...
3  Interview with the US Ambassador to Germany : ...
4  Fiancé of Jamal Khashoggi :  “I’m still waitin...


In [4]:
url = "https://www.spiegel.de/international/"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [5]:
spiegel = BeautifulSoup(content.content, 'html.parser')

headlines = spiegel.find_all(attrs={'class':'align-middle'})

df_spiegel = pd.DataFrame([i.text for i in headlines], columns=["spiegel"])
print (df_spiegel.size, df_spiegel.head())

31                                              spiegel
0  Deploying Codewords and Fake Online Shops agai...
1  "It Appears that Bribe Money Is Coming from Ge...
2  "We Need To Stand for Freedom and Openness – A...
3  Croatia Conducts Illegal Pushbacks of Vulnerab...
4  Hopes Are High for the Technology that Is Lead...


In [6]:
url = "https://www.zeit.de/english/index"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [7]:
zeit = BeautifulSoup(content.content, 'html.parser')

headlines = zeit.find_all(attrs={'class':'zon-teaser-standard__title'})

df_zeit = pd.DataFrame([i.text for i in headlines], columns=["zeit"])
print (df_zeit.size, df_zeit.head())

20                                                 zeit
0  "A Finance Minister Merz would be a systemic r...
1                                    Only In Germany
2                   The Europeanization of Football 
3                            The Fairy Tale Grandson
4                                Americans, go home?


In [8]:
url = "https://www.faz.net/english/"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [9]:
faz = BeautifulSoup(content.content, 'html.parser')

headlines = faz.find_all(attrs={'class':'tsr-Base_HeadlineText'})

df_faz = pd.DataFrame([i.text for i in headlines], columns=["faz"]).replace("\n", "", regex=True).replace("\t", "", regex=True)
print (df_faz.size, df_faz.head())

20                                                  faz
0            „We can’t do business as usual“        
1  „We have to fight misinformation because peopl...
2                  Do not ruin the Internet!        
3                              The Black Axe        
4  Time for a German-British Friendship Treaty   ...


In [10]:
url = "https://www.bbc.com/news"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [11]:
bbc = BeautifulSoup(content.content, 'html.parser')

headlines = bbc.find_all('h3')

df_bbc = pd.DataFrame([i.text for i in headlines], columns=["bbc"])
print (df_bbc.size, df_bbc.head())

54                                                  bbc
0          Dozens dead amid historic Canada heatwave
1          Dozens dead amid historic Canada heatwave
2   Letter warned residents of damage to Miami block
3  Half of Australians in lockdown amid vaccine c...
4  Ethiopian rebels gain more ground in war-torn ...


In [12]:
url = "https://www.cnn.com/business"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [13]:
cnn = BeautifulSoup(content.content, 'html.parser')

headlines = cnn.find_all(attrs={'class': 'cd__headline'})

df_cnn = pd.DataFrame([i.text for i in headlines], columns=["cnn"])
print (df_cnn.size, df_cnn.head())

35                                                  cnn
0  China is facing its worst power shortage in a ...
1  Big Tech is booming again, and the bull run ap...
2  Elon Musk touts SpaceX internet growth, says g...
3  US government requires automakers to report dr...
4  First-year analysts at JPMorgan will now make ...


In [14]:
url = "https://www.cnbc.com"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [15]:
cnbc = BeautifulSoup(content.content, 'html.parser')

headlines = cnbc.find_all(attrs={'class':'Card-title'})

df_cnbc = pd.DataFrame([i.text for i in headlines], columns=["cnbc"])
print (df_cnbc.size, df_cnbc.head())

34                                                 cnbc
0  Warren Buffett says the pandemic has had an 'e...
1  U.S. stock futures are little changed as the m...
2  Stocks should add to gains in the second half,...
3  A key indicator shows we are past peak inflati...
4  Goldman Sachs picks 12 global stocks to buy ah...


In [16]:
url = "https://www.nytimes.com/section/world"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [17]:
nyt_w = BeautifulSoup(content.content, 'html.parser')

headlines = nyt_w.find_all(name="h2")

df_nyt_w = pd.DataFrame([i.text for i in headlines], columns=["nyt_w"]).iloc[1:].reset_index(drop=True)
print (df_nyt_w.size, df_nyt_w.head())

33                                                nyt_w
0  ‘A Form of Brainwashing’: China Remakes Hong Kong
1  Jubilant Tigray Capital Greets Insurgents Afte...
2  North Korea reports a ‘great crisis’ in its Co...
3  Security in Afghanistan Is Decaying, U.S. Gene...
4  South African Court Orders Arrest of Ex-Presid...


In [18]:
url = "https://www.nytimes.com/section/politics"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [19]:
nyt_p = BeautifulSoup(content.content, 'html.parser')

headlines = nyt_p.find_all(name="h2")

df_nyt_p = pd.DataFrame([i.text for i in headlines], columns=["nyt_p"]).iloc[1:].reset_index(drop=True)
print (df_nyt_p.size, df_nyt_p.head())

18                                                nyt_p
0  Wisconsin G.O.P. Wrestles With Just How Much t...
1  No. 3 House Democrat Steps Into Ohio Race to H...
2  House Passes Bill to Speed Visas for Afghans I...
3  Biden Faces Intense Cross Currents in Iran Policy
4  Fed Unity Cracks as Inflation Rises and Offici...


In [20]:
url = "https://www.nytimes.com/section/business"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [21]:
nyt_b = BeautifulSoup(content.content, 'html.parser')

headlines = nyt_b.find_all(name="h2")

df_nyt_b = pd.DataFrame([i.text for i in headlines], columns=["nyt_b"]).iloc[1:].reset_index(drop=True)
print (df_nyt_b.size, df_nyt_b.head())

40                                                nyt_b
0  Congress Faces Renewed Pressure to ‘Modernize ...
1  Lower Rents? Check. Speakeasy? Check. How Offi...
2  Minor League Parks, Stripped of America’s Past...
3  Fox News agrees to a $1 million penalty after ...
4  Boom Times for Lawyers as Washington Pursues B...


In [22]:
url = "https://www.nytimes.com/section/technology"

content = requests.get(url = url, headers = {'Accept': 'text/html'}, timeout=(3, 27))

content.status_code

200

In [23]:
nyt_t = BeautifulSoup(content.content, 'html.parser')

headlines = nyt_t.find_all(name="h2")

df_nyt_t = pd.DataFrame([i.text for i in headlines], columns=["nyt_t"]).iloc[1:].reset_index(drop=True)
print (df_nyt_t.size, df_nyt_t.head())

23                                                nyt_t
0  Boom Times for Lawyers as Washington Pursues B...
1  Judge Throws Out 2 Antitrust Cases Against Fac...
2                       What Won’t the Nelk Boys Do?
3                   Apple’s Strategy Bends the World
4                                Personal Technology


In [24]:
nltk.download("vader_lexicon")

def get_score(row):
    sid=SIA()
    score=sid.polarity_scores(row)
    return score["compound"]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jonas/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [25]:
news = ["bild", "spiegel", "zeit", "faz", "bbc", "cnn", "cnbc", "nyt_w", "nyt_p", "nyt_b", "nyt_t"]
for k in news:
    exec(f'{k}_score = df_{k}["{k}"].apply(get_score)')

But because the number of Articles differ from news outlet to news outlet, we will only include the more robust mean of our scores:

In [26]:
for k in news:
    exec(f'vader_{k} = {k}_score.mean()') 

Now let's define our new models

In [27]:
hiv4 = ps.HIV4()
lm = ps.LM()

In [28]:
def get_pyscore(row,model):
    return pd.Series(model.get_score(model.tokenize(text=row)))

Now the other models, lets start with hiv4

In [29]:
for k in news:
    exec(f'{k}_score_HV = df_{k}["{k}"].apply(get_pyscore, model=hiv4).drop(["Positive", "Negative"], axis=1)') 
    exec(f'{k}_score_HV.columns = ["Polarity_HV_{k}", "Subjectivity_HV_{k}"]')

Now the lm model

In [30]:
for k in news:
    exec(f'{k}_score_LM = df_{k}["{k}"].apply(get_pyscore, model=lm).drop(["Positive", "Negative"], axis=1)') 
    exec(f'{k}_score_LM.columns = ["Polarity_LM_{k}", "Subjectivity_LM_{k}"]') 

Now lets put it together and calculate the mean

In [31]:
for k in news:
    exec(f'{k}_scores = pd.concat([{k}_score_HV, {k}_score_LM], axis=1)') 
    exec(f'{k}_scores = {k}_scores.mean().to_frame().T')

Now lets put all of our analysis together

In [32]:
for k in news:
    exec(f'{k}_scores.insert(loc=0, column="vader_{k}", value = vader_{k})')

Lets store it in a Data Frame

In [33]:
daily_scores = pd.DataFrame()
for k in news:
    exec(f'daily_scores = pd.concat([daily_scores, {k}_scores], axis=1, sort=False)')

And we add a Timestamp

In [34]:
n = pd.to_datetime('today').strftime("%m/%d/%Y")
n

'06/30/2021'

In [35]:
daily_scores.insert(loc=0, column='Timestamp', value=n)

In [36]:
daily_scores

Unnamed: 0,Timestamp,vader_bild,Polarity_HV_bild,Subjectivity_HV_bild,Polarity_LM_bild,Subjectivity_LM_bild,vader_spiegel,Polarity_HV_spiegel,Subjectivity_HV_spiegel,Polarity_LM_spiegel,...,vader_nyt_b,Polarity_HV_nyt_b,Subjectivity_HV_nyt_b,Polarity_LM_nyt_b,Subjectivity_LM_nyt_b,vader_nyt_t,Polarity_HV_nyt_t,Subjectivity_HV_nyt_t,Polarity_LM_nyt_t,Subjectivity_LM_nyt_t
0,06/30/2021,-0.11868,-0.266666,0.282889,-0.32,0.107397,-0.038232,-0.016129,0.252534,-0.290322,...,0.017873,0.15,0.255833,-0.125,0.066458,0.041843,2.17391e-08,0.219738,0.086956,0.091718


In [37]:
daily_scores.to_csv('daily_scores.csv', mode='a', header=False)