<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1">Load data</a></span></li><li><span><a href="#Create-website-data" data-toc-modified-id="Create-website-data-2">Create website data</a></span></li><li><span><a href="#Web-scrape-news-articles" data-toc-modified-id="Web-scrape-news-articles-3">Web scrape news articles</a></span></li><li><span><a href="#Generate-sentiment-scores" data-toc-modified-id="Generate-sentiment-scores-4">Generate sentiment scores</a></span></li><li><span><a href="#Add-scores-to-data" data-toc-modified-id="Add-scores-to-data-5">Add scores to data</a></span></li></ul></div>

- git lfs install
- git clone https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
from transformers import pipeline

## Load data

In [2]:
path = r'/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/'
years = os.listdir(path)
years.remove('.DS_Store')
yr = [x for x in years if not (x.startswith('.'))]

year_paths= []
months=[]
file_paths= []

 

for i in range(len(years)): # enter year folder
    year_paths.append(path + years[i]) # year paths
    months.extend(os.listdir(year_paths[i])) # months of data in each year
months = [x for x in months if not (x.startswith('.'))]
for j in range(len(months)): # enter months in each year folder
    file_paths.append(year_paths[i] + "/" + months[j]) # file in year path

In [3]:
file_paths

['/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/2021/gdelt_mar2021.csv',
 '/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/2021/gdelt_jan2021.csv']

In [4]:
for path in file_paths:
    df = pd.read_csv(path, index_col=[0]) # read in csv

## Create website data

In [5]:
yahoo_df = df[df.website=='yahoo.com'] # filter for website
yahoo_df 

Unnamed: 0,year,month,day,website,url
2,2021,1,1,yahoo.com,https://news.yahoo.com/cheektowaga-police-offi...
3,2021,1,1,yahoo.com,https://news.yahoo.com/casey-anthony-files-pap...
5,2021,1,1,yahoo.com,https://news.yahoo.com/person-killed-wilmingto...
6,2021,1,1,yahoo.com,https://news.yahoo.com/annual-philadelphia-yea...
7,2021,1,1,yahoo.com,https://news.yahoo.com/hawley-faces-heat-senat...
...,...,...,...,...,...
101517,2021,1,31,yahoo.com,https://news.yahoo.com/1-30-2021-forecast-2215...
101518,2021,1,31,yahoo.com,https://news.yahoo.com/ap-top-stories-january-...
101519,2021,1,31,yahoo.com,https://news.yahoo.com/wind-picks-slushy-condi...
101521,2021,1,31,yahoo.com,https://news.yahoo.com/accuweather-periods-sno...


In [6]:
yahoo_url = yahoo_df.url[0:10]
yahoo_url

2     https://news.yahoo.com/cheektowaga-police-offi...
3     https://news.yahoo.com/casey-anthony-files-pap...
5     https://news.yahoo.com/person-killed-wilmingto...
6     https://news.yahoo.com/annual-philadelphia-yea...
7     https://news.yahoo.com/hawley-faces-heat-senat...
8     https://news.yahoo.com/covid-19-wisconsin-3-81...
9     https://news.yahoo.com/amazon-fire-tv-india-23...
10    https://news.yahoo.com/whos-caring-caregivers-...
11    https://news.yahoo.com/red-hot-buccaneers-not-...
12    https://news.yahoo.com/walt-disney-dis-gains-l...
Name: url, dtype: object

## Web scrape news articles

In [7]:
# for i in range(len(yahoo_url)): #filter_urls
#     URL = yahoo_url.iloc[i] # filter_urls
#     page = requests.get(URL,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
#     soup = BeautifulSoup(page.content, "html.parser")
#     print(URL)
#     try:
#         try:
#             master = soup.find(id="Masterwrap")
#             final = master.find("div", class_="caas-body")
#         except AttributeError:
#             master = soup.find(id="Masterwrap2Col")
#             final = master.find("div", class_="caas-body")
#     except AttributeError:
#         #ls.append(URL)
#         pass
#     try:
#         text = re.sub('<[^>]+>', '', str(final))
#         print(text)
#     except:
#         pass

In [8]:
# sen_list = text.split(".")
# sen_list

## Generate sentiment scores

In [9]:
roberta = pipeline(task='sentiment-analysis', 
                   model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
                   return_all_scores=True)

In [10]:
def summary_stats(score):
    avg = np.mean(score)
    med = np.median(score)
    min = np.min(score)
    max = np.max(score)
    return(avg, med, min, max)

In [11]:
# que = ""
# char_len = 0
# pos = []
# neg = []
# neu = []
# for sen in sen_list:
#     if char_len + len(sen) < 514:
#         que = que + sen
#         char_len = char_len + len(sen)
#     else:
#         output = roberta(que)
#         neg.append(output[0][0]['score'])
#         neu.append(output[0][1]['score'])
#         pos.append(output[0][2]['score'])
#         que = ""
#         que = sen
#         char_len = len(sen)

In [12]:
# summary_stats(neg)

## Add scores to data

In [13]:
yahoo_df["pos_mean"] = np.nan
yahoo_df["pos_median"] = np.nan
yahoo_df["pos_min"] = np.nan
yahoo_df["pos_max"] = np.nan

yahoo_df["neg_mean"] = np.nan
yahoo_df["neg_median"] = np.nan
yahoo_df["neg_min"] = np.nan
yahoo_df["neg_max"] = np.nan

yahoo_df["neu_mean"] = np.nan
yahoo_df["neu_median"] = np.nan
yahoo_df["neu_min"] = np.nan
yahoo_df["neu_max"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yahoo_df["pos_mean"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yahoo_df["pos_median"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yahoo_df["pos_min"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

In [15]:
nf = "Content is currently unavailable"
for i in range(len(yahoo_url)): #filter_urls
    URL = yahoo_url.iloc[i] # filter_urls
    page = requests.get(URL,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    soup = BeautifulSoup(page.content, "html.parser")
    if nf in soup.prettify():
            continue
    try:
        try:
            master = soup.find(id="Masterwrap")
            final = master.find("div", class_="caas-body")
        except AttributeError:
            master = soup.find(id="Masterwrap2Col")
            final = master.find("div", class_="caas-body")
    except AttributeError:
        pass
    text = re.sub('<[^>]+>', '', str(final))
    sen_list = text.split(".")
    
    que = ""
    char_len = 0
    pos = []
    neg = []
    neu = []

    for sen in sen_list:
        if sen_list.index(sen) == len(sen_list)-1:
            output = roberta(que)
            neg.append(output[0][0]['score'])
            neu.append(output[0][1]['score'])
            pos.append(output[0][2]['score'])
            que = ""
            que = sen
            char_len = len(sen)
        elif char_len + len(sen) < 514:
            que = que + sen
            char_len = char_len + len(sen)
        else:
            output = roberta(que)
            neg.append(output[0][0]['score'])
            neu.append(output[0][1]['score'])
            pos.append(output[0][2]['score'])
            que = ""
            que = sen
            char_len = len(sen)
            
    yahoo_df.pos_mean.iloc[i], yahoo_df.pos_median.iloc[i], yahoo_df.pos_min.iloc[i], yahoo_df.pos_max.iloc[i] = summary_stats(pos)
    yahoo_df.neg_mean.iloc[i], yahoo_df.neg_median.iloc[i], yahoo_df.neg_min.iloc[i], yahoo_df.neg_max.iloc[i] = summary_stats(neg)
    yahoo_df.neu_mean.iloc[i], yahoo_df.neu_median.iloc[i], yahoo_df.neu_min.iloc[i], yahoo_df.neu_max.iloc[i] = summary_stats(neu)
    
yahoo_df[0:10]

Unnamed: 0,year,month,day,website,url,pos_mean,pos_median,pos_min,pos_max,neg_mean,neg_median,neg_min,neg_max,neu_mean,neu_median,neu_min,neu_max
2,2021,1,1,yahoo.com,https://news.yahoo.com/cheektowaga-police-offi...,,,,,,,,,,,,
3,2021,1,1,yahoo.com,https://news.yahoo.com/casey-anthony-files-pap...,5.8e-05,5.8e-05,5.8e-05,5.8e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,0.999858,0.999858,0.999858,0.999858
5,2021,1,1,yahoo.com,https://news.yahoo.com/person-killed-wilmingto...,,,,,,,,,,,,
6,2021,1,1,yahoo.com,https://news.yahoo.com/annual-philadelphia-yea...,,,,,,,,,,,,
7,2021,1,1,yahoo.com,https://news.yahoo.com/hawley-faces-heat-senat...,0.06818,0.001527,0.00016,0.663731,0.373567,0.046289,7e-05,0.997734,0.558253,0.633886,0.001246,0.99977
8,2021,1,1,yahoo.com,https://news.yahoo.com/covid-19-wisconsin-3-81...,0.053313,0.053313,0.053313,0.053313,0.03868,0.03868,0.03868,0.03868,0.908007,0.908007,0.908007,0.908007
9,2021,1,1,yahoo.com,https://news.yahoo.com/amazon-fire-tv-india-23...,6.6e-05,6.2e-05,4.6e-05,9e-05,6.3e-05,6.4e-05,5e-05,7.6e-05,0.999871,0.999888,0.999834,0.99989
10,2021,1,1,yahoo.com,https://news.yahoo.com/whos-caring-caregivers-...,5.8e-05,5.8e-05,5.8e-05,5.8e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,0.999858,0.999858,0.999858,0.999858
11,2021,1,1,yahoo.com,https://news.yahoo.com/red-hot-buccaneers-not-...,0.36107,0.016525,0.000402,0.999422,0.208256,0.000414,0.000199,0.997733,0.430675,0.189515,0.000365,0.99933
12,2021,1,1,yahoo.com,https://news.yahoo.com/walt-disney-dis-gains-l...,0.65096,0.896764,0.000112,0.999522,0.190874,0.000621,6.5e-05,0.998813,0.158166,0.001411,8e-05,0.999822
