<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1">Load data</a></span></li><li><span><a href="#Create-website-data" data-toc-modified-id="Create-website-data-2">Create website data</a></span></li><li><span><a href="#Web-scrape-news-articles" data-toc-modified-id="Web-scrape-news-articles-3">Web scrape news articles</a></span></li><li><span><a href="#Generate-sentiment-scores" data-toc-modified-id="Generate-sentiment-scores-4">Generate sentiment scores</a></span></li><li><span><a href="#Add-scores-to-data" data-toc-modified-id="Add-scores-to-data-5">Add scores to data</a></span></li></ul></div>

- git lfs install
- git clone https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [41]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
from transformers import pipeline

## Load data

In [42]:
path = r'/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/'
years = os.listdir(path)
years.remove('.DS_Store')
yr = [x for x in years if not (x.startswith('.'))]

year_paths= []
months=[]
file_paths= []

 

for i in range(len(years)): # enter year folder
    year_paths.append(path + years[i]) # year paths
    months.extend(os.listdir(year_paths[i])) # months of data in each year
months = [x for x in months if not (x.startswith('.'))]
for j in range(len(months)): # enter months in each year folder
    file_paths.append(year_paths[i] + "/" + months[j]) # file in year path

In [43]:
file_paths

['/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/2021/gdelt_mar2021.csv',
 '/Users/TyPainter1/Desktop/Masters/spring-2022/capstone/00-data/gdelt_data/2021/gdelt_jan2021.csv']

In [44]:
for path in file_paths:
    df = pd.read_csv(path, index_col=[0]) # read in csv

## Create website data

In [45]:
prnw_df = df[df.website=='prnewswire.com'] # filter for website
prnw_df 

Unnamed: 0,year,month,day,website,url
48,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/s...
57,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/c...
67,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/burj-...
68,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/conta...
70,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/p...
...,...,...,...,...,...
101418,2021,1,31,prnewswire.com,https://www.prnewswire.com:443/news-releases/l...
101424,2021,1,31,prnewswire.com,https://www.prnewswire.com:443/news-releases/i...
101434,2021,1,31,prnewswire.com,https://www.prnewswire.com/news-releases/edwar...
101475,2021,1,31,prnewswire.com,https://www.prnewswire.com:443/news-releases/e...


In [46]:
prnw_url = prnw_df.url.iloc[0:10]
prnw_url

48    https://www.prnewswire.com:443/news-releases/s...
57    https://www.prnewswire.com:443/news-releases/c...
67    https://www.prnewswire.com/news-releases/burj-...
68    https://www.prnewswire.com/news-releases/conta...
70    https://www.prnewswire.com:443/news-releases/p...
72    https://www.prnewswire.com/news-releases/share...
76    https://www.prnewswire.com/news-releases/kaske...
78    https://www.prnewswire.com:443/news-releases/b...
79    https://www.prnewswire.com/news-releases/chang...
82    https://www.prnewswire.com/news-releases/body-...
Name: url, dtype: object

## Web scrape news articles

In [1]:
for i in range(len(prnw_url)): #filter_urls
    URL = prnw_url.iloc[i] # filter_urls
    page = requests.get(URL,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    soup = BeautifulSoup(page.content, "html.parser")
    print(URL)
    try:
        master = soup.find(id="main")
        final = master.find("div", class_="col-sm-10 col-sm-offset-1")
    except AttributeError:
        pass
    try:
        text = final.text
        print(text)
    except:
        pass
    

NameError: name 'prnw_url' is not defined

In [48]:
sen_list = text.split(".")
sen_list

['\nVANCOUVER, BC, Dec',
 ' 31, 2020 /PRNewswire/ -\xa0Body and Mind Inc',
 ' (CSE: BAMM) (OTCQB: BMMJ) (the "Company" or "BaM") wishes to announce that its interim financial statements for the three month period ended October 31, 2020, including the related management discussion and analysis, and CEO and CFO certifications (collectively, the "October 31, 2020 Interim Financial Filings") were not filed as required under Canadian securities legislation by the required filing deadline of December 30, 2020',
 '\nThe Company has determined that it was not able to meet the December 30, 2020 filing deadline (the "Filing Deadline") under Canadian securities legislation for the October 31, 2020 Interim Financial Filings',
 ' Although the review process is progressing, the Company anticipates\xa0some delays in completing the review process, such that the Company is not able to meet the Filing Deadline for the October 31, 2020 Interim Financial Filings',
 ' The Company is taking proactive measur

## Generate sentiment scores

In [49]:
roberta = pipeline(task='sentiment-analysis', 
                   model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
                   return_all_scores=True)

In [50]:
def summary_stats(score):
    avg = np.mean(score)
    med = np.median(score)
    min = np.min(score)
    max = np.max(score)
    return(avg, med, min, max)

In [51]:
que = ""
char_len = 0
pos = []
neg = []
neu = []
for sen in sen_list:
    if char_len + len(sen) < 514:
        que = que + sen
        char_len = char_len + len(sen)
    else:
        output = roberta(que)
        neg.append(output[0][0]['score'])
        neu.append(output[0][1]['score'])
        pos.append(output[0][2]['score'])
        que = ""
        que = sen
        char_len = len(sen)

In [52]:
summary_stats(neg)

(0.09931060317057927,
 0.00014538399409502745,
 6.718449003528804e-05,
 0.9825238585472107)

## Add scores to data

In [55]:
prnw_df["pos_mean"] = np.nan
prnw_df["pos_median"] = np.nan
prnw_df["pos_min"] = np.nan
prnw_df["pos_max"] = np.nan

prnw_df["neg_mean"] = np.nan
prnw_df["neg_median"] = np.nan
prnw_df["neg_min"] = np.nan
prnw_df["neg_max"] = np.nan

prnw_df["neu_mean"] = np.nan
prnw_df["neu_median"] = np.nan
prnw_df["neu_min"] = np.nan
prnw_df["neu_max"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prnw_df["pos_mean"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prnw_df["pos_median"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prnw_df["pos_min"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [56]:
nf = "Content is currently unavailable"
for i in range(len(prnw_url)): #filter_urls
    URL = prnw_url.iloc[i] # filter_urls
    page = requests.get(URL,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    soup = BeautifulSoup(page.content, "html.parser")
    if nf in soup.prettify():
            continue
    try:
        master = soup.find(id="main")
        final = master.find("div", class_="col-sm-10 col-sm-offset-1")
    except AttributeError:
        pass
    try:
        text = final.text
    except:
        pass
    text = re.sub('<[^>]+>', '', str(final))
    sen_list = text.split(".")
    
    que = ""
    char_len = 0
    pos = []
    neg = []
    neu = []

    for sen in sen_list:
        if sen_list.index(sen) == len(sen_list)-1:
            output = roberta(que)
            neg.append(output[0][0]['score'])
            neu.append(output[0][1]['score'])
            pos.append(output[0][2]['score'])
            que = ""
            que = sen
            char_len = len(sen)
        elif char_len + len(sen) < 514:
            que = que + sen
            char_len = char_len + len(sen)
        else:
            output = roberta(que)
            neg.append(output[0][0]['score'])
            neu.append(output[0][1]['score'])
            pos.append(output[0][2]['score'])
            que = ""
            que = sen
            char_len = len(sen)
            
    prnw_df.pos_mean.iloc[i], prnw_df.pos_median.iloc[i], prnw_df.pos_min.iloc[i], prnw_df.pos_max.iloc[i] = summary_stats(pos)
    prnw_df.neg_mean.iloc[i], prnw_df.neg_median.iloc[i], prnw_df.neg_min.iloc[i], prnw_df.neg_max.iloc[i] = summary_stats(neg)
    prnw_df.neu_mean.iloc[i], prnw_df.neu_median.iloc[i], prnw_df.neu_min.iloc[i], prnw_df.neu_max.iloc[i] = summary_stats(neu)
    
prnw_df[0:10]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,year,month,day,website,url,pos_mean,pos_median,pos_min,pos_max,neg_mean,neg_median,neg_min,neg_max,neu_mean,neu_median,neu_min,neu_max
48,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/s...,0.001302,5.2e-05,4.5e-05,0.005061,0.000106,7.1e-05,6.4e-05,0.000216,0.998592,0.999877,0.994723,0.999891
57,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/c...,0.140978,0.000292,5.1e-05,0.999242,0.198036,7.3e-05,4.9e-05,0.996493,0.660986,0.999405,0.00057,0.999889
67,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/burj-...,0.000987,0.000987,0.000987,0.000987,0.00011,0.00011,0.00011,0.00011,0.998903,0.998903,0.998903,0.998903
68,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/conta...,0.228429,0.001488,5.2e-05,0.999484,0.052006,8.5e-05,5.5e-05,0.36304,0.719565,0.99845,0.000311,0.99989
70,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/p...,0.1664,8.5e-05,4.6e-05,0.999246,0.0292,6.7e-05,5.1e-05,0.78103,0.804401,0.999831,0.000162,0.999893
72,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/share...,0.001302,5.2e-05,4.5e-05,0.005061,0.000106,7.1e-05,6.4e-05,0.000216,0.998592,0.999877,0.994723,0.999891
76,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/kaske...,0.000579,5.4e-05,4.9e-05,0.002241,0.38473,0.000108,6.2e-05,0.998408,0.614691,0.999842,0.001091,0.999889
78,2021,1,1,prnewswire.com,https://www.prnewswire.com:443/news-releases/b...,0.000987,0.000987,0.000987,0.000987,0.00011,0.00011,0.00011,0.00011,0.998903,0.998903,0.998903,0.998903
79,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/chang...,0.140978,0.000292,5.1e-05,0.999242,0.198036,7.3e-05,4.9e-05,0.996493,0.660986,0.999405,0.00057,0.999889
82,2021,1,1,prnewswire.com,https://www.prnewswire.com/news-releases/body-...,0.117697,6.4e-05,3.5e-05,0.999469,0.093473,0.000135,6.7e-05,0.982524,0.78883,0.999801,0.000376,0.999892
