### Import some system libs

In [17]:
import os
import sys
import plotly.subplots as sp
import plotly.express as px
import pandas as pd

#### Load the dataset using the local .env 

In [3]:
sys.path.append('../scripts/')

from dotenv import load_dotenv
from load_data import load_csv
load_dotenv()
finance_data = os.getenv('FINANCIAL_NEWS')

finance_df = load_csv(finance_data)
if finance_df is not None:
    print(f"Data loaded successfully with {len(finance_df)} records.")

Data loaded successfully with 1407328 records.


In [4]:
print('Financial News Data:')
print(finance_df.head())
print(finance_df.columns)
print(finance_df.info())

Financial News Data:
   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-

### Descriptive Statistics on Data

In [5]:
finance_df['headline_length'] = finance_df['headline'].apply(len)
print('Headline Lengths:')
finance_df['headline_length'].describe()

Headline Lengths:


count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64

In [6]:
## show minimum headline length
print(finance_df[finance_df['headline_length'] == 3])


         Unnamed: 0 headline  \
530954       533833      SPY   
530955       533834      SRS   
602684       605830      J.P   
678958       682383      AMJ   
912819       917537      J.P   
984798       989747      PCP   
1047479     1052673      QID   
1179797     1185514      SPY   
1191387     1197147      SUN   
1191388     1197148      SUN   
1228292     1234142      TJX   
1358108     1364530      WNR   

                                                       url  \
530954   https://www.benzinga.com/bullbeartrades/2009/7...   
530955   https://www.benzinga.com/bullbeartrades/2009/7...   
602684   https://www.benzinga.com/analyst-ratings/analy...   
678958     https://www.benzinga.com/news/14/08/4749350/amj   
912819   https://www.benzinga.com/analyst-ratings/analy...   
984798                   https://www.benzinga.com/6396/pcp   
1047479                  https://www.benzinga.com/4745/qid   
1179797  https://www.benzinga.com/bullbeartrades/2009/7...   
1191387                  

In [9]:
#%pip install nbformat

fig = px.histogram(finance_df, x='headline_length', nbins=30, title='Distribution of Headline Lengths')
fig.show()

##### Let us count the number of articles per publisher which is to identify which publishers are most active

In [16]:
#The number of articles per publisher
publisher_count = finance_df['publisher'].value_counts()

fig = px.bar(publisher_count, x=publisher_count.index, y=publisher_count.values, height=500,
               labels={'x': 'Publisher', 'y': 'Number of Articles'},
               title='Number of Articles per Publisher')
fig.update_layout(xaxis={"categoryorder": "total descending"})
fig.show()

In [18]:
### publication dates to see trends over time
finance_df['date'] = pd.to_datetime(finance_df['date'], format='ISO8601', utc=True)
finance_df['date'] = finance_df['date'].dt.date
finance_df['date'].value_counts().sort_index()

date
2009-02-14      1
2009-04-27      2
2009-04-29      1
2009-05-22      1
2009-05-27      6
             ... 
2020-06-07     25
2020-06-08    765
2020-06-09    803
2020-06-10    807
2020-06-11    544
Name: count, Length: 3955, dtype: int64

In [20]:
finance_df.describe

<bound method NDFrame.describe of          Unnamed: 0                                           headline  \
0                 0            Stocks That Hit 52-Week Highs On Friday   
1                 1         Stocks That Hit 52-Week Highs On Wednesday   
2                 2                      71 Biggest Movers From Friday   
3                 3       46 Stocks Moving In Friday's Mid-Day Session   
4                 4  B of A Securities Maintains Neutral on Agilent...   
...             ...                                                ...   
1407323     1413844             Top Narrow Based Indexes For August 29   
1407324     1413845  Recap: Wednesday's Top Percentage Gainers and ...   
1407325     1413846  UPDATE: Oppenheimer Color on China Zenix Auto ...   
1407326     1413847  Oppenheimer Initiates China Zenix At Outperfor...   
1407327     1413848  China Zenix Auto International Opens For Tradi...   

                                                       url          publisher