In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the headline dataset; the path is resolved relative to the notebook's working directory.
data = pd.read_csv("descriptive-statistics.csv")

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline-length,year,month,day,day_of_week,time
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,2020,6,5,Friday,10:30:54
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,2020,6,3,Wednesday,10:45:20
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,2020,5,26,Tuesday,04:30:07
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,2020,5,22,Friday,12:45:06
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,2020,5,22,Friday,11:38:59


In [15]:
# Count the number of articles per publisher.
# value_counts() returns the counts sorted from most to least frequent.
publishers = data['publisher'].value_counts()

### Counting the number of articles per publisher allows us to understand the influence and reach of various news sources within our dataset. This analysis helps identify which publishers are contributing the most content, offering insights into potential biases or dominant perspectives. By examining the distribution of articles, we can also ensure a balanced representation when analyzing the relationship between news sentiment and stock market trends.

In [17]:
# Display the 10 publishers with the most articles
# (publishers is already sorted by count, highest first).
publishers.head(10)

publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64

In [24]:
# Ordered (keywords, label) rules. The first rule with any keyword occurring in
# the lower-cased headline wins, so the order of the rules is part of the
# behaviour. Two former rules ("Trading Activity and Performance Estimates" and
# "Financial Metrics and Investment Initiatives") were removed because every
# one of their keywords is already claimed by an earlier rule, which made them
# unreachable.
_NEWS_TYPE_RULES = (
    (("earnings", "session", "day"),
     "Earnings Reports and Market Sessions"),
    (("update", "mid morning", "announce", "shares"),
     "Market Updates and Share Announcements"),
    (("share", "trade", "company", "estimate", "lower", "beat", "report"),
     "Company Financial Performance"),
    (("stock target", "price target", "buy", "raises", "auto", "morgan"),
     "Stock Price Targets and Analyst Ratings"),
    (("eps", "reports", "pt", "sales", "adj", "initiates buy"),
     "Earnings per Share (EPS) and Financial Reports"),
    (("update", "morning", "alcoa", "mid", "shares", "market"),
     "Morning Market Updates and Announcements"),
    (("stock", "price", "week", "buy", "raises"),
     "Stock Price Movements and Investment Advice"),
)


def categorize_news_type(headline):
    """Assign a headline to the first keyword category it matches.

    Matching is case-insensitive and substring-based, not word-based: the
    keyword "day" also matches "Friday", and "pt" matches any word that
    contains those two letters.

    Parameters
    ----------
    headline : str
        Headline text. Any non-string value (for example NaN or None coming
        from a missing cell) is treated as unclassifiable.

    Returns
    -------
    str
        The label of the first matching rule in ``_NEWS_TYPE_RULES``, or
        ``"Other"`` when no rule matches or the input is not a string.
    """
    # Missing headlines arrive as float NaN / None and have no .lower().
    if not isinstance(headline, str):
        return "Other"

    text = headline.lower()
    for keywords, label in _NEWS_TYPE_RULES:
        if any(keyword in text for keyword in keywords):
            return label
    return "Other"

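### The categorize_news_type function classifies each headline into one of several keyword-based categories, such as "Earnings Reports and Market Sessions," "Company Financial Performance," "Stock Price Targets and Analyst Ratings," or "Other" when no keyword matches. The keyword rules are checked in order, and the first rule that matches determines the category. This categorization allows for more targeted analysis of news sentiment and its potential impact on stock market trends, enabling better insights into how different types of news events influence market behavior.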

In [38]:
# Label every headline with its news category and store the result in a new 'news_type' column.
data["news_type"] = data["headline"].map(categorize_news_type)

### The above line of code applies the categorize_news_type function to each headline in the dataset, creating a new column, news_type, that categorizes each news article. This enhances the dataset by adding a structured classification of news types, facilitating more detailed and segmented analysis of how different categories of news influence stock market trends.

In [39]:
# Inspect the frame after the 'news_type' column has been added.
# NOTE(review): the saved output already lists a 'publisher_domain' column, which is only
# created by a later cell, so this output appears to come from an out-of-order run;
# confirm with Restart Kernel -> Run All.
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length,year,month,day,day_of_week,time,news_type,publisher_domain
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,2020,6,5,Friday,10:30:54,Earnings Reports and Market Sessions,
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,2020,6,3,Wednesday,10:45:20,Earnings Reports and Market Sessions,
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,2020,5,26,Tuesday,04:30:07,Earnings Reports and Market Sessions,
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,2020,5,22,Friday,12:45:06,Earnings Reports and Market Sessions,
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,2020,5,22,Friday,11:38:59,Stock Price Targets and Analyst Ratings,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7,2011,8,29,Monday,00:00:00,Other,
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7,2011,6,22,Wednesday,00:00:00,Earnings Reports and Market Sessions,
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8,2011,6,21,Tuesday,00:00:00,Market Updates and Share Announcements,
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8,2011,6,21,Tuesday,00:00:00,Earnings per Share (EPS) and Financial Reports,


In [40]:
# Tally articles for every (publisher, news_type) pair, then pivot the news types
# into columns; combinations that never occur are filled with 0.
news_type_by_publisher = (
    data
    .groupby(["publisher", "news_type"])
    .size()
    .unstack(fill_value=0)
)

### The above line of code groups the dataset by publisher and news type, then counts the number of articles in each category for every publisher. This creates a structured overview that reveals which publishers focus on specific types of news, providing insights into their editorial priorities and potential biases. This information can be valuable for understanding how different types of news coverage by various publishers impact market sentiment and trends.

In [41]:
# Total number of articles per news type, summed across all publishers
news_type_by_publisher.sum()

news_type
Company Financial Performance                     222955
Earnings Reports and Market Sessions              270604
Earnings per Share (EPS) and Financial Reports     73919
Market Updates and Share Announcements            223773
Morning Market Updates and Announcements           60424
Other                                             380788
Stock Price Movements and Investment Advice        82753
Stock Price Targets and Analyst Ratings            92112
dtype: int64

In [42]:
def extract_domain(email):
    """Return the domain part of an e-mail-like string.

    Parameters
    ----------
    email : str
        Text that may contain an e-mail address. Any non-string value (for
        example NaN or None from a missing cell) is treated as having no
        domain.

    Returns
    -------
    str or None
        The run of letters, digits, dots and hyphens that follows the first
        "@" that is followed by at least one such character, or ``None`` when
        there is no such "@" or the input is not a string.
    """
    # re.search raises TypeError on non-string input such as NaN.
    if not isinstance(email, str):
        return None
    match = re.search(r"@([a-zA-Z0-9.-]+)", email)
    return match.group(1) if match else None


### The extract_domain function identifies the domain of an email address, which can be useful for determining the publisher associated with a specific email. By extracting the domain (e.g., example.com), we can categorize and analyze articles or news submissions by their source publishers. Publisher values that are not email addresses (for example, plain author names) have no domain, and the function returns None for them. This helps in understanding the distribution of news content among different publishers and in identifying any patterns or biases related to specific publishers.

In [43]:
# Derive the e-mail domain of each publisher; publishers without a matching "@" get None.
data["publisher_domain"] = data["publisher"].map(extract_domain)


### The above line of code applies the extract_domain function to the 'publisher' column, extracting the domain from each publisher value that is an email address; publishers listed by name only get no domain. This creates a new column, publisher_domain, that represents the specific domain of each publisher's email. By doing this, we can easily group, filter, and analyze data based on publisher domains, which helps in identifying trends, patterns, and potential biases associated with specific publishers.

In [44]:
# Count the number of articles per domain.
# value_counts() skips missing values by default, so publishers without a domain are not counted.
top_domains = data['publisher_domain'].value_counts()


In [45]:
# Show up to the 100 most frequent domains (fewer are printed when fewer exist).
top_domains.head(100)

publisher_domain
benzinga.com              7937
gmail.com                  139
andyswan.com                 5
investdiva.com               2
tothetick.com                2
eosdetroit.io                1
forextraininggroup.com       1
stockmetrix.net              1
Name: count, dtype: int64

In [47]:
# Inspect the frame with the 'publisher_domain' column; it is empty for publishers
# whose value contains no "@" (e.g. plain author names).
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length,year,month,day,day_of_week,time,news_type,publisher_domain
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,2020,6,5,Friday,10:30:54,Earnings Reports and Market Sessions,
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,2020,6,3,Wednesday,10:45:20,Earnings Reports and Market Sessions,
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,2020,5,26,Tuesday,04:30:07,Earnings Reports and Market Sessions,
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,2020,5,22,Friday,12:45:06,Earnings Reports and Market Sessions,
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,2020,5,22,Friday,11:38:59,Stock Price Targets and Analyst Ratings,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7,2011,8,29,Monday,00:00:00,Other,
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7,2011,6,22,Wednesday,00:00:00,Earnings Reports and Market Sessions,
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8,2011,6,21,Tuesday,00:00:00,Market Updates and Share Announcements,
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8,2011,6,21,Tuesday,00:00:00,Earnings per Share (EPS) and Financial Reports,


In [50]:
# Drop the date-part columns and publisher_domain from the frame.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted.
# errors="ignore" keeps this cell re-runnable: running it a second time would
# otherwise raise KeyError because the columns are already gone.
data = data.drop(
    columns=["year", "month", "day", "day_of_week", "time", "publisher_domain"],
    errors="ignore",
)

In [52]:
# Confirm which columns remain after the drop.
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length,news_type
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,Earnings Reports and Market Sessions
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,Earnings Reports and Market Sessions
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,Earnings Reports and Market Sessions
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,Earnings Reports and Market Sessions
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,Stock Price Targets and Analyst Ratings
...,...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7,Other
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7,Earnings Reports and Market Sessions
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8,Market Updates and Share Announcements
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8,Earnings per Share (EPS) and Financial Reports


In [53]:
# Persist the categorised dataset to CSV.
# index=False: otherwise the row index is written as an extra unnamed column, which
# comes back as "Unnamed: 0" / "Unnamed: 0.1" when the file is read again.
data.to_csv("publisher-analyzed.csv", index=False)