### Import some system libs

In [None]:
import os
import sys
import plotly.subplots as sp
import plotly.express as px
import pandas as pd
from textblob import TextBlob
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import swifter

#### Load the dataset using the local .env 

In [None]:
# Make the helper modules under ../scripts importable from this notebook
sys.path.append('../scripts/')

from dotenv import load_dotenv
from load_data import load_csv

# Dataset locations are read from environment variables defined in the local .env file
load_dotenv()
finance_data = os.getenv('FINANCIAL_NEWS')
apple_data = os.getenv('APPLE_STOCK')
amazon_data = os.getenv('AMAZON_STOCK')
google_data = os.getenv('GOOGLE_STOCK')
meta_data = os.getenv('META_STOCK')
microsoft_data = os.getenv('MICROSOFT_STOCK')
nvdia_data = os.getenv('NVIDIA_STOCK')
tesla_data = os.getenv('TESLA_STOCK')

finance_df = load_csv(finance_data)
apple_data_df = load_csv(apple_data)
amazon_data_df = load_csv(amazon_data)
google_data_df = load_csv(google_data)
meta_data_df = load_csv(meta_data)
microsoft_data_df = load_csv(microsoft_data)
nvdia_data_df = load_csv(nvdia_data)
tesla_data_df = load_csv(tesla_data)

# (label inserted before "records", loaded frame); the news dataset has no label
loaded_datasets = [
    ('', finance_df),
    (' Apple', apple_data_df),
    (' Amazon', amazon_data_df),
    (' Google', google_data_df),
    (' Meta', meta_data_df),
    (' Microsoft', microsoft_data_df),
    (' NVDIA', nvdia_data_df),
    (' TESLA', tesla_data_df),
]

# NOTE(review): presumably load_csv returns None when a file cannot be read — confirm
# against scripts/load_data.py.
missing_datasets = [
    dataset_label.strip() or 'Financial news'
    for dataset_label, dataset_frame in loaded_datasets
    if dataset_frame is None
]

if not missing_datasets:
    for dataset_label, dataset_frame in loaded_datasets:
        print(f"Data loaded successfully with {len(dataset_frame)}{dataset_label} records.")
else:
    # Previously a failed load produced no output at all, hiding the problem
    # until a later cell crashed on a None frame.
    print(f"Failed to load: {', '.join(missing_datasets)}. Check the paths in the .env file.")

In [None]:
# Quick look at the news dataset: first rows, column names and dtypes/memory
print('Financial News Data:')
print(finance_df.head())
print(finance_df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
finance_df.info()

### Descriptive Statistics on Data

In [None]:
# Character count of every headline
headline_lengths = finance_df['headline'].map(len)

In [None]:
# Length buckets; with right=False each bucket is left-closed: [0, 100), [100, 200), [200, inf)
bins = [0, 100, 200, float('inf')]
labels = ['0-100', '100-200', '>200']
headline_bins = pd.cut(headline_lengths, bins=bins, labels=labels, right=False)

# Number of headlines in each bucket, kept in bucket order
bin_counts = headline_bins.value_counts().sort_index()

# Bar chart of the bucket sizes
length_fig, length_ax = plt.subplots(figsize=(8, 5))
bars = length_ax.bar(bin_counts.index, bin_counts.values, color='skyblue', edgecolor='black')

# Write the exact count just above every bar
for rect in bars:
    bar_top = rect.get_height()
    bar_center = rect.get_x() + rect.get_width() / 2
    length_ax.text(
        bar_center,
        bar_top,
        f'{int(bar_top)}',
        ha='center',
        va='bottom',
        fontsize=11,
        fontweight='bold'
    )

length_ax.set_title('Distribution of Headline Lengths')
length_ax.set_xlabel('Headline Length (Characters)')
length_ax.set_ylabel('Number of Headlines')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

Most headlines are between 0 and 100 characters long (**1,180,836**); the second-largest group is 100–200 characters (**191,690**), and only a few exceed 200 characters (**34,802**).

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the headline lengths
headline_lengths.describe()

In [None]:
# Store each headline's character count in a new 'headline_length' column
finance_df['headline_length'] = finance_df['headline'].map(len)


In [None]:
# Display the per-headline character counts computed above
finance_df['headline_length']

In [None]:
# Static histogram of headline lengths using 50 bins on a 10x5 inch figure
finance_df['headline_length'].hist(bins=50, figsize=(10, 5))

In [None]:
# Summary statistics of the headline lengths, rounded to two decimals
finance_df['headline_length'].describe().round(2)

In [None]:
# Interactive histogram of headline lengths
fig = px.histogram(
    finance_df,
    x='headline_length',
    nbins=30,
    title='Distribution of Headline Lengths',
)
fig.show()

##### Let us count the number of articles published per date to find the dates on which the most articles were published

In [None]:
# Count articles per calendar day.
# The 'date' column holds timestamps (it is parsed as ISO8601 datetimes and its hour
# is used later in this notebook), so counting the raw values would group by exact
# publication time rather than by day. Normalise to the UTC calendar date first.
publication_days = pd.to_datetime(finance_df['date'], format='ISO8601', utc=True).dt.date
publisher_counts_per_date = publication_days.value_counts()

# Chronological order for the line chart
publisher_counts_per_date = publisher_counts_per_date.sort_index()
plt.figure(figsize=(12, 6))
publisher_counts_per_date.plot(kind='line', marker='o')
plt.title("Publisher Counts Per Date")
plt.xlabel("Date")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

##### Let us count the number of articles per publisher to identify which publishers are most active

In [None]:
# Articles per publisher, most active first
publisher_count = finance_df['publisher'].value_counts()

# Interactive bar chart; the tall figure leaves room for the many publishers
axis_labels = {'x': 'Publisher', 'y': 'Number of Articles'}
fig = px.bar(
    publisher_count,
    x=publisher_count.index,
    y=publisher_count.values,
    height=2000,
    labels=axis_labels,
    title='Number of Articles per Publisher',
)
fig.update_layout(xaxis={"categoryorder": "total descending"})
fig.show()

In [None]:
# Parse the publication timestamps as timezone-aware UTC datetimes
parsed_dates = pd.to_datetime(finance_df['date'], format='ISO8601', utc=True)
finance_df['date'] = parsed_dates

# Keep the calendar day in its own column so trends can be counted per day
finance_df['date_x'] = parsed_dates.dt.date
finance_df['date_x'].value_counts().sort_index()

In [None]:
# First few calendar-day values derived from the parsed timestamps
finance_df['date_x'].head()

In [None]:
# Check the parsed timestamps and confirm the column's dtype
date_column = finance_df['date']
print(date_column.head())
print(date_column.dtype)

In [None]:
# Summary statistics of the news data. describe must be *called*; the bare
# attribute only displays the bound method instead of the statistics.
finance_df.describe()

In [None]:
# Display the raw headline text column
finance_df['headline']

In [None]:
# All news rows tagged with the AAPL ticker
finance_df.loc[finance_df['stock'] == 'AAPL']

#### Text Analysis (Sentiment analysis and Topic modeling)

In [None]:
from sentimental_analysis import get_sentiment_word

In [None]:
# Preview the first rows of every stock price dataset, each under its own heading
stock_previews = (
    ("Apple Stock data", apple_data_df),
    ("Amazon Stock data", amazon_data_df),
    ("Google Stock data", google_data_df),
    ("Meta Stock data", meta_data_df),
    ("Microsoft Stock data", microsoft_data_df),
    ("Nvdia Stock data", nvdia_data_df),
    ("Tesla Stock data", tesla_data_df),
)
for preview_heading, stock_frame in stock_previews:
    print(preview_heading)
    print(stock_frame.head())

## Cleaning the text

#### Filter the data frame to only the main stocks: Apple, Microsoft, Google, Amazon, Tesla, Meta, and Nvidia

In [None]:
# Ticker symbols, as they appear in the news data's 'stock' column, to keep for analysis.
# NOTE(review): 'MSF' is not Microsoft's exchange ticker (MSFT) and 'FB' is Meta's former
# ticker; presumably these match the symbols used in the news dataset — confirm.
target_stocks = [
    'AAPL',
    'MSF',
    'GOOG',
    'AMZN',
    'TSLA',
    'FB',
    'NVDA',
]

In [None]:
# Keep only news for the target tickers; copy so later column edits do not touch finance_df
is_target_stock = finance_df['stock'].isin(target_stocks)
filtered_df = finance_df.loc[is_target_stock].copy()

### Cleaning the Text

In [None]:
import re

# Ordered clean-up steps applied to every lower-cased headline: (pattern, replacement).
# All patterns are raw strings: the previous '\[.*?\]' literal was an invalid escape
# sequence, and the dot in "www." is now escaped so it only matches a literal dot.
_HEADLINE_CLEANING_STEPS = [
    (re.compile(r"@[A-Za-z0-9_]+"), ""),   # mentions
    (re.compile(r"#[A-Za-z0-9_]+"), ""),   # hashtags
    (re.compile(r"http\S+"), ""),          # links starting with http/https
    (re.compile(r"www\.\S+"), ""),         # links starting with www.
    (re.compile(r"[()!?]"), " "),          # parentheses, exclamation and question marks
    (re.compile(r"\[.*?\]"), " "),         # bracketed segments, brackets included
    (re.compile(r"[^a-z]"), " "),          # anything that is not a lower-case letter
]

# Vectorised string operations on the column replace eight row-wise
# DataFrame.apply(axis=1) passes, which are slow on large frames.
cleaned_headlines = filtered_df['headline'].str.lower()
for cleaning_pattern, replacement in _HEADLINE_CLEANING_STEPS:
    cleaned_headlines = cleaned_headlines.str.replace(cleaning_pattern, replacement, regex=True)
filtered_df['headline'] = cleaned_headlines

filtered_df[['headline']].head()

In [None]:

# Score every cleaned headline with the project's sentiment helper
filtered_df['Sentiment'] = filtered_df['headline'].apply(get_sentiment_word)
filtered_df.head()

In [None]:
# Scored headlines for the AAPL ticker only
filtered_df.loc[filtered_df["stock"] == "AAPL"]

Now we need to create positive, negative, and neutral labels for the polarity of the sentiment

In [None]:
def _polarity_label(score):
    """Map a numeric sentiment score to 'Positive', 'Negative' or 'Neutral'."""
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


# Label every headline by the sign of its sentiment score
filtered_df['sentiment_score_word'] = filtered_df['Sentiment'].apply(_polarity_label)

In [None]:
# Preview the filtered data, now including the sentiment score and its label
filtered_df.head()

In [None]:
# Count how many headlines carry each sentiment label
sentiment_labels = filtered_df['sentiment_score_word']
negative_count = sentiment_labels.eq('Negative').sum()
positive_count = sentiment_labels.eq('Positive').sum()
neutral_count = sentiment_labels.eq('Neutral').sum()
print(f"Negative: {negative_count}, Positive: {positive_count}, Neutral: {neutral_count}")

In [None]:
# Run the project's article-level sentiment helper on the filtered headlines.
# NOTE(review): articles_sentiment_analysis is defined in the project's
# sentimental_analysis module (not shown here) — confirm what it returns or plots.
from sentimental_analysis import articles_sentiment_analysis
articles_sentiment_analysis(filtered_df)

In [None]:
# Split the sentiment scores by sign
sentiment_scores = filtered_df['Sentiment']
positive_count = sentiment_scores.gt(0).sum()
negative_count = sentiment_scores.lt(0).sum()
zero_count = sentiment_scores.eq(0).sum()

# Report the three totals
print("Positive Count:", positive_count)
print("Negative Count:", negative_count)
print("Neutral Count:", zero_count)

# One pie slice per sentiment class, in matching order
labels = ['Positive', 'Negative' , 'Neutral']
sizes = [positive_count, negative_count, zero_count]
colors = ['g', 'r', 'y' ]

# Pie chart with percentage annotations; equal axes keep it circular
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
pie_ax.axis('equal')
pie_ax.set_title('Distribution of Positive and Negative')
plt.show()

In [None]:
# Average sentiment score for each distinct 'date' value, drawn as a line chart
mean_sentiment_by_date = filtered_df.groupby('date')['Sentiment'].mean()
mean_sentiment_by_date.plot(figsize=(20, 16))

##### Number of Published articles wrt sentiment categories

In [None]:
# Tally the headlines in each sentiment category
category_labels = filtered_df['sentiment_score_word']
sentiment_Categories = category_labels.value_counts()
print(sentiment_Categories)

##### Categorize Sentiment by percentage

In [None]:
# Shape of the sentiment label column; for a Series this is a one-element tuple (row_count,)
number_Of_Sentiment=filtered_df['sentiment_score_word'].shape

# Display the shape tuple
number_Of_Sentiment

In [None]:
# Share of each sentiment category as a percentage of all headlines.
# Series.shape is a one-element tuple, so divide by its first entry (the scalar
# row count) instead of relying on broadcasting a Series against the tuple.
number_Of_Sentiment = filtered_df['sentiment_score_word'].shape
total_headlines = number_Of_Sentiment[0]
percentage_Of_Categories = np.round((sentiment_Categories / total_headlines) * 100, 2)
percentage_Of_Categories.head()

## Time Series Analysis

In [None]:
# Preview the filtered data before the time-series analysis.
# NOTE(review): a re-parse of 'date' with pd.to_datetime was disabled here; presumably
# the column is already datetime from the conversion applied to finance_df earlier — confirm.
filtered_df.head(5)

#### Publication Frequency Over Time

In [None]:
# Derive each article's calendar day, then count the articles published per day
publication_days = filtered_df['date'].dt.date
filtered_df['publication_date'] = publication_days
daily_counts = filtered_df.groupby('publication_date').size()

##### Plot daily publication frequency

In [None]:

# Line chart of the number of articles published on each day
plt.figure(figsize=(10, 6))
frequency_ax = daily_counts.plot(kind='line', marker='o', color='red')
frequency_ax.set_title('Publication Frequency Over Time')
frequency_ax.set_xlabel('Date')
frequency_ax.set_ylabel('Number of Articles')
frequency_ax.grid(True)
plt.show()

##### Number of Publication per hour

In [None]:
# Hour component of each article's publication timestamp
filtered_df['date'].dt.hour

In [None]:
# --- Publishing times analysis ---
# Hour of day at which each article was published
filtered_df['publication_hour'] = filtered_df['date'].dt.hour

# Articles per hour in chronological order; the index is left unnamed and the
# Series itself is called 'count'
hourly_counts = (
    filtered_df['publication_hour']
    .value_counts()
    .sort_index()
    .rename_axis(None)
    .rename('count')
)

# Bar chart of publications per hour; the y-axis is logarithmic
plt.figure(figsize=(10, 6))
hourly_ax = hourly_counts.plot(kind='bar', color='skyblue')
hourly_ax.set_yscale('log')
hourly_ax.set_title('Publication Frequency by Hour (Log Scale)')
hourly_ax.set_xlabel('Hour of the Day')
hourly_ax.set_ylabel('Number of Articles (log scale)')
plt.xticks(rotation=0)
hourly_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()



### Analysis on Publisher

In [None]:
# Articles per publisher, ordered from most to least active
publisher_counts = filtered_df['publisher'].value_counts()
most_active_publishers = publisher_counts.head(10)
least_active_publishers = publisher_counts.tail(10)

print("Top 10 Publishers by Article Count:")
print(most_active_publishers)
print("\nLast 10 Publishers by Article Count:")
print(least_active_publishers)

### Sentiment analysis for publisher

In [None]:
from sentimental_analysis import get_sentiment_analysis_publisher

In [None]:
# Sentiment analysis restricted to the publisher 'Benzinga Newsdesk'.
# NOTE(review): get_sentiment_analysis_publisher comes from the project's
# sentimental_analysis module (not shown here) — confirm its output.
get_sentiment_analysis_publisher(filtered_df,'Benzinga Newsdesk')

In [None]:
# --- Publishers' contribution ---
# Number of filtered articles contributed by each publisher
publisher_column = filtered_df['publisher']
publisher_counts = publisher_column.value_counts()

In [None]:
# Plot publisher contribution
plt.figure(figsize=(10, 6))
sns.barplot(x=publisher_counts.values, y=publisher_counts.index, palette="magma")
plt.title("Publisher Contribution to the News Feed")
plt.xlabel("Number of Articles")
plt.ylabel("Publisher")
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# --- Extract domains from publishers given as email addresses ---
def _email_domain(publisher):
    """Return the text between the first and second '@' of *publisher*, or None if it has no '@'."""
    if '@' not in publisher:
        return None
    return publisher.split('@')[1]


# Flag publishers whose name contains an '@'
filtered_df['is_email'] = filtered_df['publisher'].str.contains('@')

# Domain part for email-style publishers, None for everyone else
filtered_df['domain'] = filtered_df['publisher'].apply(_email_domain)

# Articles per email domain
domain_counts = filtered_df['domain'].value_counts()

In [None]:
# Show how many articles came from each email domain
print(domain_counts)

In [None]:
# Persist the filtered, sentiment-enriched news data without the index column.
# to_csv does not create missing folders, so make sure the target directory
# exists first instead of failing with an OSError.
output_path = "../data/processed/filtered__news_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
filtered_df.to_csv(output_path, index=False)