# Scraping Headlines Related to Apple

In [2]:
import requests
import pandas as pd
from textblob import TextBlob

import re
from unidecode import unidecode
from bs4 import BeautifulSoup

# Function to fetch stock-related news articles using News API:
def fetch_stock_news(ticker, lang, start_date=None, end_date=None):
    api_key = '#####'  # Replace with your actual News API key
    news_url = f'https://newsapi.org/v2/everything?q={ticker}&apiKey={api_key}&language={lang}'

    
    response = requests.get(news_url)
    news_data = response.json()

    if 'articles' in news_data:
        articles = news_data['articles']
        if start_date and end_date:
            start_date = pd.to_datetime(start_date).date()
            end_date = pd.to_datetime(end_date).date()
            articles = [article for article in articles if 'publishedAt' in article and pd.to_datetime(article['publishedAt']).date() >= start_date and pd.to_datetime(article['publishedAt']).date() <= end_date]

        return articles
    else:
        return []

# Fetch stock-related news articles between two specified dates
ticker = input("Enter Stock Ticker (e.g., AAPL for Apple): ")
start_date = input("Enter Start Date (YYYY-MM-DD): ")
end_date = input("Enter End Date (YYYY-MM-DD): ")

news_articles = fetch_stock_news(ticker,'en', start_date, end_date)
# Create a DataFrame to store the data
df = pd.DataFrame(columns=['Date', 'Time'])
dates, times, headlines = [], [], []

open_market_start = pd.to_datetime("9:30:00").time()
open_market_end = pd.to_datetime("16:00:00").time()

# Function to adjust headlines published after 4:00 pm to the following day at 9:31 am
def adjust_headline_time(date_published, time_published):
    # If the time is after 4:00 pm, increment the date and set the time to 9:31 am
    if time_published > open_market_end:
        date_published = date_published + pd.Timedelta(days=1)
        time_published = pd.to_datetime("9:31:00").time()
    if time_published < open_market_start:
        time_published = pd.to_datetime("9:31:00").time()
    return date_published, time_published



for article in reversed(news_articles):
    if 'publishedAt' in article and 'content' in article:
        date_published = pd.to_datetime(article['publishedAt']).date()
        time_published = pd.to_datetime(article['publishedAt']).time()

        # Adjust the headline time if published after 4:00 pm
        date_published, time_published = adjust_headline_time(date_published, time_published)

        # # Check if the time falls within the open stock market hours
        headline = article['title']

        # Append the data to the DataFrame
        dates.append(date_published)
        times.append(time_published)
        headlines.append(headline)

# Assign the lists to DataFrame columns
df['Date'] = dates
# First, let's convert the dates in both formats to datetime objects
df['Date'] = pd.to_datetime(df['Date'])
# Now, let's convert them back to the desired format
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
df['Time'] = times
df['Headline'] = headlines

# Sort the DataFrame based on 'Date' and 'Time'
df.sort_values(by=['Date', 'Time'], inplace=True)

# Reset the index
df.reset_index(drop=True, inplace=True)

# Cleaning the text data
def clean_text(text):
    
    # Clean text by removing special characters and patterns
    cleaned_text = re.sub(r'\r\n', ' ', text)
    cleaned_text = re.sub(r'\[([^\]]+)\]', '', cleaned_text)
    cleaned_text = re.sub(r'Â', '', cleaned_text)
    cleaned_text = re.sub(r'â€™', "'", cleaned_text)  # Example: replace 'â€™' with "'"
    cleaned_text = re.sub(r'â€œ', '"', cleaned_text)  # Example: replace 'â€œ' with '"'
    cleaned_text = re.sub(r'â€', '"', cleaned_text)
    cleaned_text = unidecode(cleaned_text)
    soup = BeautifulSoup(cleaned_text, 'html.parser')
    cleaned_text = soup.get_text(separator=' ')
    
    return cleaned_text.strip()



df['Cleaned_Headline'] = df['Headline'].apply(lambda x: pd.Series(clean_text(x)))

df.to_csv("feb_march_Headlines_English_Apple_Market_Time_6.csv")
df

  soup = BeautifulSoup(cleaned_text, 'html.parser')


Unnamed: 0,Date,Time,Headline,Cleaned_Headline
0,01/04/2024,09:31:00,Tyonit、MacBook Air/Proをスライドインするだけでディスプレイや周辺機器へ...,"Tyonit, MacBook Air/Prowosuraidoinsurudakedede..."
1,01/04/2024,11:30:31,"Up 50% Over The Last 12 Months, Is Hyatt Stock...","Up 50% Over The Last 12 Months, Is Hyatt Stock..."
2,01/04/2024,14:50:31,"Apple Sales, Earnings Seen Declining In 2024 A...","Apple Sales, Earnings Seen Declining In 2024 A..."
3,01/04/2024,15:55:19,Loop Capital cuts AAPL price target to $170 ov...,Loop Capital cuts AAPL price target to $170 ov...
4,02/04/2024,10:00:12,Will Homebuilders Like DR Horton & Pulte Group...,Will Homebuilders Like DR Horton & Pulte Group...
...,...,...,...,...
95,30/03/2024,09:31:00,Amazonの新生活SALE(FINAL)で、BuffaloやWestern Digital...,"AmazonnoXin Sheng Huo SALE(FINAL)de, Buffaloya..."
96,31/03/2024,09:31:00,Amazonの新生活SALE(FINAL)で、Twelve SouthやSatechiのMa...,"AmazonnoXin Sheng Huo SALE(FINAL)de, Twelve So..."
97,31/03/2024,09:31:00,DisplayLinkチップや3.5インチIPSスクリーンを備えたMacintosh 128...,DisplayLinkchitsupuya3.5inchiIPSsukuri-nwoBei ...
98,31/03/2024,09:31:00,Do Dominant Megacap Companies Spell Doom For S...,Do Dominant Megacap Companies Spell Doom For S...


### Further Cleaning

In [3]:
# Filter the DataFrame to include only rows with Cleaned Headline containing 'Apple' or 'AAPL'
filtered_df = df[df['Cleaned_Headline'].str.contains(r'\bApple\b|\bAAPL\b', case=False)]

# Reset the index
filtered_df.reset_index(drop=True, inplace=True)

# Print the filtered DataFrame

filtered_df.to_csv("feb_march_Headlines_English_Apple_Market_Time_6.csv")
filtered_df

Unnamed: 0,Date,Time,Headline,Cleaned_Headline
0,01/04/2024,14:50:31,"Apple Sales, Earnings Seen Declining In 2024 A...","Apple Sales, Earnings Seen Declining In 2024 A..."
1,01/04/2024,15:55:19,Loop Capital cuts AAPL price target to $170 ov...,Loop Capital cuts AAPL price target to $170 ov...
2,02/04/2024,12:32:15,Should Investors Buy Apple (AAPL) in April?,Should Investors Buy Apple (AAPL) in April?
3,03/04/2024,09:31:00,Apple、開発者向けに「macOS 14.5 Sonoma」や「iOS/iPadOS 17...,"Apple, Kai Fa Zhe Xiang keni[macOS 14.5 Sonoma..."
4,03/04/2024,09:31:00,Apple shareholders look for positive signs aft...,Apple shareholders look for positive signs aft...
5,04/04/2024,09:31:00,Senior Vice President Deirdre O'Brien Sells 54...,Senior Vice President Deirdre O'Brien Sells 54...
6,04/04/2024,09:31:00,"Apple Inc (AAPL) CEO Timothy Cook Sells 196,41...","Apple Inc (AAPL) CEO Timothy Cook Sells 196,41..."
7,04/04/2024,11:44:53,Not UK Or Japan: India To Emerge As Apple's Th...,Not UK Or Japan: India To Emerge As Apple's Th...
8,05/04/2024,09:31:00,"Apple CEO Cook sells 196,410 AAPL shares at $1...","Apple CEO Cook sells 196,410 AAPL shares at $1..."
9,05/04/2024,09:31:00,Apple to Report Earnings on May 2 Following Vi...,Apple to Report Earnings on May 2 Following Vi...


In [5]:
df1 = pd.read_csv("feb_march_Headlines_English_Apple_Market_Time_Final_5.csv").drop(["Unnamed: 0"], axis=1)
df2 = filtered_df.drop(["Headline"], axis=1)

In [8]:
df1.head(10)

Unnamed: 0,Date,Time,Cleaned_Headline
0,19/02/2024,09:31:00,There Are 2.2 Billion Reasons to Buy Apple Sto...
1,19/02/2024,09:31:00,EU regulators reportedly will fine Apple more ...
2,21/02/2024,11:36:00,"Apple, Kai Fa Zhe Xiang keni[macOS 14.4 Sonoma..."
3,21/02/2024,12:07:00,Apple's newest app is a thrilling gift for spo...
4,22/02/2024,09:31:00,"Apple, Zui Xin nomacOS 14 SonomayaiOS 17, Visi..."
5,26/02/2024,09:31:00,Horizon Kinetics: Chinese Tech Has Boosted App...
6,26/02/2024,12:54:07,This investor has a huge bet on Berkshire Hath...
7,27/02/2024,09:31:00,Major Shareholders Planning to Force Apple to ...
8,28/02/2024,09:31:00,"Apple, Kai Fa Zhe Xiang keni[macOS 14.4 Sonoma..."
9,28/02/2024,11:31:00,Apple reportedly ends its longtime plans to bu...


In [None]:
# Convert 'Time' column to datetime format
df1['Time'] = pd.to_datetime(df['Time'])

# Format the 'Time' column to include leading zeros for single-digit hour values
# df1['Time'] = df1['Time'].dt.strftime('%H:%M:%S')
df1['Time'] = df1['Time'].apply(lambda x: x.strftime('%H:%M:%S'))
df1

In [9]:
df2

Unnamed: 0,Date,Time,Cleaned_Headline
0,01/04/2024,14:50:31,"Apple Sales, Earnings Seen Declining In 2024 A..."
1,01/04/2024,15:55:19,Loop Capital cuts AAPL price target to $170 ov...
2,02/04/2024,12:32:15,Should Investors Buy Apple (AAPL) in April?
3,03/04/2024,09:31:00,"Apple, Kai Fa Zhe Xiang keni[macOS 14.5 Sonoma..."
4,03/04/2024,09:31:00,Apple shareholders look for positive signs aft...
5,04/04/2024,09:31:00,Senior Vice President Deirdre O'Brien Sells 54...
6,04/04/2024,09:31:00,"Apple Inc (AAPL) CEO Timothy Cook Sells 196,41..."
7,04/04/2024,11:44:53,Not UK Or Japan: India To Emerge As Apple's Th...
8,05/04/2024,09:31:00,"Apple CEO Cook sells 196,410 AAPL shares at $1..."
9,05/04/2024,09:31:00,Apple to Report Earnings on May 2 Following Vi...


In [22]:
# Concatenate the dataframes
concatenated_df = pd.concat([df1, df2])

# Drop duplicates based on all columns
concatenated_df = concatenated_df.drop_duplicates(keep = 'first')

# If you want to reset the index of the concatenated dataframe
concatenated_df = concatenated_df.reset_index(drop=True)
concatenated_df.to_csv("feb_march_Headlines_English_Apple_Market_Time_Final_6.csv")

In [6]:
concatenated_df

Unnamed: 0,Date,Time,Cleaned_Headline
0,19/02/2024,9:31:00,There Are 2.2 Billion Reasons to Buy Apple Sto...
1,19/02/2024,9:31:00,EU regulators reportedly will fine Apple more ...
2,21/02/2024,11:36:00,"Apple, Kai Fa Zhe Xiang keni[macOS 14.4 Sonoma..."
3,21/02/2024,12:07:00,Apple's newest app is a thrilling gift for spo...
4,22/02/2024,9:31:00,"Apple, Zui Xin nomacOS 14 SonomayaiOS 17, Visi..."
...,...,...,...
56,11/04/2024,9:31:00,Apple M1/M3chitsupuDa Zai noiMacyaStudio Displ...
57,11/04/2024,10:57:23,"Judge in Apple antitrust case steps down, due ..."
58,12/04/2024,9:31:00,Apple's Services business is a bright spot in ...
59,12/04/2024,13:06:54,Apple's earnings power is once again being ove...


# Dataset for Sentiment Analysis

In [26]:
# Filter the DataFrame to include only rows with Cleaned Headline containing 'Apple' or 'AAPL'
sentiment_df = concatenated_df[['Cleaned_Headline']].copy()

# Print the filtered DataFrame
sentiment_df = sentiment_df.dropna(axis = 0)
sentiment_df.to_csv("feb_march_Headlines_English_Apple_Market_Time_Sentiment_6.csv")
sentiment_df

Unnamed: 0,Cleaned_Headline
0,There Are 2.2 Billion Reasons to Buy Apple Sto...
1,EU regulators reportedly will fine Apple more ...
2,"Apple, Kai Fa Zhe Xiang keni[macOS 14.4 Sonoma..."
3,Apple's newest app is a thrilling gift for spo...
4,"Apple, Zui Xin nomacOS 14 SonomayaiOS 17, Visi..."
...,...
59,Apple's earnings power is once again being ove...
60,Zong Shu kiYuan Gao Yong Zhi yashinariohuo-muB...
61,Is Apple (AAPL) Facing Increasing Risk of Chin...
62,"Analysts mostly non-plussed by DoJ suit, and b..."


# Downloading Hourly Apple Stock Prices

In [4]:
import yfinance as yf
import pandas as pd

# Define the ticker symbol
ticker_symbol = 'AAPL'

# Set the start and end date for the historical data
start_date = '2024-02-16'
end_date = '2024-04-15'

data = yf.download(ticker_symbol, start=start_date, end=end_date, interval='1h')
data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-02-16 09:30:00-05:00,183.740005,184.850006,181.860001,182.544998,182.544998,14074668
2024-02-16 10:30:00-05:00,182.550003,183.447296,182.419998,183.149994,183.149994,5935048
2024-02-16 11:30:00-05:00,183.154999,183.289993,182.720001,182.940094,182.940094,3563098
2024-02-16 12:30:00-05:00,182.949997,183.009995,182.479996,182.649994,182.649994,3133873
2024-02-16 13:30:00-05:00,182.656998,183.179993,182.485001,183.143005,183.143005,3501742
...,...,...,...,...,...,...
2024-04-12 11:30:00-04:00,176.259995,176.990005,175.565002,175.589996,175.589996,8240618
2024-04-12 12:30:00-04:00,175.595001,175.660004,174.800003,175.350006,175.350006,8466020
2024-04-12 13:30:00-04:00,175.339996,176.000000,175.235001,175.970001,175.970001,6848567
2024-04-12 14:30:00-04:00,175.970001,176.259995,175.630005,176.017502,176.017502,8125126


In [5]:
data['Datetime'] = pd.to_datetime(data.index)

# Creating a new DataFrame with desired format
data = pd.DataFrame({
    'Datetime': data['Datetime'],
    'Date': data['Datetime'].dt.date,
    'Time': data['Datetime'].dt.strftime('%H:%M:%S'),
    'Adj Close': data['Adj Close']
})

# Display the new DataFrame
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,Datetime,Date,Time,Adj Close
0,2024-02-16 09:30:00-05:00,2024-02-16,09:30:00,182.544998
1,2024-02-16 10:30:00-05:00,2024-02-16,10:30:00,183.149994
2,2024-02-16 11:30:00-05:00,2024-02-16,11:30:00,182.940094
3,2024-02-16 12:30:00-05:00,2024-02-16,12:30:00,182.649994
4,2024-02-16 13:30:00-05:00,2024-02-16,13:30:00,183.143005
...,...,...,...,...
268,2024-04-12 11:30:00-04:00,2024-04-12,11:30:00,175.589996
269,2024-04-12 12:30:00-04:00,2024-04-12,12:30:00,175.350006
270,2024-04-12 13:30:00-04:00,2024-04-12,13:30:00,175.970001
271,2024-04-12 14:30:00-04:00,2024-04-12,14:30:00,176.017502


In [None]:
data.to_csv("feb_march_Apple_stocks_6.csv")