In [1]:
import pandas as pd
from datetime import datetime
from datetime import timedelta 

In [2]:

# Load the news dataset, specifying that any extra commas are part of quoted titles
# This will generate warnings for problematic lines
news_df = pd.read_csv('pulse.csv', names=['headline', 'url', 'publish_date'], header=None, quotechar='"', on_bad_lines='skip')

# After loading the data, print the number of rows in the DataFrame
print(f"Total rows loaded: {len(news_df)}")

# Optionally, print the first few rows to verify
print(news_df.head())


Total rows loaded: 421971
                                            headline  \
0  Production outages in Asia lend slight support...   
1  UK consumers suffer longest decline in spendin...   
2  Market Now: Gammon Infra, GVK Power Infra surg...   
3  Global markets: Brightening economy sets euro ...   
4  Relying on schemes like Swachh Bharat alone wo...   

                                                 url         publish_date  
0  http://feeds.reuters.com/~r/reuters/INbusiness...  2017-06-30 15:03:52  
1  http://feeds.reuters.com/~r/reuters/INbusiness...  2017-06-30 15:03:11  
2  http://economictimes.indiatimes.com/markets/st...  2017-06-30 15:01:36  
3  http://feeds.reuters.com/~r/reuters/INbusiness...  2017-06-30 15:00:36  
4  http://www.business-standard.com/article/econo...  2017-06-30 14:59:00  


In [3]:
news_df['publish_date'] = pd.to_datetime(news_df['publish_date'], errors='coerce').dt.date

news_df

Unnamed: 0,headline,url,publish_date
0,Production outages in Asia lend slight support...,http://feeds.reuters.com/~r/reuters/INbusiness...,2017-06-30
1,UK consumers suffer longest decline in spendin...,http://feeds.reuters.com/~r/reuters/INbusiness...,2017-06-30
2,"Market Now: Gammon Infra, GVK Power Infra surg...",http://economictimes.indiatimes.com/markets/st...,2017-06-30
3,Global markets: Brightening economy sets euro ...,http://feeds.reuters.com/~r/reuters/INbusiness...,2017-06-30
4,Relying on schemes like Swachh Bharat alone wo...,http://www.business-standard.com/article/econo...,2017-06-30
...,...,...,...
421966,Chennai Angels invests in SparesHub owner Iradium,http://www.thehindu.com/business/Industry/chen...,1970-01-01
421967,Task force formed on aviation,http://www.thehindu.com/business/Economy/task-...,1970-01-01
421968,Biological E in vaccine tie-up with Takeda,http://www.thehindu.com/business/Economy/biolo...,1970-01-01
421969,HPCL joins talks for Russian oilfields,http://www.thehindu.com/business/Economy/hpcl-...,1970-01-01


In [6]:
def load_financial_data(file_path):
    # Load the financial data and skip the first row that contains column names
    df_extrema = pd.read_csv(file_path, header=0, names=["date","type","ticker","price"])

    # Convert the date column to datetime format (ensure it's timezone-aware, if needed)
    df_extrema['date'] = pd.to_datetime(df_extrema['date'], errors='coerce')

    
# Convert the 'date' column in df_extrema to datetime, and then extract only the date part
    df_extrema['date'] = pd.to_datetime(df_extrema['date'], errors='coerce').dt.date
    df_extrema
    
    return df_extrema

In [7]:
def get_news_for_extrema(df_extrema, news_df):
    results = []
    
    # Iterate through each extrema entry
    for _, row in df_extrema.iterrows():
        extrema_date = row['date']
        # Filter news articles published within ±2 days of the extrema date
        start_date = extrema_date - timedelta(days=2)
        end_date = extrema_date + timedelta(days=2)
        
        # Filter news articles within the date range
        relevant_news = news_df[(news_df['publish_date'] >= start_date) & (news_df['publish_date'] <= end_date)]
        
        # If there are relevant news articles, append them to the results list
        if not relevant_news.empty:
            results.append({
                'extrema_date': extrema_date,
                'extrema_type': row['type'],
                'ticker': row['ticker'],
                'extrema_price': row['price'],
                'relevant_news': relevant_news[['headline', 'publish_date', 'url']]
            })
    
    return results

In [8]:
# Function to get news articles within ±2 days of each extrema (ignoring the time)
def get_news_for_extrema(df_extrema, news_df):
    results = []
    
    # Iterate through each extrema entry
    for _, row in df_extrema.iterrows():
        extrema_date = row['date']
        
        # Filter news articles published within ±2 days of the extrema date
        start_date = extrema_date - timedelta(days=2)
        end_date = extrema_date + timedelta(days=2)
        
        # Filter news articles within the date range
        relevant_news = news_df[(news_df['publish_date'] >= start_date) & (news_df['publish_date'] <= end_date)]
        
        # If there are relevant news articles, append them to the results list
        if not relevant_news.empty:
            results.append({
                'extrema_date': extrema_date,
                'extrema_type': row['type'],
                'ticker': row['ticker'],
                'extrema_price': row['price'],
                'relevant_news': relevant_news[['headline', 'publish_date', 'url']]
            })
    
    return results

In [11]:
load_financial_data('extrema_dates.csv')

  df_extrema['date'] = pd.to_datetime(df_extrema['date'], errors='coerce')


Unnamed: 0,date,type,ticker,price
0,2014-02-03,local_minima,^IXIC,3996.959961
1,2014-03-03,local_minima,^IXIC,4277.299805
2,NaT,local_minima,^IXIC,4245.399902
3,NaT,local_minima,^IXIC,4151.229980
4,NaT,local_minima,^IXIC,3999.729980
...,...,...,...,...
313,NaT,local_maxima,^IXIC,18712.750000
314,2024-11-11,local_maxima,^IXIC,19298.759766
315,2024-12-16,local_maxima,^IXIC,20173.890625
316,2024-12-16,global_max,^IXIC,20173.890625


In [None]:

news_for_extrema = get_news_for_extrema(df_extrema, news_df)

# Example: Display news for the first extrema in the result
if news_for_extrema:
    print(news_for_extrema[0])

  df_extrema['date'] = pd.to_datetime(df_extrema['date'], errors='coerce')


NameError: name 'df_extrema' is not defined

In [93]:

# Match the news with the financial data
matched_news_df = clean_and_match_news(news_df, df_extrema)


TypeError: unsupported operand type(s) for -: 'DatetimeArray' and 'datetime.date'

In [53]:

# Display the matched results
print(matched_news_df.head())

NameError: name 'matched_news_df' is not defined