In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load intraday close prices; only the three columns used downstream are read.
# The steps are chained (no inplace=True) so re-running the cell always starts
# from the file and never from a half-mutated frame.
stock_df = (
    pd.read_csv(
        "../../stock_price/stock_price_v2/stock_price_v2.csv",
        parse_dates=["time"],
        usecols=["symbol", "time", "close"],
    )
    .rename(columns={"time": "datetime"})
    .drop_duplicates()
)

# check no duplicate datetime per symbol
# (vectorised: a single pass over the frame instead of a Python lambda per symbol)
assert not stock_df.duplicated(subset=["symbol", "datetime"]).any(), (
    "found more than one close price for the same (symbol, datetime)"
)

# filter datetime
stock_df = stock_df.loc[stock_df["datetime"] >= pd.Timestamp("2022-02-25 08:00:00")]

# create date and time columns
# stock_df["date"] = stock_df["datetime"].dt.normalize()
# stock_df["time"] = stock_df["datetime"].dt.time

In [3]:
# Summarise the dtypes and memory footprint of the filtered price frame.
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12187398 entries, 0 to 21989495
Data columns (total 3 columns):
 #   Column    Dtype         
---  ------    -----         
 0   datetime  datetime64[ns]
 1   close     float64       
 2   symbol    object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 371.9+ MB


In [4]:
# Key the prices by (symbol, datetime) and sort, so each symbol's rows are
# contiguous and in chronological order for the per-symbol reindexing below.
stock_df = (
    stock_df
    .set_index(["symbol", "datetime"])
    .sort_index()
    .loc[:, ["close"]]
)

stock_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close
symbol,datetime,Unnamed: 2_level_1
A,2022-02-25 09:35:00,128.007913
A,2022-02-25 09:40:00,128.574232
A,2022-02-25 09:45:00,128.206622
A,2022-02-25 09:50:00,128.653716
A,2022-02-25 09:55:00,128.455007


## News

In [5]:
# Load the news articles; `time_published` is parsed as datetime because it is
# floored/ceiled onto the price time grid further down.
news_df = pd.read_csv("../combine_pred/combine_pred.csv", parse_dates=["time_published"])

In [6]:
# Inspect column names, dtypes and non-null counts of the news frame.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88167 entries, 0 to 88166
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   index                    88167 non-null  int64         
 1   title                    88167 non-null  object        
 2   url                      88167 non-null  object        
 3   time_published           88167 non-null  datetime64[ns]
 4   authors                  88167 non-null  object        
 5   summary                  88166 non-null  object        
 6   banner_image             87349 non-null  object        
 7   source                   88167 non-null  object        
 8   category_within_source   20361 non-null  object        
 9   source_domain            88167 non-null  object        
 10  topics                   88167 non-null  object        
 11  overall_sentiment_score  88167 non-null  float64       
 12  overall_sentiment_label  88167 n

In [7]:
# Eyeball a few random articles. The seed is fixed so that a
# "Restart Kernel -> Run All" shows the same rows every time.
news_df.sample(10, random_state=0)

Unnamed: 0,index,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,...,lexicon_neg,lexicon_neu,lexicon_pos,lexicon_compound,finbertv1_negative,finbertv1_neutral,finbertv1_positive,finbertv2_Negative,finbertv2_Neutral,finbertv2_Positive
23668,55990,"Bitcoin, Ethereum, Dogecoin Spike Amid Risk-On...",https://www.benzinga.com/markets/cryptocurrenc...,2023-01-30 02:26:43,['Mehab Qureshi'],Major coins traded in the green on Sunday even...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,...,0.0,1.0,0.0,0.0,0.019106,0.038927,0.941967,0.0007061056,0.8886214,0.1106724
38066,95936,Want Better Returns? Don?t Ignore These 2 Cons...,https://www.zacks.com/stock/news/2032237/want-...,2022-12-26 13:50:08,['Zacks Investment Research'],Why investors should use the Zacks Earnings ES...,https://staticx-tuner.zacks.com/images/default...,Zacks Commentary,,www.zacks.com,...,0.143,0.714,0.143,0.0,0.032257,0.813108,0.154635,4.022257e-07,4.205933e-07,0.9999992
17015,38703,IDEXX Laboratories Stock Clears Technical Benc...,https://www.investors.com/news/idexx-laborator...,2023-01-09 19:37:00,"[""INVESTOR'S BUSINESS DAILY"", 'JULIE MAK', ""In...",IDEXX Laboratories Stock Clears Technical Benc...,https://www.investors.com/wp-content/uploads/2...,Investors Business Daily,,www.investors.com,...,0.0,1.0,0.0,0.0,0.029725,0.047696,0.922579,8.556109e-07,0.9999988,3.761108e-07
2604,6505,Check Out What Whales Are Doing With TGT - Tar...,https://www.benzinga.com/markets/options/22/10...,2022-10-03 19:21:44,['Benzinga Insights'],Someone with a lot of money to spend has taken...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,...,0.0,1.0,0.0,0.0,0.049384,0.885959,0.064657,0.01136416,0.01925217,0.9693837
25298,59787,Better AI Stock: Amazon vs. Nvidia,https://www.fool.com/investing/2023/03/12/bett...,2023-03-12 12:38:00,['Dani Cook'],These companies will likely play significant r...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,...,0.0,1.0,0.0,0.0,0.007341,0.662978,0.32968,1.608308e-05,0.9999778,6.104092e-06
29962,75848,"Hershey ( NYSE:HSY ) Raised to ""Buy"" at Stoc...",https://reporter.am/2022/10/13/hershey-nysehsy...,2022-10-13 08:20:44,['AM Reporter Staff'],StockNews.com upgraded shares of Hershey ( NYS...,https://www.marketbeat.com/scripts/RatingsAndP...,Stocknews.com,,reporter.am,...,0.0,1.0,0.0,0.0,0.018061,0.090849,0.89109,5.505725e-06,0.01027896,0.9897155
78624,207648,Here's Why Investors Should Retain Generac ( ...,https://www.zacks.com/stock/news/1969756/heres...,2022-08-18 15:11:00,['Zacks Investment Research'],Generac's (GNRC) performance is gaining from i...,https://staticx-tuner.zacks.com/images/article...,Zacks Commentary,,www.zacks.com,...,0.0,0.75,0.25,0.4588,0.013141,0.031615,0.955244,2.199114e-09,4.256767e-10,1.0
52723,137353,Here's Why Momentum in Lamb Weston ( LW ) Sh...,https://www.zacks.com/stock/news/1967619/heres...,2022-08-12 12:50:06,['Zacks Investment Research'],Lamb Weston (LW) could be a great choice for i...,https://staticx-tuner.zacks.com/images/default...,Zacks Commentary,,www.zacks.com,...,0.0,0.902,0.098,0.4588,0.008214,0.272354,0.719432,1.570418e-07,2.706698e-07,0.9999996
41688,108058,Johnson & Johnson ( JNJ ) Outpaces Stock Mar...,https://www.zacks.com/stock/news/2071013/johns...,2023-03-27 21:45:19,['Zacks Investment Research'],"In the latest trading session, Johnson & Johns...",https://staticx-tuner.zacks.com/images/default...,Zacks Commentary,,www.zacks.com,...,0.0,1.0,0.0,0.0,0.085899,0.020148,0.893953,8.990368e-05,0.9999006,9.4927e-06
83948,230621,"General Electric ( GE ) Arm, NTPC Ink MOU on...",https://www.zacks.com/stock/news/1989928/gener...,2022-10-10 18:27:00,['Zacks Investment Research'],General Electric (GE) arm signs a MOU with Ind...,https://staticx-tuner.zacks.com/images/article...,Zacks Commentary,,www.zacks.com,...,0.0,1.0,0.0,0.0,0.013681,0.10114,0.885179,4.198938e-05,0.9999243,3.373481e-05


In [8]:
# Time span covered by the news; printed earliest first, then latest.
min_time_published = news_df["time_published"].min()
max_time_published = news_df["time_published"].max()

print(min_time_published, max_time_published, sep="\n")

2022-03-01 08:00:00
2023-03-31 16:32:05


## Holding period

- 30_minutes
- 1_hour
- 1_day
- 5_day

Edge cases
- The holding period starts outside trading hours
    - Start price: last available price, i.e. the close of the previous trading session
    - End price: first available price at or after the end of the holding period
- The holding period starts within trading hours but ends after trading hours
    - Start price: price right before the news
    - End price: first available price at or after the end of the holding period

In [9]:
# Earliest and latest price timestamps across all symbols; together they bound
# the regular time grid every symbol is reindexed onto.
min_datetime = stock_df.index.get_level_values("datetime").min()
max_datetime = stock_df.index.get_level_values("datetime").max()

def fill_missing_minutes(df, method, start=None, end=None, freq="5min"):
    """
    Reindex one symbol's prices onto a regular time grid and fill the gaps.

    Parameters
    ----------
    df : pd.DataFrame
        Prices of a single symbol, indexed by a ("symbol", "datetime")
        MultiIndex and sorted by datetime (the shape each group has under
        ``stock_df.groupby("symbol").apply``). It is not modified.
    method : str
        Fill method forwarded to ``DataFrame.reindex``: "ffill" carries the
        last known price forward, "bfill" pulls the next known price backward.
    start, end : datetime-like, optional
        Bounds of the grid. Default to the module-level ``min_datetime`` and
        ``max_datetime``.
    freq : str, default "5min"
        Spacing of the grid.

    Returns
    -------
    pd.DataFrame
        Same columns as ``df``, indexed by a "datetime" index covering every
        grid point between ``start`` and ``end``. Grid points with no price to
        fill from (before the first price for "ffill", after the last one for
        "bfill") are NaN.
    """
    start = min_datetime if start is None else start
    end = max_datetime if end is None else end

    # droplevel returns a new frame; mutating the group handed over by
    # groupby.apply in place is not supported by pandas.
    single_symbol_df = df.droplevel("symbol")

    grid = pd.date_range(start=start, end=end, freq=freq)
    # reindex onto the grid and fill in the requested direction
    filled_df = single_symbol_df.reindex(grid, method=method)
    filled_df.index.name = "datetime"
    return filled_df

# Per symbol, put the prices on the regular time grid twice:
#   ffill -> last known price at each grid point (used as the start price)
#   bfill -> next known price at each grid point (used as the end price)
ffill_stock_df = (
    stock_df
    .groupby("symbol")
    .apply(fill_missing_minutes, method="ffill")
    .rename(columns={"close": "ffill"})
)
bfill_stock_df = (
    stock_df
    .groupby("symbol")
    .apply(fill_missing_minutes, method="bfill")
    .rename(columns={"close": "bfill"})
)

In [10]:
# a_stock_df = pd.concat([ffill_stock_df.loc['A'], bfill_stock_df.loc['A'], stock_df.loc['A']], axis=1)

# a_stock_df.loc[a_stock_df['close'].isna()]

# a_stock_df.loc['2022-02-25'].tail(50)

In [11]:
holding_period_list = ["30_minutes", "1_hour", "1_day", "5_day"]

# The start timestamp (publication time snapped down to the grid) does not
# depend on the holding period, so it is computed once instead of per iteration.
# "5min" replaces the deprecated "5T" alias; both mean a 5-minute grid.
time_published_before = news_df["time_published"].dt.floor("5min")

for holding_period in holding_period_list:
    # "30_minutes" -> value="30", unit="minutes"; both go straight to pd.Timedelta.
    value, unit = holding_period.split("_")

    # Scratch column, overwritten on every iteration; after the loop it holds
    # the value for the last holding period only.
    news_df["time_published_hold"] = news_df["time_published"] + pd.Timedelta(value=float(value), unit=unit)

    # Assigned inside the loop so the column order of news_df stays as before.
    news_df["time_published_before"] = time_published_before

    # End of the holding period snapped up to the grid.
    news_df[f"time_published_after_{holding_period}"] = news_df["time_published_hold"].dt.ceil("5min")

In [12]:
# Show the raw publication time next to its grid-aligned start and end timestamps.
cols = ["time_published_after_" + holding_period for holding_period in holding_period_list]
news_df[["time_published", "time_published_before", *cols]]

Unnamed: 0,time_published,time_published_before,time_published_after_30_minutes,time_published_after_1_hour,time_published_after_1_day,time_published_after_5_day
0,2022-03-08 23:00:11,2022-03-08 23:00:00,2022-03-08 23:35:00,2022-03-09 00:05:00,2022-03-09 23:05:00,2022-03-13 23:05:00
1,2022-03-15 22:00:18,2022-03-15 22:00:00,2022-03-15 22:35:00,2022-03-15 23:05:00,2022-03-16 22:05:00,2022-03-20 22:05:00
2,2022-03-15 08:00:00,2022-03-15 08:00:00,2022-03-15 08:30:00,2022-03-15 09:00:00,2022-03-16 08:00:00,2022-03-20 08:00:00
3,2022-03-04 15:24:00,2022-03-04 15:20:00,2022-03-04 15:55:00,2022-03-04 16:25:00,2022-03-05 15:25:00,2022-03-09 15:25:00
4,2022-03-21 22:15:23,2022-03-21 22:15:00,2022-03-21 22:50:00,2022-03-21 23:20:00,2022-03-22 22:20:00,2022-03-26 22:20:00
...,...,...,...,...,...,...
88162,2023-02-10 23:45:00,2023-02-10 23:45:00,2023-02-11 00:15:00,2023-02-11 00:45:00,2023-02-11 23:45:00,2023-02-15 23:45:00
88163,2023-03-01 21:28:00,2023-03-01 21:25:00,2023-03-01 22:00:00,2023-03-01 22:30:00,2023-03-02 21:30:00,2023-03-06 21:30:00
88164,2023-03-13 19:06:41,2023-03-13 19:05:00,2023-03-13 19:40:00,2023-03-13 20:10:00,2023-03-14 19:10:00,2023-03-18 19:10:00
88165,2023-03-21 18:00:54,2023-03-21 18:00:00,2023-03-21 18:35:00,2023-03-21 19:05:00,2023-03-22 18:05:00,2023-03-26 18:05:00


In [13]:
# Start price: forward-filled price at the floored publication time, i.e. the
# last price known when the news came out.
news_stock_df = (
    news_df
    .merge(
        ffill_stock_df,
        how="left",
        left_on=["symbol", "time_published_before"],
        right_index=True,
    )
    .rename(columns={"ffill": "price_before"})
)

# End price, one column per holding period: backward-filled price at the
# ceiled end of the holding period.
for holding_period in holding_period_list:
    news_stock_df = (
        news_stock_df
        .merge(
            bfill_stock_df,
            how="left",
            left_on=["symbol", f"time_published_after_{holding_period}"],
            right_index=True,
        )
        .rename(columns={"bfill": f"price_after_{holding_period}"})
    )

# Simple return over each holding period: end price / start price - 1.
for holding_period in holding_period_list:
    news_stock_df[f"pct_return_{holding_period}"] = (
        news_stock_df[f"price_after_{holding_period}"] / news_stock_df["price_before"] - 1
    )

In [14]:
# Keep only the articles that have a return for every holding period.
cols = ["pct_return_" + holding_period for holding_period in holding_period_list]
news_stock_full_df = news_stock_df.dropna(subset=cols, how="any")

In [15]:
# Inspect columns and non-null counts of the rows kept after dropping
# articles with a missing return.
news_stock_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86537 entries, 0 to 88165
Data columns (total 45 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   index                            86537 non-null  int64         
 1   title                            86537 non-null  object        
 2   url                              86537 non-null  object        
 3   time_published                   86537 non-null  datetime64[ns]
 4   authors                          86537 non-null  object        
 5   summary                          86536 non-null  object        
 6   banner_image                     85728 non-null  object        
 7   source                           86537 non-null  object        
 8   category_within_source           19948 non-null  object        
 9   source_domain                    86537 non-null  object        
 10  topics                           86537 non-null  object   

In [16]:
# Persist the articles that have a return for every holding period; the
# DataFrame index is not written to the file.
news_stock_full_df.to_csv("holding_period.csv", index=False)