In [119]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [120]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)
times = list(stock_data.index.unique())

In [None]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [125]:
df = df.to_pandas()

In [89]:
# Create topic-specific metrics columns
df['Article Count'] = 1

topics = ['airplane','airline','airport','Alaska Airlines','American Airlines','Delta Air Lines','Frontier Airlines','Hawaiian Airlines','JetBlue','Southwest Airlines','Spirit Airlines','Sun Country Airlines','United Airlines','Allegiant Air']
metrics = ['Tone','Positive Score','Negative Score','Polarity','Activity Reference Density','Self/Group Reference Density','Word Count','Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [90]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

grouped_df['datetime'] = pd.to_datetime(grouped_df.index, errors='coerce')

In [91]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.set_index('datetime').reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']

grouped_df

Unnamed: 0,index,Tone_airplane,Positive Score_airplane,Negative Score_airplane,Polarity_airplane,Activity Reference Density_airplane,Self/Group Reference Density_airplane,Word Count_airplane,Article Count_airplane,Tone_airline,...,Article Count_United Airlines,Tone_Allegiant Air,Positive Score_Allegiant Air,Negative Score_Allegiant Air,Polarity_Allegiant Air,Activity Reference Density_Allegiant Air,Self/Group Reference Density_Allegiant Air,Word Count_Allegiant Air,Article Count_Allegiant Air,datetime
0,2018-01-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.228916,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:15:00
1,2018-01-01 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:30:00
2,2018-01-01 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:45:00
3,2018-01-01 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.968254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:00:00
4,2018-01-01 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.259993,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259962,2025-05-31 22:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 22:45:00
259963,2025-05-31 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:00:00
259964,2025-05-31 23:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:15:00
259965,2025-05-31 23:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:30:00


In [92]:
##### Handling after-hours articles #####
# Convert 'datetime' to datetime type

# Move the time back by 4 hours to account for UTC
grouped_df['datetime_EST'] = grouped_df['datetime'] - pd.Timedelta(hours=4)

# Join with stock data to filter out after-hours articles
grouped_df['stock_time'] = np.where(grouped_df['datetime_EST'].isin(times), grouped_df['datetime_EST'], pd.NaT)

# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])

grouped_df = grouped_df.sort_values(by='datetime')

# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].fillna(method='bfill')

In [93]:
# Group again to get rid of duplicate stock times
grouped_df = grouped_df.groupby('stock_time').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]
grouped_df

Unnamed: 0_level_0,Tone_airplane,Positive Score_airplane,Negative Score_airplane,Polarity_airplane,Activity Reference Density_airplane,Self/Group Reference Density_airplane,Word Count_airplane,Article Count_airplane,Tone_airline,Positive Score_airline,...,Word Count_United Airlines,Article Count_United Airlines,Tone_Allegiant Air,Positive Score_Allegiant Air,Negative Score_Allegiant Air,Polarity_Allegiant Air,Activity Reference Density_Allegiant Air,Self/Group Reference Density_Allegiant Air,Word Count_Allegiant Air,Article Count_Allegiant Air
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-292.648102,398.614044,...,96267.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.641009,3.313283,...,12221.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.529022,10.078219,...,1437.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.850257,10.692406,...,1108.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.998197,8.948083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# For each column, create a 26-period (# of 15-min periods in 1 trading day) rolling sum and rolling mean
for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}_rolling_sum26']  = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).sum()
        grouped_df[f'{metric}_{topic}_rolling_mean26'] = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).mean()

In [101]:
grouped_df

Unnamed: 0_level_0,Tone_airplane,Positive Score_airplane,Negative Score_airplane,Polarity_airplane,Activity Reference Density_airplane,Self/Group Reference Density_airplane,Word Count_airplane,Article Count_airplane,Tone_airline,Positive Score_airline,...,Polarity_Allegiant Air_rolling_sum26,Polarity_Allegiant Air_rolling_mean26,Activity Reference Density_Allegiant Air_rolling_sum26,Activity Reference Density_Allegiant Air_rolling_mean26,Self/Group Reference Density_Allegiant Air_rolling_sum26,Self/Group Reference Density_Allegiant Air_rolling_mean26,Word Count_Allegiant Air_rolling_sum26,Word Count_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-292.648102,398.614044,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.641009,3.313283,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,12.588905,0.484189,59.505228,2.288663,0.569842,0.021917,896.0,34.461538,3.0,0.115385
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.529022,10.078219,...,12.588905,0.484189,59.505228,2.288663,0.569842,0.021917,896.0,34.461538,3.0,0.115385
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.850257,10.692406,...,12.588905,0.484189,59.505228,2.288663,0.569842,0.021917,896.0,34.461538,3.0,0.115385
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.998197,8.948083,...,12.588905,0.484189,59.505228,2.288663,0.569842,0.021917,896.0,34.461538,3.0,0.115385


In [108]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506101117.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)