In [38]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
warnings.filterwarnings('ignore')

In [39]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)
times = list(stock_data.index.unique())

In [40]:
df = pl.read_csv('../../data/processed/gdelt_combined_20250610_1716.csv')

In [41]:
# For each column, drop if all values are equal to 0
for col in df.columns:
    if df[col].min() == df[col].max():
        df = df.drop(col)
df.shrink_to_fit(in_place=True)

# Convert to smaller data types
df = df.select(pl.all().shrink_dtype())

In [42]:
# Drop duplicate rows
df = df.unique(maintain_order=True)

In [43]:
# Remove a list of titles that are probably home pages or other non-article content
titles_to_remove = [
    'News briefs',
    'Latest Articles',
    'The Nashville Ledger',
    'Today in History',
    'National News - Media One Radio Group (WWSE | WJTN | WHUG | WKSN | WQFX',
    "Aero-News Network: The aviation and aerospace world's daily/real-time news and information service",
    'Drake & 21 Savage Add More Texas Concert Dates Due To High Demand',
    'Stock Market | FinancialContent Business Page',
    'Radio Station WHMI 93.5 FM &#x2014; Livingston County Michigan News, Weather, Traffic, Sports, School Updates, and the Best Classic Hit',
    'National - KSYL-AM',
    'Business Highlights',
    'Business Highlights',
    'National News - 1540 WADK Newport',
    'National - Carroll Broadcasting Inc.',
    'ABC National - WOND',
    "Breaking National News - 92.7-FM TheDRIVE - Bob & Tom Mornings, Central New York's Best Rock All Day",
    "ABC - National News - Xtra 99.1 FM - Today's Hits and Yesterday's Favorites",
    "Ed Bruce, Legendary Country Songwriter, 'Maverick' Actor, Dead At 81",
    "ABC National News - Beach 95.1 - WBPC Panama City Beach Greatest Hits of the 60s, 70s & 80s",
    "CES gadget show: How watching TV will change in the 2020s",
    "Despite business warnings, GOP moves ahead with voting bills",
    "KTBB.com - News Weather Talk",
    "AP Story",
    "SRN - US News - Taylorville Daily News",
]

# Replace empty article titles with z
df = df.with_columns(
    pl.col('article_title').fill_null('z')
)

df = df.filter(
    ~df['article_title'].is_in(titles_to_remove)
)

# Drop where article_title contains 'AP News in Brief'
df = df.filter(
    ~df['article_title'].str.contains('AP News in Brief at', literal=True)
    )

In [44]:
site_counts = df['V2SOURCECOMMONNAME'].value_counts().to_pandas()
site_counts

Unnamed: 0,V2SOURCECOMMONNAME,count
0,thecragandcanyon.ca,11
1,talkpoverty.org,1
2,indy100.com,3
3,conservativehq.com,16
4,northernpublicradio.org,389
...,...,...
14861,moabtimes.com,4
14862,21ninety.com,1
14863,localsyr.com,109
14864,en.qantara.de,3


In [45]:
# Filter to reliable news sites. I don't want to limit to a certain set of sites because I want to keep local news in

# The idea is to remove sites that are unlikely to report about price-influencing stories. For example, some sites report about cheap flights or travel points
sites_to_remove = [
'iheart.com',
'thepointsguy.com',
'eturbonews.com',
'indiatimes.com',
'forimmediaterelease.net',
'travelmarketreport.com'
]

df = df.filter(
    ~df['V2SOURCECOMMONNAME'].is_in(sites_to_remove)
)

In [46]:
df

GKGRECORDID,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,American Airlines,Delta Air Lines,Frontier Airlines,Hawaiian Airlines,JetBlue,Southwest Airlines,Spirit Airlines,Sun Country Airlines,United Airlines,Allegiant Air,article_title,Tone,Positive Score,Negative Score,Polarity,Activity Reference Density,Self/Group Reference Density,Word Count,c1.1; WORDCOUNT; AESTHETIC,c1.2; WORDCOUNT; ECONOMIC/UTILITARIAN,c1.3; WORDCOUNT; LIFE SUPPORT,c1.4; WORDCOUNT; MORAL/SPIRITUAL,c2.1; WORDCOUNT; ABS,c2.2; WORDCOUNT; AFFGAIN,c2.3; WORDCOUNT; AFFLOSS,c2.4; WORDCOUNT; AFFOTH,c2.5; WORDCOUNT; AFFPT,…,c39.25; WORDCOUNT; Young Person,c39.26; WORDCOUNT; Old Person,c39.27; WORDCOUNT; Adultness,c39.28; WORDCOUNT; Earliness,c39.29; WORDCOUNT; Lateness,c39.30; WORDCOUNT; Occasion,c39.31; WORDCOUNT; Intempestivity,c39.32; WORDCOUNT; Frequency,c39.33; WORDCOUNT; Infrequency,c39.34; WORDCOUNT; Regularity,c39.35; WORDCOUNT; Fitfulness,c39.36; WORDCOUNT; Absolute Time,c39.37; WORDCOUNT; Relative Time,c39.38; WORDCOUNT; Recurrent Time,c39.39; WORDCOUNT; Past,c39.40; WORDCOUNT; Present,c39.41; WORDCOUNT; Future,c40.1; WORDCOUNT; Earth,c40.2; WORDCOUNT; Fire,c40.3; WORDCOUNT; Water,c40.4; WORDCOUNT; Air,c40.5; WORDCOUNT; Melancholy,c40.6; WORDCOUNT; Sanguine,c40.7; WORDCOUNT; Coleric,c40.8; WORDCOUNT; Flegmatic,c41.1; WORDCOUNT; POSITIVE,c42.1; WORDCOUNT; moralwordcount,v42.2; SCOREDVALUE; care_p,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent
str,str,str,str,str,str,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,str,f32,f32,f32,f32,f32,f32,i32,i16,i16,i16,i16,i16,i16,i16,i16,i16,…,i16,i16,i8,i16,i16,i16,i8,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""20230221030000-453""","""aero-news.net""","""http://aero-news.net/index.cfm…","""TAX_WORLDREPTILES;TAX_WORLDREP…","""2023-02-21 03:00:00""","""2023-02-21""",0,1,0,0,1,0,0,0,0,0,0,0,0,0,"""Airborne 02.20.23: Hobby Ballo…",-1.228501,2.457002,3.685504,6.142506,17.936117,0.4914,333,1,0,0,0,9,0,0,0,0,…,0,1,0,2,2,0,0,1,0,2,0,9,18,3,5,1,6,1,0,0,7,0,0,0,3,6,58,0.101615,0.094112,0.084485,0.096285,0.065091,-0.117746,-0.047186,-0.009921,-0.062596,-0.075467
"""20250117234500-581""","""houstonchronicle.com""","""http://www.houstonchronicle.co…","""TAX_FNCACT;TAX_FNCACT_OFFICIAL…","""2025-01-17 23:45:00""","""2025-01-17""",0,1,0,0,1,0,0,0,0,1,0,0,1,0,"""Trump's inauguration will be h…",0.41841,1.25523,0.83682,2.09205,24.790794,0.209205,883,0,0,2,0,32,1,0,0,1,…,0,0,0,3,0,6,0,0,0,1,0,3,39,1,8,1,21,0,0,0,1,1,0,0,1,21,272,0.078913,0.075865,0.081348,0.085938,0.067908,-0.102076,-0.046182,-0.011943,-0.043249,-0.057695
"""20240527093000-676""","""itechpost.com""","""http://www.itechpost.com/artic…","""TAX_FNCACT;TAX_FNCACT_EMPLOYEE…","""2024-05-27 09:30:00""","""2024-05-27""",0,1,1,0,0,0,0,0,0,0,0,0,1,0,"""Boeing Safety, Product Quality…",-3.271028,3.971963,7.24299,11.214953,17.990654,0.0,395,0,0,0,0,14,0,0,0,5,…,0,0,0,1,1,2,0,0,0,0,0,8,19,1,8,1,6,0,0,0,4,0,0,0,1,46,149,0.109993,0.101007,0.088567,0.09069,0.078992,-0.115803,-0.063174,-0.039071,-0.037974,-0.090853
"""20250410171500-386""","""1025kiss.com""","""https://1025kiss.com/ixp/175/p…",,"""2025-04-10 17:15:00""","""2025-04-10""",0,1,0,0,1,0,0,0,0,0,0,0,0,0,"""Breaking Down Luka's First Gam…",0.123305,3.205919,3.082614,6.288533,27.743526,0.986436,776,0,3,0,0,35,4,1,3,3,…,0,1,0,5,0,0,0,0,0,2,0,14,34,2,17,3,11,2,4,0,0,3,1,1,0,19,192,0.098206,0.092551,0.084106,0.084379,0.078339,-0.094771,-0.025998,0.001789,-0.025604,-0.013816
"""20241120131500-415""","""710keel.com""","""https://710keel.com/ixp/182/p/…","""ECON_DEBT;WB_1104_MACROECONOMI…","""2024-11-20 13:15:00""","""2024-11-20""",0,1,0,0,0,0,0,0,0,0,1,0,1,0,"""Will Spirit Airlines Bankruptc…",-1.453488,3.197675,4.651163,7.848837,28.197674,1.453488,307,0,0,0,0,9,0,0,5,2,…,0,0,0,1,0,2,0,0,0,0,0,4,11,0,4,1,5,0,0,0,2,1,0,0,0,20,88,0.109762,0.106656,0.100371,0.092308,0.0889,-0.050954,-0.023068,0.021486,0.009874,-0.006657
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""20221216140000-613""","""wsiu.org""","""https://www.wsiu.org/2022-12-1…","""TAX_WORLDMAMMALS;TAX_WORLDMAMM…","""2022-12-16 14:00:00""","""2022-12-16""",0,1,0,0,0,0,0,0,0,0,0,0,1,0,"""They flew to Nashville. Bluebe…",-1.75644,1.522248,3.278688,4.800937,22.131147,0.936768,783,0,1,0,0,15,0,2,16,4,…,0,1,0,2,1,0,0,2,0,0,0,5,20,2,6,0,12,0,0,1,7,1,0,3,1,27,198,0.096294,0.09364,0.083392,0.071799,0.066122,-0.100614,-0.043287,-0.001136,-0.007722,-0.044429
"""20190719014500-2143""","""wtol.com""","""https://www.wtol.com/article/n…","""MANMADE_DISASTER_IMPLIED;DELAY…","""2019-07-19 01:45:00""","""2019-07-19""",0,1,0,0,1,0,0,0,0,1,0,0,0,0,"""z""",-1.813472,0.518135,2.331606,2.849741,22.279793,0.0,364,0,0,0,0,25,1,0,1,0,…,0,0,0,2,3,0,0,0,0,1,0,6,26,2,8,1,14,0,0,0,5,0,0,0,0,12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""20190719014500-887""","""wwltv.com""","""https://www.wwltv.com/article/…","""MANMADE_DISASTER_IMPLIED;DELAY…","""2019-07-19 01:45:00""","""2019-07-19""",0,1,0,0,1,0,0,0,0,1,0,0,0,0,"""z""",-1.813472,0.518135,2.331606,2.849741,22.279793,0.0,364,0,0,0,0,25,1,0,1,0,…,0,0,0,2,3,0,0,0,0,1,0,6,26,2,8,1,14,0,0,0,5,0,0,0,0,12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""20180120063000-1245""","""yahoo.com""","""https://www.yahoo.com/news/del…","""UNREST_CRACKDOWN;TAX_WORLDBIRD…","""2018-01-20 06:30:00""","""2018-01-20""",0,1,0,0,1,1,0,0,0,0,0,0,0,0,"""z""",-0.879397,4.271357,5.150754,9.422111,25.628141,1.130653,715,0,0,1,0,17,0,0,1,2,…,0,0,0,2,0,0,0,1,2,0,0,3,14,3,3,2,9,0,0,1,10,4,1,0,0,50,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Extracting headlines from URLs when article title is empty
df['V2DOCUMENTIDENTIFIER'].value_counts()

V2DOCUMENTIDENTIFIER,count
str,u32
"""https://www.havasunews.com/new…",1
"""https://www.mcall.com/2024/07/…",1
"""https://www.africaleader.com/n…",1
"""https://www.travelawaits.com/2…",1
"""https://www.hppr.org/2022-12-2…",1
…,…
"""https://www.christianpost.com:…",1
"""http://www.yourerie.com/news/w…",1
"""https://beatricedailysun.com/n…",1
"""https://www.rgj.com/story/opin…",1


In [37]:
df['url'] = df['V2DOCUMENTIDENTIFIER']
# Remove the protocol (http:// or https://) and the domain name
df = df.with_columns(
    pl.col('url').str.replace(r'^(https?://)', '', literal=True)
)
df['url']

TypeError: DataFrame object does not support `Series` assignment by index

Use `DataFrame.with_columns`.

In [8]:
# Save df to pickle file
with open('../../data/processed/gdelt_intermediate_cleaned.pkl', 'wb') as f:
    pickle.dump(df, f)

In [19]:
df_themes=df['V1THEMES'].to_pandas()


In [20]:
df_themes = df_themes.str.split(';')
df_themes = df_themes.explode()
df_themes.dropna()

0                     TAX_WORLDREPTILES
0          TAX_WORLDREPTILES_SIDEWINDER
0                                AFFECT
0                            TAX_FNCACT
0                     TAX_FNCACT_PILOTS
                       ...             
1250841            TAX_FNCACT_SPOKESMAN
1250841                 TAX_FNCACT_KING
1250841                PUBLIC_TRANSPORT
1250841          TAX_FNCACT_SPOKESWOMAN
1250841                                
Name: V1THEMES, Length: 45681859, dtype: object

In [21]:
df_themes.value_counts()

V1THEMES
                                    1177679
TAX_FNCACT                          1074944
TAX_ETHNICITY                        735254
WB_135_TRANSPORT                     721094
SOC_POINTSOFINTEREST                 647866
                                     ...   
TAX_WORLDLANGUAGES_LAUBE                  1
TAX_DISEASE_HEPATITIS_E                   1
TAX_FNCACT_BOOK_AGENTS                    1
TAX_WORLDLANGUAGES_LUXEMBOURGISH          1
WB_2162_HEALTHY_AGING                     1
Name: count, Length: 11336, dtype: int64