In [115]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
# NOTE(review): this silences every warning for the rest of the session, including
# pandas FutureWarnings/deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [116]:
# Load the processed stock data and collect the distinct values of its index.
# NOTE(review): presumably the index holds the timestamps to keep — confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by
# this project.
# The path uses the same lowercase 'data/processed' casing as the other paths in this
# notebook so the cell does not rely on a case-insensitive filesystem.
with open('../../data/processed/stock_data_simple.pkl', 'rb') as f:
    stock_data = pickle.load(f)
times = list(stock_data.index.unique())

In [117]:
# Read the combined GDELT CSV into a polars DataFrame
df = pl.read_csv('../../data/processed/gdelt_combined_20250610_1716.csv')

In [118]:
# Drop every column whose minimum equals its maximum, i.e. columns that hold a single
# constant value (all-zero columns are one such case).
constant_cols = [name for name in df.columns if df[name].min() == df[name].max()]
df = df.drop(constant_cols)
df.shrink_to_fit(in_place=True)

# Convert the remaining columns to smaller data types
df = df.select(pl.all().shrink_dtype())

In [119]:
# Drop duplicate rows while keeping the remaining rows in their original order
df = df.unique(maintain_order=True)

In [120]:
# Titles that are probably home pages or other non-article content
titles_to_remove = [
    'News briefs',
    'Latest Articles',
    'The Nashville Ledger',
    'Today in History',
    'National News - Media One Radio Group (WWSE | WJTN | WHUG | WKSN | WQFX',
    "Aero-News Network: The aviation and aerospace world's daily/real-time news and information service",
    'Drake & 21 Savage Add More Texas Concert Dates Due To High Demand',
    'Stock Market | FinancialContent Business Page',
    'Radio Station WHMI 93.5 FM &#x2014; Livingston County Michigan News, Weather, Traffic, Sports, School Updates, and the Best Classic Hit',
    'National - KSYL-AM',
    'Business Highlights',
    'National News - 1540 WADK Newport',
    'National - Carroll Broadcasting Inc.',
    'ABC National - WOND',
    "Breaking National News - 92.7-FM TheDRIVE - Bob & Tom Mornings, Central New York's Best Rock All Day",
    "ABC - National News - Xtra 99.1 FM - Today's Hits and Yesterday's Favorites",
    "Ed Bruce, Legendary Country Songwriter, 'Maverick' Actor, Dead At 81",
    "ABC National News - Beach 95.1 - WBPC Panama City Beach Greatest Hits of the 60s, 70s & 80s",
    "CES gadget show: How watching TV will change in the 2020s",
    "Despite business warnings, GOP moves ahead with voting bills",
    "KTBB.com - News Weather Talk",
    "AP Story",
    "SRN - US News - Taylorville Daily News",
]

title_col = pl.col('article_title')

df = (
    df
    # Missing titles receive the placeholder 'z'
    .with_columns(title_col.fill_null('z'))
    # Drop rows whose title is on the block list
    .filter(~title_col.is_in(titles_to_remove))
    # Drop rows whose title contains the literal text 'AP News in Brief at'
    .filter(~title_col.str.contains('AP News in Brief at', literal=True))
)

In [121]:
# Number of rows per source site, converted to pandas for display
site_counts = df.get_column('V2SOURCECOMMONNAME').value_counts().to_pandas()
site_counts

Unnamed: 0,V2SOURCECOMMONNAME,count
0,montreal.citynews.ca,2
1,gcaptain.com,11
2,ukrinform.net,2
3,suntci.com,3
4,theboot.com,29
...,...,...
14861,heritageradionetwork.org,3
14862,proplayerinsiders.com,1
14863,power923.com,5
14864,hardwoodparoxysm.com,7


In [122]:
# Remove sites that are unlikely to report price-influencing stories (for example,
# sites that write about cheap flights or travel points). A block list is used rather
# than a fixed set of allowed sites so that local news stays in.
sites_to_remove = [
    'iheart.com',
    'thepointsguy.com',
    'eturbonews.com',
    'indiatimes.com',
    'forimmediaterelease.net',
    'travelmarketreport.com',
]

df = df.filter(~pl.col('V2SOURCECOMMONNAME').is_in(sites_to_remove))

In [123]:
# Convert the polars frame to pandas; from here on `df` is a pandas DataFrame.
# NOTE(review): re-running this cell without re-running the cells above fails, because
# a pandas DataFrame has no `to_pandas` method.
df=df.to_pandas()
df

Unnamed: 0,GKGRECORDID,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,...,v42.2; SCOREDVALUE; care_p,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent
0,20230221030000-453,aero-news.net,http://aero-news.net/index.cfm?do=main.textpos...,TAX_WORLDREPTILES;TAX_WORLDREPTILES_SIDEWINDER...,2023-02-21 03:00:00,2023-02-21,0,1,0,0,...,0.101615,0.094112,0.084485,0.096285,0.065091,-0.117746,-0.047186,-0.009921,-0.062596,-0.075467
1,20250117234500-581,houstonchronicle.com,http://www.houstonchronicle.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_OFFICIALS;LEADER;USPEC_P...,2025-01-17 23:45:00,2025-01-17,0,1,0,0,...,0.078913,0.075865,0.081348,0.085938,0.067908,-0.102076,-0.046182,-0.011943,-0.043249,-0.057695
2,20240527093000-676,itechpost.com,http://www.itechpost.com/articles/122528/20240...,TAX_FNCACT;TAX_FNCACT_EMPLOYEE;EPU_POLICY;EPU_...,2024-05-27 09:30:00,2024-05-27,0,1,1,0,...,0.109993,0.101007,0.088567,0.090690,0.078992,-0.115803,-0.063174,-0.039071,-0.037974,-0.090853
3,20250410171500-386,1025kiss.com,https://1025kiss.com/ixp/175/p/lukas-first-gam...,,2025-04-10 17:15:00,2025-04-10,0,1,0,0,...,0.098206,0.092551,0.084106,0.084379,0.078339,-0.094771,-0.025998,0.001789,-0.025604,-0.013816
4,20241120131500-415,710keel.com,https://710keel.com/ixp/182/p/spirit-airlines-...,ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_...,2024-11-20 13:15:00,2024-11-20,0,1,0,0,...,0.109762,0.106656,0.100371,0.092308,0.088900,-0.050954,-0.023068,0.021486,0.009874,-0.006657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150816,20221216140000-613,wsiu.org,https://www.wsiu.org/2022-12-16/they-flew-to-n...,TAX_WORLDMAMMALS;TAX_WORLDMAMMALS_DOG;TAX_ETHN...,2022-12-16 14:00:00,2022-12-16,0,1,0,0,...,0.096294,0.093640,0.083392,0.071799,0.066122,-0.100614,-0.043287,-0.001136,-0.007722,-0.044429
1150817,20190719014500-2143,wtol.com,https://www.wtol.com/article/news/nation-world...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1150818,20190719014500-887,wwltv.com,https://www.wwltv.com/article/news/nation-worl...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1150819,20180120063000-1245,yahoo.com,https://www.yahoo.com/news/delta-apos-cracking...,UNREST_CRACKDOWN;TAX_WORLDBIRDS;TAX_WORLDBIRDS...,2018-01-20 06:30:00,2018-01-20,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [124]:
# Inspect the article URLs: headlines are extracted from them below for rows
# whose article title is empty
df['V2DOCUMENTIDENTIFIER'].value_counts()

V2DOCUMENTIDENTIFIER
https://www.newsnow.co.uk/h/World+News/US/States/Texas/Business                       3
https://www.wyomingpublicmedia.org/people/marisa-penaloza                             3
https://www.aerospace-technology.com/comment/airline-industry-crossroads/             3
https://americanfreepress.net/friendly-skies-look-much-like-americas-police-state/    3
https://www.nbcwashington.com/tag/san-diego/                                          3
                                                                                     ..
https://biztoc.com/x/3da0102d03022012                                                 1
https://biztoc.com/x/3f8e8e21929e86d1                                                 1
https://biztoc.com/x/700e89029304a44f                                                 1
https://biztoc.com/x/f16c18a1d37bd1d3                                                 1
https://biztoc.com/x/f927d7d43c58482f                                                 1
Name: count

In [125]:
# Build a headline-like slug from each article URL and store it in df['url'].
# Every step assigns its result back instead of calling
# `df['url'].replace(..., inplace=True)`: that chained in-place call operates on a
# temporary Series and no longer updates `df` once pandas Copy-on-Write is active.
url_slug = df['V2DOCUMENTIDENTIFIER']

# Remove the protocol (http:// or https://), then the domain name (everything up to
# and including the first '/'), then strings of at least 7 digits.
for pattern in (r'^(https?://)', r'^[^/]+/', r'\d{7,}'):
    url_slug = url_slug.replace(pattern, '', regex=True)

# Path fragments that are deleted wherever they occur.
# NOTE(review): the entries are applied as regular expressions, so the '.' in '.html'
# and '.htm' matches any character, and short entries such as 'us/' also match inside
# longer segments (e.g. 'bus/') — confirm this is intended.
words_to_remove = ['news/', 'article/', 'forum/', 'entertainment/', 'stories/', 'national/',
                   'national_news/', 'story/', 'travel/', 'articles/', 'us/', 'world/',
                   'world-news', 'blog/', 'nation-world/', 'region/', 'post/', 'recommends/',
                   'headlines/', 'business/', 'ap/', 'business-economy/', '.html', '.htm', 'x/'
]

for word in words_to_remove:
    url_slug = url_slug.replace(word, '', regex=True)

# Remove anything that looks like a date
for pattern in (r'\d{4}/\d{2}/\d{2}', r'\d{4}-\d{2}-\d{2}', r'20[2][0-9][01][0-9][0-9]{2}'):
    url_slug = url_slug.replace(pattern, '', regex=True)

# Turn the separators / - _ . ? into spaces, then collapse every run of spaces into a
# single space (the earlier pattern r'  ' only merged pairs, so a run of three or more
# spaces still left a double space behind).
url_slug = url_slug.replace(r'[/\-_.?]', ' ', regex=True)
url_slug = url_slug.replace(r' +', ' ', regex=True)

url_slug = url_slug.str.strip().str.lower()

# Slugs that are exactly equal to one of these boilerplate strings are blanked out.
# NOTE(review): entries that contain '_' or start with a space can never match, because
# underscores were replaced by spaces and the slug was stripped above.
words_to_remove = ['syndicated id=', 'article', 'usubmit', 'nation article', 'ap', 'nation ', 
                   'news briefs t=','content','viewtopic php f=3&t=',' cfm c_id=3&objectid=',' cfm c_id=2&objectid=',
                   'national','latest','cfm c_id=7&objectid=','story aspx id=','post_type=news&p=','latest','world us canada',
                   'npr story storyid=','p=','tag * index more=','latest article'
]

url_slug = url_slug.mask(url_slug.isin(words_to_remove), '')

# NOTE(review): this deletes 'zz ' anywhere in the slug, including at the end of words
# such as 'jazz ' — confirm intent.
url_slug = url_slug.replace(r'zz ', '', regex=True)

df['url'] = url_slug.str.strip()

df['url'].value_counts()

url
                                                                                18962
article cfm c id=3&objectid=                                                      485
article cfm c id=2&objectid=                                                      431
sheriff naked man threatened to bomb florida airport                              349
national article                                                                  331
                                                                                ...  
fl orlando ap online ground max jets could contribute to higher summer fares        1
bad bunny concert san antonio php                                                   1
320595 jetblue goes hostile in takeover bid of spirit                               1
panicking plane passenger was almost sucked into jet engine                         1
index cfm do=main textpost&id=aaace824 9709 49a0 924e e784fec93b6b                  1
Name: count, Length: 727479, dtype: int64

In [126]:
# Lower-case the titles and replace the 'z' placeholder (empty title) with the URL slug.
# Results are assigned back to the column instead of using
# `df['article_title'].replace(..., inplace=True)`: that chained in-place call works on
# a temporary Series and no longer updates `df` once pandas Copy-on-Write is active.
title = df['article_title'].str.lower()
title = title.mask(title == 'z', df['url'])

# Replace the punctuation characters / - . ? ' , with spaces
title = title.replace(r"[/\-.?',]", ' ', regex=True)

# Remove strings of at least 8 word characters that contain both letters and numbers
title = title.replace(r'\b(?=\w*[a-zA-Z])(?=\w*[0-9])\w{8,}\b', ' ', regex=True)

# Collapse runs of spaces into a single space
title = title.replace(r' +', ' ', regex=True)

# A title that is still exactly the placeholder becomes empty
title = title.mask(title == 'z', '')

df['article_title'] = title.str.strip().str.lower()

In [127]:
# The helper column 'url' is no longer needed
df = df.drop(columns=['url'])

In [128]:
# (rows, columns) of the frame at this point
df.shape

(1150821, 2478)

In [129]:
title = df['article_title']

# Title prefixes that cause a record to be dropped
junk_prefixes = (
    'article cfm c id=', 'external php s=', 'starttime=', 'post type=news&p=',
    'h article=', 'p=', 'page=', 'default aspx', 'syndicated id=', 'article aspx id=',
)

# Titles that cause a record to be dropped when they match exactly
junk_titles = [
    'national article', 'business', 'local article', 'national',
    'news and closings national', 'abc business', 'abc', 'national hits fm',
    'world hits fm', 'story', 'id',
]

drop_mask = (
    title.str.startswith(junk_prefixes, na=False)
    | title.isin(junk_titles)
    # title consists only of numbers separated by whitespace
    | title.str.match(r'^\s*[0-9]+(\s+[0-9]+)*\s*$', na=False)
    # title is empty once surrounding whitespace is removed
    | title.str.strip().eq('')
)

df = df[~drop_mask]

In [130]:
# (rows, columns) of the frame at this point
df.shape

(1136208, 2478)

In [131]:
# Random popular article that mentions an airline but only as an aside
# https://nationalpost.com/pmn/news-pmn/mighty-mississippi-scientists-use-model-in-land-loss-fight
# na=False makes a missing title count as "no match"; without it str.contains returns
# NaN for that row and the `~` negation raises. The other title filters in this
# notebook already pass na=False.
for phrase in ('mighty mississippi', 'mississippi model'):
    df = df[~df['article_title'].str.contains(phrase, na=False)]

In [132]:
# Remove 9/11 and related historical / retrospective articles
title = df['article_title']

# Phrases that disqualify a title when they appear anywhere in it
retrospective_pattern = '|'.join([
    '9 11', 'september 11', 'sept 11', 'on this day',
    'the year in', 'year in review', 'lessons learned in', 'top stories',
])

# Prefixes that disqualify a title when it starts with them
history_prefixes = ('history', 'historical', 'today in history')

is_retrospective = (
    title.str.contains(retrospective_pattern, na=False)
    | title.str.startswith(history_prefixes, na=False)
)

df = df[~is_retrospective]

In [133]:
# (rows, columns) of the frame at this point
df.shape

(1122712, 2478)

In [134]:
# Buzzwords in an article title that indicate a disaster
buzzwords = [
    'disaster', 'explosion', 'explode', 'accident', 'crash', 'collision', 'catastrophe',
    'dead', 'death', 'fatal', 'injury', 'injured', 'victim', 'casualty',
    'killed',
]

# 'disaster' is True when the title contains any buzzword as a substring
# (case-insensitive); missing titles are flagged False
disaster_pattern = '|'.join(buzzwords)
df['disaster'] = df['article_title'].str.contains(disaster_pattern, case=False, na=False)

# How often each flagged title occurs
df.loc[df['disaster'], 'article_title'].value_counts()

article_title
plane stolen by suicidal employee crashes near seattle                                                                                             257
faa orders fan blade inspections after jet engine explosion                                                                                        237
11 people seriously injured amid turbulence on hawaii flight                                                                                       229
everyone aboard an american airlines jet that collided with an army helicopter is feared dead                                                      222
                                                                                                                                                  ... 
trump says american airlines army helicopter crash should have been prevented                                                                        1
passenger jet and army helicopter collide and crash in washington               

In [135]:
# Show the article titles ordered by the 'datetime' column (df itself is not reordered)
df.sort_values(by='datetime')['article_title']

534867    local davenport man returns from puerto rico l...
534884    grandmother in wheelchair falls down escalator...
991301    id bay area new years eve celebrations off to ...
914807    politifact florida here s how st petersburg ma...
914858    washington spokane alaska airlines sued after ...
                                ...                        
301553    solo travelers pay more than groups for econom...
235184    israelis will remember which travel companies ...
235141    scott kirby ceo of united airlines says budget...
37507               ap strange summarybrief at 5:07 p m edt
49622     solo travelers pay more than groups for econom...
Name: article_title, Length: 1122712, dtype: object

In [136]:
# Display the frame, which now includes the 'disaster' column
df

Unnamed: 0,GKGRECORDID,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,...,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent,disaster
0,20230221030000-453,aero-news.net,http://aero-news.net/index.cfm?do=main.textpos...,TAX_WORLDREPTILES;TAX_WORLDREPTILES_SIDEWINDER...,2023-02-21 03:00:00,2023-02-21,0,1,0,0,...,0.094112,0.084485,0.096285,0.065091,-0.117746,-0.047186,-0.009921,-0.062596,-0.075467,False
1,20250117234500-581,houstonchronicle.com,http://www.houstonchronicle.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_OFFICIALS;LEADER;USPEC_P...,2025-01-17 23:45:00,2025-01-17,0,1,0,0,...,0.075865,0.081348,0.085938,0.067908,-0.102076,-0.046182,-0.011943,-0.043249,-0.057695,False
2,20240527093000-676,itechpost.com,http://www.itechpost.com/articles/122528/20240...,TAX_FNCACT;TAX_FNCACT_EMPLOYEE;EPU_POLICY;EPU_...,2024-05-27 09:30:00,2024-05-27,0,1,1,0,...,0.101007,0.088567,0.090690,0.078992,-0.115803,-0.063174,-0.039071,-0.037974,-0.090853,False
3,20250410171500-386,1025kiss.com,https://1025kiss.com/ixp/175/p/lukas-first-gam...,,2025-04-10 17:15:00,2025-04-10,0,1,0,0,...,0.092551,0.084106,0.084379,0.078339,-0.094771,-0.025998,0.001789,-0.025604,-0.013816,False
4,20241120131500-415,710keel.com,https://710keel.com/ixp/182/p/spirit-airlines-...,ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_...,2024-11-20 13:15:00,2024-11-20,0,1,0,0,...,0.106656,0.100371,0.092308,0.088900,-0.050954,-0.023068,0.021486,0.009874,-0.006657,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150816,20221216140000-613,wsiu.org,https://www.wsiu.org/2022-12-16/they-flew-to-n...,TAX_WORLDMAMMALS;TAX_WORLDMAMMALS_DOG;TAX_ETHN...,2022-12-16 14:00:00,2022-12-16,0,1,0,0,...,0.093640,0.083392,0.071799,0.066122,-0.100614,-0.043287,-0.001136,-0.007722,-0.044429,False
1150817,20190719014500-2143,wtol.com,https://www.wtol.com/article/news/nation-world...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False
1150818,20190719014500-887,wwltv.com,https://www.wwltv.com/article/news/nation-worl...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False
1150819,20180120063000-1245,yahoo.com,https://www.yahoo.com/news/delta-apos-cracking...,UNREST_CRACKDOWN;TAX_WORLDBIRDS;TAX_WORLDBIRDS...,2018-01-20 06:30:00,2018-01-20,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False


In [137]:
# Number of remaining rows per source site
df['V2SOURCECOMMONNAME'].value_counts()

V2SOURCECOMMONNAME
yahoo.com                 28071
msn.com                   25359
reuters.com                9459
prnewswire.com             6283
dailymail.co.uk            5371
                          ...  
inc-asean.com                 1
b945live.com                  1
computerdealernews.com        1
thecherokeean.com             1
now.guidetoiceland.is         1
Name: count, Length: 14703, dtype: int64

In [138]:
# Persist the intermediate cleaned frame as a pickle file
output_path = '../../data/processed/gdelt_intermediate_cleaned.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(df, out_file)