In [2]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / future-behaviour warnings raised by the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the combined GDELT CSV into a polars DataFrame.
gdelt_csv_path = '../../data/processed/gdelt_combined_20250610_1716.csv'
df = pl.read_csv(gdelt_csv_path)

In [4]:
# (rows, columns) of the frame as loaded.
(df.height, df.width)

(1269745, 3019)

In [5]:
# Columns whose name starts with "v" and contains "v19"; they are removed from df further down.
v_cols_drop = [name for name in df.columns if name.startswith("v") and "v19" in name]

In [6]:
# http://data.gdeltproject.org/documentation/GCAM-MASTER-CODEBOOK.TXT
# Every column whose name starts with "c" is a candidate for removal.
c_columns = [name for name in df.columns if name.startswith("c")]

# Fragments identifying the "c" columns to retain. A column is kept when any fragment
# occurs anywhere in its name (substring match, not prefix match).
c_cols_to_keep = [
    "c3.", "c4.1;", "c4.16", "c6.", "c16.60", "c41.",
    "c18.1;", "c18.2;", "c18.21;", "c18.30;", "c18.33;", "c18.50;", "c18.66;",
    "c18.68;", "c18.83;", "c18.121;", "c18.137;", "c18.157;", "c18.164;", "c18.254;",
]

# Matches are gathered fragment by fragment, so the list follows the fragment order
# and may contain a column more than once.
c_cols_keep = [
    name
    for fragment in c_cols_to_keep
    for name in c_columns
    if fragment in name
]
# Additionally keep every "c18." column whose name mentions "ECON_".
c_cols_keep += [name for name in c_columns if "c18." in name and "ECON_" in name]

# Whatever "c" column was not selected above is scheduled for removal.
_c_keep_lookup = set(c_cols_keep)
c_cols_drop = [name for name in c_columns if name not in _c_keep_lookup]

In [7]:
# Remove both groups of unwanted columns in one chained expression.
df = df.drop(v_cols_drop).drop(c_cols_drop)

In [8]:
# Drop every constant column, i.e. one whose minimum equals its maximum.
# This covers the all-zero columns, but any other single-valued column is removed as well.
constant_cols = [name for name in df.columns if df[name].min() == df[name].max()]
df = df.drop(constant_cols)
df.shrink_to_fit(in_place=True)

# Convert to smaller data types
df = df.select(pl.all().shrink_dtype())

In [9]:
# Remove a list of titles that are probably home pages or other non-article content
titles_to_remove = [
    'News briefs',
    'Latest Articles',
    'The Nashville Ledger',
    'Today in History',
    'National News - Media One Radio Group (WWSE | WJTN | WHUG | WKSN | WQFX',
    "Aero-News Network: The aviation and aerospace world's daily/real-time news and information service",
    'Drake & 21 Savage Add More Texas Concert Dates Due To High Demand',
    'Stock Market | FinancialContent Business Page',
    'Radio Station WHMI 93.5 FM &#x2014; Livingston County Michigan News, Weather, Traffic, Sports, School Updates, and the Best Classic Hit',
    'National - KSYL-AM',
    'Business Highlights',
    'Business Highlights',
    'National News - 1540 WADK Newport',
    'National - Carroll Broadcasting Inc.',
    'ABC National - WOND',
    "Breaking National News - 92.7-FM TheDRIVE - Bob & Tom Mornings, Central New York's Best Rock All Day",
    "ABC - National News - Xtra 99.1 FM - Today's Hits and Yesterday's Favorites",
    "Ed Bruce, Legendary Country Songwriter, 'Maverick' Actor, Dead At 81",
    "ABC National News - Beach 95.1 - WBPC Panama City Beach Greatest Hits of the 60s, 70s & 80s",
    "CES gadget show: How watching TV will change in the 2020s",
    "Despite business warnings, GOP moves ahead with voting bills",
    "KTBB.com - News Weather Talk",
    "AP Story",
    "SRN - US News - Taylorville Daily News",
]

df = (
    df
    # Missing titles get the placeholder 'z' so later steps can recognise them.
    .with_columns(pl.col('article_title').fill_null('z'))
    # Drop rows whose title is exactly one of the entries listed above.
    .filter(~pl.col('article_title').is_in(titles_to_remove))
    # Drop rows whose title contains the literal text 'AP News in Brief at'.
    .filter(~pl.col('article_title').str.contains('AP News in Brief at', literal=True))
)

In [10]:
# Filter to reliable news sites. I don't want to limit to a certain set of sites because I want to keep local news in
# NOTE(review): despite the sentence above, the filter below keeps ONLY the sources listed
# in sites_to_keep — confirm which behaviour is intended.

# The idea is to remove sites that are unlikely to report about price-influencing stories. For example, some sites report about cheap flights or travel points
sites_to_keep = [
    'yahoo.com',
    'msn.com',
    'fool.com',
    'reuters.com',
    'seekingalpha.com',
    'themarketsdaily.com',
    'forbes.com',
    'investing.com',
    'cnn.com',
    'marketscreener.com',
    'washingtonpost.com',
    'nytimes.com',
    'investors.com',
    'tickerreport.com',
    'insidermonkey.com',
    'morningstar.com',
    'abc7news.com',
    'businessinsider.com',
    'prnewswire.com',
    'bnnbloomberg.ca',
    'zerohedge.com',
    'nasdaq.com',
    'marketwatch.com',
    'abc7ny.com',
    'streetinsider.com',
    'apnews.com',
    'econintersect.com',
    'foxbusiness.com',
    'cnbc.com',
]

# Keep a row only when its source name is on the allow-list.
df = df.filter(pl.col('V2SOURCECOMMONNAME').is_in(sites_to_keep))

In [11]:
# Convert the polars frame to pandas (the remaining cells use the pandas API), then display it.
df = df.to_pandas()
df

Unnamed: 0,GKGRECORDID,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,...,v42.2; SCOREDVALUE; care_p,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent
0,20250410171500-1578,abc7news.com,https://abc7news.com/post/pluckys-home-grandmo...,AFFECT;CRISISLEX_C04_LOGISTICS_TRANSPORT;CRISI...,2025-04-10 17:15:00,2025-04-10,0,1,0,0,...,0.107544,0.102134,0.098454,0.098136,0.079094,-0.056323,-0.020730,0.014545,-0.042246,-0.050891
1,20231120234500-1109,apnews.com,https://apnews.com/article/business-highlights...,TAX_FNCACT;TAX_FNCACT_CEO;TAX_FNCACT_EMPLOYEES...,2023-11-20 23:45:00,2023-11-20,0,1,0,0,...,0.090224,0.094559,0.085987,0.086894,0.071236,-0.104376,-0.018650,0.000136,-0.011729,-0.016433
2,20230421174500-864,yahoo.com,https://finance.yahoo.com/news/11-best-fast-mo...,ECON_STOCKMARKET;TAX_FNCACT;TAX_FNCACT_TRADERS...,2023-04-21 17:45:00,2023-04-21,0,0,0,0,...,0.087240,0.105688,0.092964,0.085562,0.075519,-0.084906,-0.002264,0.061741,0.009324,-0.011657
3,20230828170000-1294,yahoo.com,https://finance.yahoo.com/news/american-airlin...,WB_135_TRANSPORT;TAX_ETHNICITY;TAX_ETHNICITY_A...,2023-08-28 17:00:00,2023-08-28,0,1,1,0,...,0.103764,0.112283,0.089791,0.105052,0.080620,-0.106268,-0.050945,-0.055337,-0.083869,-0.081310
4,20240423193000-1659,yahoo.com,https://finance.yahoo.com/news/jetblue-airways...,TAX_FNCACT;TAX_FNCACT_DIRECTOR;TAX_FNCACT_INVE...,2024-04-23 19:30:00,2024-04-23,0,0,0,0,...,0.097068,0.095004,0.090157,0.089376,0.075623,-0.025508,0.023856,0.035017,0.038171,-0.010678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123317,20200302183000-410,prnewswire.com,https://www.prnewswire.com:443/news-releases/w...,WB_165_AIR_TRANSPORT;WB_135_TRANSPORT;WB_164_M...,2020-03-02 18:30:00,2020-03-02,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
123318,20220817191500-372,streetinsider.com,https://www.streetinsider.com/Globe+Newswire/A...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;TAX_FNCAC...,2022-08-17 19:15:00,2022-08-17,0,1,0,0,...,0.081092,0.090742,0.071966,0.078858,0.069322,-0.087616,0.010403,0.085699,-0.009925,0.020674
123319,20220516161500-649,streetinsider.com,https://www.streetinsider.com/Market+Check/Fut...,ECON_STOCKMARKET;EPU_ECONOMY;EPU_ECONOMY_HISTO...,2022-05-16 16:15:00,2022-05-16,0,1,0,0,...,0.098366,0.097351,0.081841,0.083830,0.073628,-0.106435,-0.050484,0.007180,-0.025901,-0.016121
123320,20220516161500-83,streetinsider.com,https://www.streetinsider.com/Market+Check/Wal...,EPU_ECONOMY;EPU_ECONOMY_HISTORIC;USPEC_POLICY1...,2022-05-16 16:15:00,2022-05-16,0,1,0,0,...,0.098244,0.096695,0.081530,0.084110,0.073321,-0.109945,-0.054062,0.002389,-0.030918,-0.018252


In [12]:
# Extracting headlines from URLs when article title is empty
# Start by looking at how often each document URL occurs.
url_counts = df['V2DOCUMENTIDENTIFIER'].value_counts()
url_counts

V2DOCUMENTIDENTIFIER
https://www.cnn.com/2015/08/21/us/carl-icahn-fast-facts/                                                         3
https://www.foxbusiness.com/category/fox-news-airlines                                                           3
https://www.cnn.com/2013/07/09/world/commercial-passenger-airplane-crashes-fast-facts/                           3
https://www.businessinsider.com/personal-finance/capital-one-transfer-partners                                   3
https://www.businessinsider.com/personal-finance/best-credit-cards                                               3
                                                                                                                ..
https://www.washingtonpost.com/travel/2021/05/12/faa-fine-mask-jetblue-covid/                                    1
https://www.zerohedge.com/news/2019-04-22/dhs-face-scan-97-international-travelers-within-four-years             1
http://www.msn.com/en-nz/news/world/europe-economies-show-r

In [13]:
# Build a rough, lower-cased headline from each document URL and store it in df['url'].
#
# The clean-up runs on a standalone Series that is assigned back to the frame once, at the
# end. The earlier form, df['url'].replace(..., inplace=True), is chained in-place mutation:
# under pandas Copy-on-Write it modifies a temporary object and leaves df['url'] unchanged.
url = df['V2DOCUMENTIDENTIFIER']

# Remove the protocol (http:// or https://)
url = url.replace(r'^(https?://)', '', regex=True)

# Remove the host name: everything up to and including the first '/'
url = url.replace(r'^[^/]+/', '', regex=True)

# remove strings of at least 7 numbers
url = url.replace(r'\d{7,}', '', regex=True)

# Path fragments stripped wherever they occur, one after another in list order.
# They are matched literally, so the '.' in '.html' / '.htm' is a dot and not a regex wildcard.
# NOTE(review): these are plain substring matches, so e.g. 'us/' is also removed from the
# end of a longer path segment — confirm that is acceptable.
words_to_remove = ['news/', 'article/', 'forum/', 'entertainment/', 'stories/', 'national/',
                   'national_news/', 'story/', 'travel/', 'articles/', 'us/', 'world/',
                   'world-news', 'blog/', 'nation-world/', 'region/', 'post/', 'recommends/',
                   'headlines/', 'business/', 'ap/', 'business-economy/', '.html', '.htm', 'x/'
]

for word in words_to_remove:
    url = url.str.replace(word, '', regex=False)

# remove anything that looks like a date
for date_pattern in (r'\d{4}/\d{2}/\d{2}', r'\d{4}-\d{2}-\d{2}', r'20[2][0-9][01][0-9][0-9]{2}'):
    url = url.replace(date_pattern, '', regex=True)

# Turn URL separators ('/', '-', '_', '.', '?') into spaces ...
url = url.replace(r'[/\-_.?]', ' ', regex=True)
# ... and collapse every run of spaces to one (a single '  ' -> ' ' pass leaves doubles
# behind whenever three or more spaces are adjacent).
url = url.replace(r' {2,}', ' ', regex=True)

url = url.str.strip().str.lower()

# Values that are, in their entirety, one of these leftovers carry no headline: blank them.
# NOTE(review): entries containing '_' or leading/trailing spaces can never match here,
# because underscores were replaced by spaces and the value was stripped above.
words_to_remove = ['syndicated id=', 'article', 'usubmit', 'nation article', 'ap', 'nation ',
                   'news briefs t=','content','viewtopic php f=3&t=',' cfm c_id=3&objectid=',' cfm c_id=2&objectid=',
                   'national','latest','cfm c_id=7&objectid=','story aspx id=','post_type=news&p=','latest','world us canada',
                   'npr story storyid=','p=','tag * index more=','latest article'
]

url = url.mask(url.isin(words_to_remove), '')

# NOTE(review): this removes 'zz ' anywhere in the string, including at the end of a
# word that ends in 'zz' — confirm that is intended.
url = url.replace(r'zz ', '', regex=True)

df['url'] = url.str.strip()

In [14]:
# replace article title with url if article title is empty
#
# The title is cleaned as a standalone Series and written back once. The earlier
# df['article_title'].replace(..., inplace=True) calls are chained in-place mutation, which
# does not update the frame under pandas Copy-on-Write.
title = df['article_title'].str.lower()

# 'z' is the placeholder given to missing titles; use the URL-derived headline for those rows.
title = title.where(title != 'z', df['url'])

# Turn '/', '-', '.', '?', apostrophes and commas into spaces.
title = title.replace(r"[/\-.?',]", ' ', regex=True)

# remove strings of at least 8 characters that contain both letters and numbers
title = title.replace(r'\b(?=\w*[a-zA-Z])(?=\w*[0-9])\w{8,}\b', ' ', regex=True)

# Collapse runs of spaces.
title = title.replace(r' +', ' ', regex=True)

# A title that is still exactly the placeholder has no usable text: blank it.
title = title.mask(title == 'z', '')

df['article_title'] = title.str.strip().str.lower()

In [15]:
# The helper column is no longer needed once the titles have been filled in.
df = df.drop(columns=['url'])

In [16]:
# Drop records where article title starts with
unwanted_prefixes = [
    'article cfm c id=', 'external php s=', 'starttime=', 'post type=news&p=',
    'h article=', 'p=', 'page=', 'default aspx', 'syndicated id=', 'article aspx id=',
]
for prefix in unwanted_prefixes:
    has_prefix = df['article_title'].str.startswith(prefix, na=False)
    df = df.loc[~has_prefix]

# Drop records whose whole title is one of these generic labels.
unwanted_titles = [
    'national article', 'business', 'local article', 'national',
    'news and closings national', 'abc business', 'abc', 'national hits fm',
    'world hits fm', 'story', 'id',
]
df = df.loc[~df['article_title'].isin(unwanted_titles)]

# Drop records where article title is all numbers
numbers_only = df['article_title'].str.match(r'^\s*[0-9]+(\s+[0-9]+)*\s*$', na=False)
df = df.loc[~numbers_only]

# Drop records where article title is empty
is_blank = df['article_title'].str.strip().eq('')
df = df.loc[~is_blank]

In [17]:
# Random popular article that mentions an airline but as an aside
# https://nationalpost.com/pmn/news-pmn/mighty-mississippi-scientists-use-model-in-land-loss-fight
# The remaining phrases are further one-off titles to exclude.
#
# na=False treats a missing title as "does not contain the phrase"; without it the result
# holds NaN and the `~` below raises. regex=False matches each phrase as plain text.
for phrase in ['mighty mississippi', 'mississippi model', 'ted kaczynski', 'sexually harassed']:
    df = df[~df['article_title'].str.contains(phrase, na=False, regex=False)]

In [18]:
# Remove 9/11 and related historical articles
titles = df['article_title']
is_historical = pd.Series(False, index=df.index)

# Titles containing any of these phrases ...
for phrase in ['9 11', 'september 11', 'sept 11', 'on this day', 'the year in',
               'year in review', 'lessons learned in', 'top stories']:
    is_historical |= titles.str.contains(phrase, na=False)

# ... or starting with any of these words are flagged.
for prefix in ['history', 'historical', 'today in history']:
    is_historical |= titles.str.startswith(prefix, na=False)

df = df[~is_historical]

In [19]:
# Keep the first row for each GKGRECORDID, then show the resulting (rows, columns).
df = df.drop_duplicates(subset=['GKGRECORDID'])
df.shape

(121655, 144)

In [21]:
# Show every remaining column name as a plain list.
df.columns.tolist()

['GKGRECORDID',
 'V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'Tone',
 'Positive Score',
 'Negative Score',
 'Polarity',
 'Activity Reference Density',
 'Self/Group Reference Density',
 'Word Count',
 'c3.1; WORDCOUNT; NEGATIVE',
 'c3.2; WORDCOUNT; POSITIVE',
 'c3.3; WORDCOUNT; NEG_NEGATIVE',
 'c3.4; WORDCOUNT; NEG_POSITIVE',
 'c4.1; WORDCOUNT; MACROECONOMICS',
 'c4.16; WORDCOUNT; FINANCE',
 'c6.1; WORDCOUNT; Litigious',
 'c6.2; WORDCOUNT; ModalStrong',
 'c6.3; WORDCOUNT; ModalWeak',
 'c6.4; WORDCOUNT; Negative',
 'c6.5; WORDCOUNT; Positive',
 'c6.6; WORDCOUNT; Uncertainty',
 'v10.1; SCOREDVALUE; Positive (Scored Value)',
 'v10.2; SCOREDVALUE; Negative (Scored Value)',
 'v11

In [68]:
# Save df to pickle file
cleaned_pickle_path = '../../data/processed/gdelt_intermediate_cleaned_finance.pkl'
with open(cleaned_pickle_path, 'wb') as pickle_file:
    pickle.dump(df, pickle_file)