In [1]:
# Placeholder declaration of the upstream task names this notebook depends on.
# The next cell rebinds `upstream` to a dict keyed by these same names.
upstream = [
    'fetch_n_filter_gdelt_bq',
    'normalize_security_names',
]

In [2]:
# Parameters
# Concrete inputs/outputs for this run. `upstream` is rebound here from the
# placeholder list above to a dict: task name -> {"nb": executed notebook path,
# "data": data file path}. `product` holds the same two keys for this notebook.
# NOTE(review): this cell looks like it is injected by the pipeline runner
# rather than hand-written — confirm before editing it manually. The values
# are absolute paths on one user's machine, so they are not portable as-is.
upstream = {
    "normalize_security_names": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/normalize_security_names.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/normalized_security_names.csv",
    },
    "fetch_n_filter_gdelt_bq": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
    },
}
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/clean_gdelt_data.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
}


In [3]:
# your code here...
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
import time
import re
from time import time
import json
import ast
from src.utils import preprocess_text
from collections import Counter
from src.utils import fuzz_similarity, preprocess_text
from nltk.corpus import stopwords

In [4]:
# Allow up to 200 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 200)

# Make sure the NLTK resources are available locally, then build the set of
# English stop words.
nltk.download('stopwords')
nltk.download('omw-1.4')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiujdm2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aiujdm2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Locate the CSV files produced by the two upstream tasks.
gdelt_file_path = upstream['fetch_n_filter_gdelt_bq']['data']
securities_file_path = upstream['normalize_security_names']['data']

# Both files carry their index in the first column.
gdelt_df = pd.read_csv(gdelt_file_path, index_col=0)
# Securities without a former name are discarded right after loading.
securities_df = (
    pd.read_csv(securities_file_path, index_col=0)
    .dropna(subset=['former_name'])
)
securities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4477 entries, 0 to 9212
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cik          4477 non-null   int64 
 1   ticker       4477 non-null   object
 2   full_name    4477 non-null   object
 3   former_name  4477 non-null   object
dtypes: int64(1), object(3)
memory usage: 174.9+ KB


In [6]:
# Display the column labels of the raw GDELT frame.
gdelt_df.columns

Index(['GKGRECORDID', 'DATE', 'SourceCollectionIdentifier',
       'DocumentIdentifier', 'Locations', 'Persons', 'Organizations', 'Tone'],
      dtype='object')

In [7]:
# Preview the first rows of the securities table (cik, ticker, full_name, former_name).
securities_df.head()

Unnamed: 0,cik,ticker,full_name,former_name
0,1599298,SMMT,summit therapeutics,summit therapeutics
3,33934,CRF,cornerstone total return,eis
6,845611,GCV,gabelli convertible income securities,gabelli convertible securities
10,1839412,QTEK,qualtek services,roth ch acquisition iii
13,851520,EXPO,exponent,failure


In [8]:
# Discard rows with no extracted organization names, then rewrite the brand
# mentions Google / YouTube / Youtube to 'alphabet' and Facebook to
# 'meta platforms'.
# NOTE: replace() is applied to the whole frame with regex=True, so the
# substitution happens inside every string column, not only Organizations.
gdelt_df = (
    gdelt_df
    .dropna(subset=['Organizations'])
    .replace(
        to_replace=['Google', 'Facebook', 'YouTube', 'Youtube'],
        value=['alphabet', 'meta platforms', 'alphabet', 'alphabet'],
        regex=True,
    )
)

In [9]:
# Each organization mention is followed by ",<digits>" (the numeric position of
# the mention) and an optional ';' separator. Splitting on that pattern turns
# the raw string into a list holding only the names.
gdelt_df['Organizations'] = gdelt_df['Organizations'].map(
    lambda raw_orgs: re.split(r',\d+;?', raw_orgs)
)

In [10]:
def preprocess_orgs(x):
    """Apply the shared text preprocessing to one row's organization names.

    Thin wrapper that forwards ``x`` to ``preprocess_text`` with ``eng=True``
    so it can be handed directly to ``Series.apply``.
    """
    cleaned = preprocess_text(x, eng=True)
    return cleaned


gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(preprocess_orgs)

In [11]:
# Substitute former company names with their current full names; the two
# securities_df columns are paired positionally (former_name[i] -> full_name[i]).
# NOTE(review): by this point each Organizations cell presumably holds a list of
# names (earlier cells split and preprocess them), while Series.replace compares
# whole cell values — confirm this substitution actually matches anything; it
# may need to be applied per list element instead.
gdelt_df.Organizations = gdelt_df.Organizations.replace(to_replace=securities_df.former_name.to_list(), value=securities_df.full_name.to_list())

In [12]:
def count_orgs(x):
    """Tally how often each organization name occurs in one row.

    Parameters
    ----------
    x : sequence of names, or float
        The row's organization names. A plain ``float`` (how a missing cell
        shows up) or an empty sequence is handed back untouched.

    Returns
    -------
    collections.Counter mapping name -> number of occurrences, or ``x`` itself
    when it is a float or empty.
    """
    # Exact type check on purpose: only a built-in float is treated as missing.
    if type(x) is float:
        return x
    if len(x) == 0:
        return x
    return Counter(x)

# Replace each row's list of organization names with its per-name tally.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(count_orgs)

In [13]:
# Preprocess the securities' full names the same way as the organization names
# so the two can be compared directly.
securities_names = preprocess_text(securities_df.full_name)

# Keep only the counted organizations that match a known security name and are
# not the 'Tooshorttext' marker.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(
    lambda counts: {
        name: mentions
        for name, mentions in counts.items()
        if name in securities_names and name != 'Tooshorttext'
    }
)

# Drop rows left without any matching organization.
gdelt_df = gdelt_df[gdelt_df['Organizations'].str.len() != 0]

In [14]:
# Summarize the frame after the organization filter (row count, dtypes, non-null counts).
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8912 entries, 0 to 10551
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 8912 non-null   object
 1   DATE                        8912 non-null   int64 
 2   SourceCollectionIdentifier  8912 non-null   int64 
 3   DocumentIdentifier          8912 non-null   object
 4   Locations                   7523 non-null   object
 5   Persons                     7385 non-null   object
 6   Organizations               8912 non-null   object
 7   Tone                        8912 non-null   object
dtypes: int64(2), object(6)
memory usage: 626.6+ KB


In [15]:
def split_locations(location_list):
    """Pull the location name out of each '#'-delimited location string.

    Parameters
    ----------
    location_list : iterable of str, or float
        The location strings of one row. A plain ``float`` (how a missing
        cell shows up) is treated as "no locations".

    Returns
    -------
    list of str
        The second '#'-separated field (the full location name) of every
        string that contains at least one '#', in input order. Strings
        without a '#' are skipped.
    """
    # Exact type check on purpose: only a built-in float means "missing".
    if type(location_list) is float:
        return []

    fields_per_location = (entry.split('#') for entry in location_list)
    return [fields[1] for fields in fields_per_location if len(fields) > 1]
# Locations are separated by semicolons (;) and each location string is further
# separated by hashes (#). Split the raw value, keep only the location names,
# and convert to a set to remove duplicates.
gdelt_df['Locations'] = (
    gdelt_df['Locations']
    .str.split(';')
    .apply(split_locations)
    .map(set)
)

In [16]:
# Extract every "Capitalized Capitalized" word pair from the raw Persons string
# and de-duplicate the matches per row; missing values are left as they are.
gdelt_df['Persons'] = (
    gdelt_df['Persons']
    .str.findall(pat="[A-Z][a-z]+ [A-Z][a-z]+")
    .map(set, na_action='ignore')
)

In [17]:
# Break the comma-separated Tone string into a list of its fields.
gdelt_df['Tone'] = gdelt_df['Tone'].str.split(',')

In [18]:
# Promote the first four Tone fields to their own columns, in field order.
# The values are taken as-is from the Tone list (no type conversion).
tone_fields = ('AvgTone', 'PosScore', 'NegScore', 'Polarity')
for position, column_name in enumerate(tone_fields):
    # Bind the index as a default argument so each lambda keeps its own position.
    gdelt_df[column_name] = gdelt_df['Tone'].apply(lambda parts, idx=position: parts[idx])

In [19]:
# Tone has been unpacked into its own columns above; it is dropped together
# with the other columns that are not part of the cleaned output.
gdelt_df = gdelt_df.drop(
    columns=["Tone", "DATE", "SourceCollectionIdentifier", "DocumentIdentifier"]
)

In [20]:
# Preview a random sample of the cleaned rows. A fixed random_state keeps the
# displayed rows stable across Restart & Run All, and capping n at the frame
# length avoids the ValueError that sample() raises when asked for more rows
# than exist.
gdelt_df.sample(n=min(10, len(gdelt_df)), random_state=42)

Unnamed: 0,GKGRECORDID,Locations,Persons,Organizations,AvgTone,PosScore,NegScore,Polarity
431,20211220113000-727,"{American, Massachusetts, United States, Nigeria, South African}","{Elizabeth Warren, Elon Musk}","{'twitter': 1, 'tesla': 1}",1.70648464163823,3.41296928327645,1.70648464163823,5.11945392491468
5766,20220417070000-110,{Americans},,"{'tesla': 1, 'meta platforms': 1, 'nvidia': 1, 'netflix': 1}",1.98300283286119,4.95750708215297,2.97450424929179,7.93201133144476
7038,20220415140000-620,"{Mexico, El Salvador, Mexican, Honduras, Miami, Florida, United States, Portuguese}","{Aparna Chennapragada, Michael Saylor, Jack Mallers, Clifford Stoll, Indira Kempis, Cathie Wood, Francis Suarez, Warren Buffett}","{'tesla': 1, 'apple': 1, 'costco wholesale': 1}",1.02803738317757,2.80373831775701,1.77570093457944,4.57943925233645
7215,20220307233000-1386,"{Melbourne, Victoria, Australia, Tampa, Florida, United States, Berlin, Berlin, Germany, Russia, Saudi Arabia, Ukraine, Russian, Iran}","{Piper Sandler, Jared Dinges, Antony Blinken}","{'ralph lauren': 2, 'mosaic': 3, 'nasdaq': 10, 'schlumberger': 4}",-1.69312169312169,0.846560846560847,2.53968253968254,3.38624338624339
7281,20220304210000-1414,"{Ukraine, Russia, Russian, Texas, United States}","{Morgan Stanley, Raymond James, Gary Steele, Elon Musk}","{'morgan stanley': 2, 'newmont': 2, 'nasdaq': 13, 'broadcom': 3}",-0.116009280742459,1.85614849187935,1.97215777262181,3.82830626450116
2431,20220106183000-1747,{Italian},"{David Keller, Sam Abuelsamid, Emmanuel Rosner}","{'thomson reuters': 1, 'tesla': 1, 'lucid': 1}",-1.03359173126615,1.80878552971576,2.84237726098191,4.65116279069767
6149,20220217203000-1546,"{Ohio, United States}",{Henrik Fisker},{'tesla': 1},2.17755443886097,2.84757118927973,0.670016750418761,3.51758793969849
10427,20220406153000-T1801,"{Opstina Palilula, 00, Serbia, Belgrade, Serbia (General), }",,{'tesla': 3},1.13636363636364,1.13636363636364,0.0,1.13636363636364
2147,20220209034500-638,"{Florida, United States, Fort Lauderdale, Florida, United States, Nevada, United States}","{James Riley, Barrett Riley}",{'tesla': 1},-3.46740638002774,1.94174757281553,5.40915395284327,7.35090152565881
6933,20220405230000-303,"{California, United States}","{Jack Dorsey, Michael Hiltzik, Parag Agrawal, Donald Trump, Mark Zuckerberg, Elon Musk, Matt Rourke}","{'twitter': 29, 'tesla': 1}",-2.73410799726589,2.32399179767601,5.0580997949419,7.38209159261791


In [21]:
# Write the cleaned frame to the data path declared for this notebook's
# product, creating the destination directory first if needed.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
gdelt_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv


In [22]:
# Unbind both frames now that the output is written, so the kernel can
# reclaim their memory.
del gdelt_df, securities_df