In [1]:
# Names of the upstream pipeline tasks whose outputs this notebook consumes.
# NOTE(review): this looks like a pipeline placeholder; the "# Parameters" cell
# that follows rebinds `upstream` to a dict of file paths — confirm which tool
# injects it before changing this list.
upstream = ['fetch_n_filter_gdelt_bq', 'normalize_security_names']

In [2]:
# Parameters
# Rebinds `upstream` to a mapping of task name -> {"nb": notebook path,
# "data": data file path} and defines `product` with the same two keys for the
# files this notebook writes.
# NOTE(review): the absolute /Users/... paths look machine-generated at run
# time rather than hand-written — confirm before editing them here.
upstream = {
    "normalize_security_names": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/normalize_security_names.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/normalized_security_names.csv",
    },
    "fetch_n_filter_gdelt_bq": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
    },
}
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/clean_gdelt_data.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
}


In [3]:
# your code here...
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
import time
import re
from time import time
import json
import ast
from src.utils import preprocess_text
from collections import Counter
from src.utils import fuzz_similarity, preprocess_text
from nltk.corpus import stopwords

In [4]:
# Make sure the NLTK resources used by this notebook are available locally.
for nltk_package in ('stopwords', 'omw-1.4'):
    nltk.download(nltk_package)

# English stop words, held as a set.
stops = set(stopwords.words('english'))

# Widen text columns in rendered frames to 200 characters.
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiujdm2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aiujdm2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Input locations come from the upstream task parameters.
gdelt_file_path = upstream['fetch_n_filter_gdelt_bq']['data']
securities_file_path = upstream['normalize_security_names']['data']

# Both CSVs are read with their first column as the index.
gdelt_df = pd.read_csv(gdelt_file_path, index_col=0)

# Securities with no former name are discarded right after loading.
securities_df = (
    pd.read_csv(securities_file_path, index_col=0)
    .dropna(subset=['former_name'])
)
securities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4475 entries, 0 to 9211
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cik          4475 non-null   int64 
 1   ticker       4475 non-null   object
 2   full_name    4475 non-null   object
 3   former_name  4475 non-null   object
dtypes: int64(1), object(3)
memory usage: 174.8+ KB


In [6]:
# Show which columns were loaded from the raw GDELT extract.
gdelt_df.columns

Index(['GKGRECORDID', 'DATE', 'SourceCollectionIdentifier',
       'DocumentIdentifier', 'Locations', 'Persons', 'Organizations', 'Tone'],
      dtype='object')

In [7]:
# Preview the first rows of the securities table (cik, ticker, full and former names).
securities_df.head()

Unnamed: 0,cik,ticker,full_name,former_name
0,1052752,GTY,getty realty,getty realty
2,1642178,AGNPF,algernon pharmaceuticals,breathtec biomedical
3,815556,FAST,fastenal,fastenal
5,1818644,LIDR,aeye,cf finance acquisition iii
6,924805,FRHC,freedom,bmb munai


In [8]:
# Discard records with no extracted organization names, then rewrite selected
# brand names to 'alphabet' / 'meta platforms'. The replacement is regex-based
# and is applied to the whole frame, not only the Organizations column.
brand_patterns = ['Google', 'Facebook', 'YouTube', 'Youtube']
parent_company_names = ['alphabet', 'meta platforms', 'alphabet', 'alphabet']
gdelt_df = (
    gdelt_df
    .dropna(subset=['Organizations'])
    .replace(to_replace=brand_patterns, value=parent_company_names, regex=True)
)

In [9]:
# Every organization mention is followed by ",<digits>" and an optional ";".
# Splitting on that suffix turns each cell into a list holding only the names.
org_offset_pattern = re.compile(r',\d+;?')
gdelt_df.Organizations = gdelt_df.Organizations.map(org_offset_pattern.split)

In [10]:
def preprocess_orgs(x):
    """Run the shared text preprocessing over one record's organization names.

    Delegates to ``preprocess_text`` with ``eng=True`` and returns its result.
    """
    cleaned = preprocess_text(x, eng=True)
    return cleaned


gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(preprocess_orgs)

In [11]:
# Map each security's former name onto its current full name (position-wise
# pairing of the two columns of the securities table).
former_names = securities_df.former_name.to_list()
current_names = securities_df.full_name.to_list()
gdelt_df.Organizations = gdelt_df.Organizations.replace(
    to_replace=former_names,
    value=current_names,
)

In [12]:
def count_orgs(x):
    """Tally how often each organization name occurs in one record.

    Plain ``float`` values and empty sequences are handed back untouched;
    any other input is wrapped in a ``Counter``.
    """
    if type(x) is float:
        return x
    if len(x) == 0:
        return x
    return Counter(x)

# Replace each record's names with per-name occurrence counts.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(count_orgs)

In [13]:
# Security names run through the same preprocessing as the organization names.
securities_names = preprocess_text(securities_df.full_name)


def _known_security_counts(org_counts):
    """Keep only counted names found in ``securities_names``, excluding the literal key 'Tooshorttext'."""
    kept = {}
    for key in org_counts.keys():
        if key in securities_names and key != 'Tooshorttext':
            kept[key] = org_counts[key]
    return kept


gdelt_df.Organizations = gdelt_df.Organizations.apply(_known_security_counts)

# Drop records that are left with no matching security.
has_known_security = gdelt_df.Organizations.str.len() != 0
gdelt_df = gdelt_df[has_known_security]

In [14]:
# Report row count, dtypes and per-column non-null counts after the organization filter.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8912 entries, 0 to 10551
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 8912 non-null   object
 1   DATE                        8912 non-null   int64 
 2   SourceCollectionIdentifier  8912 non-null   int64 
 3   DocumentIdentifier          8912 non-null   object
 4   Locations                   7523 non-null   object
 5   Persons                     7385 non-null   object
 6   Organizations               8912 non-null   object
 7   Tone                        8912 non-null   object
dtypes: int64(2), object(6)
memory usage: 626.6+ KB


In [15]:
def split_locations(location_list):
    """Extract the full location name from each '#'-delimited location string.

    Parameters
    ----------
    location_list : list of str, or a missing value (None or a NaN float)
        Location strings for one record. The full location name is the
        second '#'-separated field of each string.

    Returns
    -------
    list of str
        The second field of every string that has one, in input order.
        Strings without a '#' are skipped. A missing input yields an empty list.
    """
    # Missing cells show up as NaN floats; isinstance also covers numpy.float64,
    # which `type(x) is float` would miss and then fail on when iterated.
    if location_list is None or isinstance(location_list, float):
        return []

    location_names = []
    for location_string in location_list:
        loc_parts = location_string.split('#')
        # Only the second entry (the full location name) is of interest.
        if len(loc_parts) > 1:
            location_names.append(loc_parts[1])
    return location_names
# Locations are semicolon (;) separated, and each location string is further
# separated by hash (#). Reduce every cell to its location names, then turn the
# list into a set to remove duplicates.
gdelt_df.Locations = (
    gdelt_df.Locations
    .str.split(';')
    .apply(split_locations)
    .map(set)
)

In [16]:
# Keep only substrings shaped like two capitalized words ("Firstname Lastname"),
# then de-duplicate per record; missing cells are left as they are.
person_name_pattern = "[A-Z][a-z]+ [A-Z][a-z]+"
gdelt_df.Persons = (
    gdelt_df.Persons
    .str.findall(pat=person_name_pattern)
    .map(set, na_action='ignore')
)

In [17]:
# Break the comma-separated Tone string into a list of its fields.
gdelt_df['Tone'] = gdelt_df['Tone'].str.split(',')

In [18]:
# Clean Tone: copy the first four fields of each Tone list into their own
# columns. The values are taken as-is; no numeric conversion happens here.
tone_columns = ['AvgTone', 'PosScore', 'NegScore', 'Polarity']
for tone_position, tone_column in enumerate(tone_columns):
    gdelt_df[tone_column] = gdelt_df.Tone.apply(
        lambda tone_parts, i=tone_position: tone_parts[i]
    )

In [19]:
# Remove the columns that are not carried into the cleaned output.
columns_to_drop = ["Tone", "DATE", "SourceCollectionIdentifier", "DocumentIdentifier"]
gdelt_df = gdelt_df.drop(columns=columns_to_drop)

In [20]:
# Spot-check a handful of cleaned rows. The fixed seed keeps the preview stable
# across re-runs, and capping n avoids a ValueError when fewer than 10 rows
# survive the filtering above.
gdelt_df.sample(n=min(10, len(gdelt_df)), random_state=42)

Unnamed: 0,GKGRECORDID,Locations,Persons,Organizations,AvgTone,PosScore,NegScore,Polarity
1011,20220104080000-510,"{Beijing, Beijing, China, Shanghai, Shanghai, China, American, Washington, Washington, United States, United States, Taiwan, China, Chinese, Xinjiang, Jiangxi, China}","{Elon Musk, Ibrahim Hooper}","{'walmart': 3, 'tesla': 1}",-4.0920716112532,2.0460358056266,6.1381074168798,8.18414322250639
4979,20220126100000-1142,"{Fremont, California, United States, South Carolina, United States, Japanese, Spartanburg, South Carolina, United States, Shanghai, Shanghai, China, American, Germany, Berlin, Berlin, Germany, Chi...",,{'tesla': 1},0.0,1.56599552572707,1.56599552572707,3.13199105145414
5695,20220214080000-869,"{Tokyo, Tokyo, Japan, Japan, Japanese, United States}","{Satoko Matsushima, Hideyuki Ishiguro, Mitsushige Akino}",{'tesla': 1},0.255427841634738,2.04342273307791,1.78799489144317,3.83141762452107
4226,20220322021500-646,"{Americans, United States, America}","{David Massey, Valuewalk Pixaline, Warren Buffett, Larry Fink}","{'nvidia': 1, 'nasdaq': 13, 'tesla': 1, 'home depot': 1, 'meta platforms': 1, 'blackrock': 15, 'clearway': 2, 'apple': 1}",1.97250448296473,3.82546323968918,1.85295875672445,5.67842199641363
933,20220115104500-470,{Chinese},,"{'gamestop': 3, 'ocugen': 1, 'tesla': 1, 'meta platforms': 2}",0.28735632183908,1.72413793103448,1.4367816091954,3.16091954022989
7154,20220406223000-T823,"{New York, United States, United States}","{Donald Trump, Elon Musk}","{'twitter': 24, 'tesla': 1}",-2.42914979757085,1.72064777327935,4.1497975708502,5.87044534412955
7120,20220313004500-T560,"{Russian, Serbian, Qatar, Hollywood, California, United States, Serbia, Ukraine, Belgrade, Serbia (General), }",,{'tesla': 1},9.1970802919708,11.8248175182482,2.62773722627737,14.4525547445255
4239,20220208093000-964,"{California, United States, Michigan, United States}","{Tesla Cybertruck, Teejay Boris, Elon Musk, Sawyer Merritt}","{'twitter': 1, 'tesla': 2}",-1.83206106870229,1.06870229007634,2.90076335877863,3.96946564885496
288,20211222081500-489,"{Berlin, Berlin, Germany, Germany, German}",{Axel Vogel},{'tesla': 1},-0.72463768115942,1.44927536231884,2.17391304347826,3.6231884057971
1899,20220202040000-584,"{California, United States, Alameda County, California, United States, Americans, American}","{Tesla Lathrop, Owen Diaz, Kaylen Barker, Bernard Alexander, Tesla Fremont}",{'tesla': 1},-7.04761904761905,1.33333333333333,8.38095238095238,9.71428571428571


In [21]:
# Write the cleaned frame to the product path, creating the target folder
# (and any missing parents) first.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
gdelt_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv


In [22]:
# Unbind both DataFrames so the kernel can release their memory.
del gdelt_df
del securities_df