In [1]:
# Names of the upstream tasks whose outputs this notebook reads.
# The next cell rebinds `upstream` to a mapping from these names to file paths.
upstream = [
    'fetch_n_filter_gdelt_bq',
    'normalize_security_names',
]

In [2]:
# Parameters
# NOTE(review): this cell looks tool-injected (it overrides the placeholder
# `upstream` list from the previous cell) — confirm before editing it by hand.
# `upstream` maps each upstream task name to the notebook it ran ("nb") and the
# data file it produced ("data").
upstream = {
    "normalize_security_names": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/normalize_security_names.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/normalized_security_names.csv",
    },
    "fetch_n_filter_gdelt_bq": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
    },
}
# `product` holds this notebook's own outputs: the executed notebook ("nb") and
# the cleaned CSV written at the end ("data").
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/clean_gdelt_data.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
}


In [3]:
# your code here...
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
import time
import re
from time import time
import json
import ast
from src.utils import preprocess_text
from collections import Counter
from src.utils import fuzz_similarity, preprocess_text
from nltk.corpus import stopwords

In [4]:
# Make sure the NLTK resources used by this notebook are available locally.
for nltk_resource in ('stopwords', 'omw-1.4'):
    nltk.download(nltk_resource)

# English stop words, kept as a set for fast membership checks.
stops = set(stopwords.words('english'))

# Show up to 200 characters per column when rendering frames.
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiujdm2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aiujdm2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Resolve the input files produced by the two upstream tasks.
gdelt_file_path = upstream['fetch_n_filter_gdelt_bq']['data']
securities_file_path = upstream['normalize_security_names']['data']

# Both CSVs carry their index in the first column.
gdelt_df = pd.read_csv(gdelt_file_path, index_col=0)

# Keep only securities that have a former name recorded.
securities_df = (
    pd.read_csv(securities_file_path, index_col=0)
    .dropna(subset=['former_name'])
)
securities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4477 entries, 0 to 9212
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cik          4477 non-null   int64 
 1   ticker       4477 non-null   object
 2   full_name    4477 non-null   object
 3   former_name  4477 non-null   object
dtypes: int64(1), object(3)
memory usage: 174.9+ KB


In [6]:
# Display the column labels of the GDELT frame as loaded.
gdelt_df.columns

Index(['GKGRECORDID', 'DATE', 'SourceCollectionIdentifier',
       'DocumentIdentifier', 'Locations', 'Persons', 'Organizations', 'Tone'],
      dtype='object')

In [7]:
# Preview the first rows of the securities table (after dropping rows without a former name).
securities_df.head()

Unnamed: 0,cik,ticker,full_name,former_name
0,1599298,SMMT,summit therapeutics,summit therapeutics
3,33934,CRF,cornerstone total return,eis
6,845611,GCV,gabelli convertible income securities,gabelli convertible securities
10,1839412,QTEK,qualtek services,roth ch acquisition iii
13,851520,EXPO,exponent,failure


In [8]:
# Drop rows where no organization names were extracted, then rewrite well-known
# brand names to the name of the listed parent company.
# NOTE(review): the regex replacement is applied to the whole frame, so every
# string column is rewritten, not only `Organizations` — confirm this is intended.
brand_patterns = ['Google', 'Facebook', 'YouTube', 'Youtube']
parent_names = ['alphabet', 'meta platforms', 'alphabet', 'alphabet']
gdelt_df = (
    gdelt_df
    .dropna(subset=['Organizations'])
    .replace(to_replace=brand_patterns, value=parent_names, regex=True)
)

In [9]:
# Each organization mention is followed by ",<number>" (its position in the
# document) and an optional ";". Splitting on that suffix leaves only the names.
_ORG_OFFSET_PATTERN = re.compile(r',\d+;?')
gdelt_df['Organizations'] = gdelt_df['Organizations'].map(_ORG_OFFSET_PATTERN.split)

In [10]:
def preprocess_orgs(x):
    """Run one record's organization names through `preprocess_text` with `eng=True`.

    NOTE(review): `preprocess_text` lives in `src.utils`; what `eng=True` does and
    the exact return type are defined there — confirm against that module.
    """
    return preprocess_text(x, eng=True)

# Normalize the organization names of every record.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(preprocess_orgs)

In [11]:
# Map former security names to their current full names (paired by row order).
# NOTE(review): `Series.replace` compares whole cell values. If each cell holds a
# list of names (the later Counter step suggests so), individual names inside the
# list are presumably left untouched — confirm against `preprocess_text` output.
former_names = securities_df['former_name'].to_list()
current_names = securities_df['full_name'].to_list()
gdelt_df['Organizations'] = gdelt_df['Organizations'].replace(
    to_replace=former_names,
    value=current_names,
)

In [12]:
# Row counts and per-column null counts after the organization clean-up steps.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10520 entries, 0 to 10552
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 10520 non-null  object
 1   DATE                        10520 non-null  int64 
 2   SourceCollectionIdentifier  10520 non-null  int64 
 3   DocumentIdentifier          10520 non-null  object
 4   Locations                   8987 non-null   object
 5   Persons                     8435 non-null   object
 6   Organizations               10520 non-null  object
 7   Tone                        10520 non-null  object
dtypes: int64(2), object(6)
memory usage: 739.7+ KB


In [13]:
def count_orgs(x):
    """Count how often each organization name occurs in one record.

    Parameters
    ----------
    x : sequence of names, float, or None
        The organization names of one record. Missing values (`None` or a
        float such as NaN) are accepted.

    Returns
    -------
    collections.Counter, float, or None
        A `Counter` of the names for a non-empty sequence. Missing values are
        passed through unchanged. An empty sequence yields NaN so that a
        following `dropna` removes the record instead of leaving an empty
        list behind, which has no `.keys()` and would break dict-style
        processing downstream.
    """
    # isinstance also catches NumPy float scalars, which `type(x) is float` misses.
    if x is None or isinstance(x, float):
        return x
    if len(x) == 0:
        return np.nan
    return Counter(x)

# Turn each record's organization names into per-name counts, then discard
# records left without a value.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(count_orgs)
gdelt_df = gdelt_df.dropna(subset=['Organizations'])
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10520 entries, 0 to 10552
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 10520 non-null  object
 1   DATE                        10520 non-null  int64 
 2   SourceCollectionIdentifier  10520 non-null  int64 
 3   DocumentIdentifier          10520 non-null  object
 4   Locations                   8987 non-null   object
 5   Persons                     8435 non-null   object
 6   Organizations               10520 non-null  object
 7   Tone                        10520 non-null  object
dtypes: int64(2), object(6)
memory usage: 739.7+ KB


In [14]:
# Preprocessed security names used as the allow-list for organization mentions.
securities_names = preprocess_text(securities_df.full_name)


def _keep_known_securities(org_counts):
    """Keep only the counts whose name is in `securities_names` and is not 'Tooshorttext'."""
    return {
        name: count
        for name, count in org_counts.items()
        if name in securities_names and name != 'Tooshorttext'
    }


gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(_keep_known_securities)

# Discard records that mention none of the known securities.
has_known_security = gdelt_df['Organizations'].str.len() != 0
gdelt_df = gdelt_df[has_known_security]

In [15]:
# Row counts and per-column null counts after filtering to known securities.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8912 entries, 0 to 10551
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 8912 non-null   object
 1   DATE                        8912 non-null   int64 
 2   SourceCollectionIdentifier  8912 non-null   int64 
 3   DocumentIdentifier          8912 non-null   object
 4   Locations                   7523 non-null   object
 5   Persons                     7385 non-null   object
 6   Organizations               8912 non-null   object
 7   Tone                        8912 non-null   object
dtypes: int64(2), object(6)
memory usage: 626.6+ KB


In [16]:
def split_locations(location_list):
    """Extract the full location name from each '#'-delimited location string.

    Parameters
    ----------
    location_list : iterable of str, float, or None
        The location strings of one record. A missing value (`None`, or a
        float such as NaN) is treated as "no locations".

    Returns
    -------
    list of str
        The second '#'-separated field (the full location name) of every
        entry that has one, in input order. Entries without a second field
        are skipped. Missing input yields an empty list.
    """
    # isinstance also covers NumPy float NaN, which a `type(...) is float` check misses.
    if location_list is None or isinstance(location_list, float):
        return []

    location_names = []
    for location_string in location_list:
        loc_parts = location_string.split('#')
        # We are only interested in the full location name, which is the second field.
        if len(loc_parts) > 1:
            location_names.append(loc_parts[1])
    return location_names
# Locations are semicolon(;)-separated, and each location string is further
# separated by hash(#). Split the entries, keep the full names, then remove
# duplicates by converting each list to a set.
gdelt_df['Locations'] = (
    gdelt_df['Locations']
    .str.split(';')
    .apply(split_locations)
    .map(set)
)

In [17]:
# Keep only "Firstname Lastname"-shaped matches from Persons, then remove
# duplicates; rows with no Persons value are left missing.
gdelt_df['Persons'] = (
    gdelt_df['Persons']
    .str.findall("[A-Z][a-z]+ [A-Z][a-z]+")
    .map(set, na_action='ignore')
)

In [18]:
# Tone is a comma-separated string; split it into its individual fields.
gdelt_df['Tone'] = gdelt_df['Tone'].str.split(',')

In [19]:
# Clean Tone: copy the first four Tone fields into their own named columns.
tone_columns = ('AvgTone', 'PosScore', 'NegScore', 'Polarity')
for position, column_name in enumerate(tone_columns):
    # Bind `position` as a default so each lambda keeps its own index.
    gdelt_df[column_name] = gdelt_df['Tone'].apply(lambda fields, i=position: fields[i])

In [20]:
# Remove the columns that are not part of the cleaned output.
gdelt_df = gdelt_df.drop(
    columns=["Tone", "DATE", "SourceCollectionIdentifier", "DocumentIdentifier"]
)

In [21]:
# Preview up to 10 random rows. The fixed seed keeps the preview identical
# across re-runs, and the size is capped so a frame with fewer than 10 rows
# does not make `sample` raise.
gdelt_df.sample(n=min(10, len(gdelt_df)), random_state=42)

Unnamed: 0,GKGRECORDID,Locations,Persons,Organizations,AvgTone,PosScore,NegScore,Polarity
7381,20220413213000-983,"{Nevada, United States, Fish Lake Valley, Nevada, United States, United States, America, Elko, Nevada, United States, American, Esmeralda County, Nevada, United States}",{Tom Lewis},{'lithium': 7},0.414937759336099,2.0746887966805,1.6597510373444,3.7344398340249
2283,20220106221500-1718,"{Oregon, United States, California, United States, Americans, Myrtle Creek, Oregon, United States}","{Russell Anthony, Andrew Aaron}",{'tesla': 1},-4.8,2.4,7.2,9.6
1023,20220104083000-656,"{Xinjiang, Jiangxi, China, China, Beijing, Beijing, China}",,{'tesla': 1},0.0,0.380952380952381,0.380952380952381,0.761904761904762
7989,20220310183000-875,"{Bengaluru, Karnataka, India, China, New York, United States, Shanghai, Shanghai, China}","{Perri Dong, Lewis Krauskopf, Nivedita Balu, Jonathan Oatis, Medha Singh, Tiyashi Datta, Jason Xue}","{'apple': 1, 'tesla': 1}",-1.88679245283019,0.838574423480084,2.72536687631027,3.56394129979036
3640,20211230173000-1111,"{Texas, United States, China, Shanghai, Shanghai, China}",{Tesla Gigafactory},{'tesla': 1},-0.51413881748072,3.59897172236504,4.11311053984576,7.7120822622108
9340,20220314110000-96,"{Hong Kong, China, London, London, City Of, United Kingdom, United Kingdom, Ukraine}",{Elon Musk},{'tesla': 1},-2.38095238095238,2.38095238095238,4.76190476190476,7.14285714285714
9072,20220303193000-739,"{Fremont, California, United States, White House, District Of Columbia, United States, California, United States}","{Brian Rothenberg, Elon Musk, Gene Simmons, Joe Biden}","{'twitter': 2, 'tesla': 1}",-0.367647058823529,3.125,3.49264705882353,6.61764705882353
883,20220130094500-314,{},"{Cathie Wood, Cathie Ark}","{'roku': 1, 'teladoc health': 1, 'tesla': 1}",0.0,2.5830258302583,2.5830258302583,5.16605166051661
1515,20211230094500-2,"{Texas, United States}","{Ted Cruz, Pramila Jayapal, Elon Musk}","{'twitter': 3, 'tesla': 2}",-1.77664974619289,1.77664974619289,3.55329949238579,5.32994923857868
2922,20220103133000-775,"{China, Norway, Germany}",,{'tesla': 1},-1.71428571428571,1.71428571428571,3.42857142857143,5.14285714285714


In [22]:
# Write the cleaned frame to this task's declared data product, creating the
# destination directory first if it does not exist yet.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
gdelt_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv


In [23]:
# Unbind the two frame names now that the output has been written.
del gdelt_df
del securities_df