In [1]:
# Names of the upstream notebooks whose outputs this notebook consumes.
# The injected "Parameters" cell that follows rebinds `upstream` to a dict.
upstream = [
    'fetch_n_filter_gdelt_bq',
    'normalize_security_names',
]

In [2]:
# Parameters
# For every upstream notebook: where its executed copy ("nb") and its data
# file ("data") live. `product` describes the same two outputs for this notebook.
upstream = dict(
    fetch_n_filter_gdelt_bq=dict(
        nb="/root/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
        data="/root/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
    ),
    normalize_security_names=dict(
        nb="/root/market_watch/output/notebooks/normalize_security_names.ipynb",
        data="/root/market_watch/output/data/interim/normalized_security_names.csv",
    ),
)
product = dict(
    nb="/root/market_watch/output/notebooks/clean_gdelt_data.ipynb",
    data="/root/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
)


In [3]:
# your code here...
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
import time
import re
from time import time
import json
import ast
from src.utils import preprocess_text
from collections import Counter
from src.utils import fuzz_similarity, preprocess_text
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Make sure the NLTK resources are present locally (NLTK reports packages that
# are already up to date instead of downloading them again).
for nltk_resource in ('wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(nltk_resource)

# English stop words, held in a set.
stops = set(stopwords.words('english'))

# Show up to 200 characters per cell when frames are rendered.
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Data-file locations of the two upstream notebooks, in the order: GDELT, securities.
gdelt_file_path, securities_file_path = (
    upstream[task_name]['data']
    for task_name in ('fetch_n_filter_gdelt_bq', 'normalize_security_names')
)

In [6]:
# Read both inputs, using the first CSV column as the index.
gdelt_df = pd.read_csv(gdelt_file_path, index_col=0)

# Only securities that have a former name are kept from here on.
securities_df = (
    pd.read_csv(securities_file_path, index_col=0)
    .dropna(subset=['former_name'])
)
securities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4477 entries, 1 to 9211
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cik          4477 non-null   int64 
 1   ticker       4477 non-null   object
 2   full_name    4477 non-null   object
 3   former_name  4477 non-null   object
dtypes: int64(1), object(3)
memory usage: 174.9+ KB


In [7]:
# Display the column labels of the GDELT frame as loaded.
gdelt_df.columns

Index(['GKGRECORDID', 'DATE', 'SourceCollectionIdentifier',
       'DocumentIdentifier', 'Locations', 'Persons', 'Organizations', 'Tone'],
      dtype='object')

In [8]:
# Preview the first rows of the securities table (rows without a former name were dropped above).
securities_df.head()

Unnamed: 0,cik,ticker,full_name,former_name
1,1471727,BTTR,better choice,sport endurance
4,1513818,ARAV,aravive,versartis
5,1320854,RAIL,freightcar america,fca acquisition
8,1826000,LTCH,latch,ts innovation acquisitions
9,1707919,CENN,cenntro electric,naked brand


In [9]:
# Discard records for which no organization names were extracted.
gdelt_df = gdelt_df.dropna(subset=['Organizations'])

# Rewrite brand names to company names (e.g. Google -> alphabet).
# NOTE: with regex=True the patterns match substrings, and DataFrame.replace is
# applied to every string column of the frame, not only to Organizations.
brand_aliases = [
    ('Google', 'alphabet'),
    ('Facebook', 'meta platforms'),
    ('YouTube', 'alphabet'),
    ('Youtube', 'alphabet'),
]
gdelt_df = gdelt_df.replace(
    to_replace=[brand for brand, company in brand_aliases],
    value=[company for brand, company in brand_aliases],
    regex=True,
)

In [10]:
# Every organization mention carries a ",<number>" suffix (the numeric position
# of the mention), optionally followed by ";". Splitting on that suffix leaves
# only the names, as a list per row.
org_position_pattern = re.compile(r',\d+;?')
gdelt_df.Organizations = gdelt_df.Organizations.map(org_position_pattern.split)

In [11]:
def preprocess_orgs(x):
    """Run ``preprocess_text`` on ``x`` with its ``eng`` flag switched on.

    Args:
        x: The value to clean; in this notebook, one row's list of
            organization names.

    Returns:
        Whatever ``preprocess_text`` returns for ``x``.
    """
    cleaned = preprocess_text(x, eng=True)
    return cleaned


# Clean the organization names row by row.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(preprocess_orgs)

In [12]:
# Swap former company names for current ones *inside* each row's collection of
# organization names. Series.replace compares whole cell values, so calling it
# on a column whose cells are collections of names never matched a single name.
former_to_current = dict(zip(securities_df.former_name, securities_df.full_name))


def _rename_former_orgs(org_names):
    """Return ``org_names`` with each known former name replaced by its current name.

    Missing values (None / float NaN) are passed through unchanged so the
    later cells can still detect and drop them.
    """
    if org_names is None or isinstance(org_names, float):
        return org_names
    return [former_to_current.get(name, name) for name in org_names]


gdelt_df.Organizations = gdelt_df.Organizations.apply(_rename_former_orgs)

In [13]:
# Summarize row count, non-null counts and dtypes after the organization clean-up.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10666 entries, 0 to 10699
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 10666 non-null  object
 1   DATE                        10666 non-null  int64 
 2   SourceCollectionIdentifier  10666 non-null  int64 
 3   DocumentIdentifier          10666 non-null  object
 4   Locations                   9098 non-null   object
 5   Persons                     8518 non-null   object
 6   Organizations               10666 non-null  object
 7   Tone                        10666 non-null  object
dtypes: int64(2), object(6)
memory usage: 750.0+ KB


In [14]:
def count_orgs(x):
    """Count how many times each organization name occurs in ``x``.

    Args:
        x: A collection of organization names, or a missing value
            (None / float NaN).

    Returns:
        collections.Counter mapping each name to its number of occurrences.
        Missing values are returned unchanged, and an empty collection yields
        None, so that a following ``dropna`` removes both kinds of row.
    """
    # isinstance (rather than an exact type check) also recognises float
    # subclasses such as numpy.float64, which would otherwise reach len().
    if x is None or isinstance(x, float):
        return x
    # An empty collection carries no organizations: report it as missing
    # instead of handing back an object that has no .keys().
    if len(x) == 0:
        return None
    return Counter(x)

# Replace each row's organization names with per-name mention counts, then
# remove the rows whose Organizations value is missing.
gdelt_df['Organizations'] = gdelt_df['Organizations'].apply(count_orgs)
gdelt_df = gdelt_df.dropna(subset=['Organizations'])
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10666 entries, 0 to 10699
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 10666 non-null  object
 1   DATE                        10666 non-null  int64 
 2   SourceCollectionIdentifier  10666 non-null  int64 
 3   DocumentIdentifier          10666 non-null  object
 4   Locations                   9098 non-null   object
 5   Persons                     8518 non-null   object
 6   Organizations               10666 non-null  object
 7   Tone                        10666 non-null  object
dtypes: int64(2), object(6)
memory usage: 750.0+ KB


In [15]:
# Cleaned current security names (the whole column is passed to preprocess_text at once).
securities_names = preprocess_text(securities_df.full_name)
# Set copy for constant-time membership tests; testing every organization key
# against the full list meant a linear scan per key.
known_security_names = set(securities_names)


def _keep_known_securities(org_counts):
    """Return the name -> count pairs whose name is a known security name."""
    # 'Tooshorttext' is presumably a placeholder emitted by preprocess_text — TODO confirm.
    return {
        name: count
        for name, count in org_counts.items()
        if name in known_security_names and name != 'Tooshorttext'
    }


gdelt_df.Organizations = gdelt_df.Organizations.apply(_keep_known_securities)
# Keep only rows that still mention at least one security. The explicit copy
# makes the filtered frame independent of its parent, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning.
gdelt_df = gdelt_df[gdelt_df.Organizations.str.len() != 0].copy()

In [16]:
# Summarize the frame again to see how many rows survived the security-name filter.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9042 entries, 0 to 10698
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   GKGRECORDID                 9042 non-null   object
 1   DATE                        9042 non-null   int64 
 2   SourceCollectionIdentifier  9042 non-null   int64 
 3   DocumentIdentifier          9042 non-null   object
 4   Locations                   7618 non-null   object
 5   Persons                     7460 non-null   object
 6   Organizations               9042 non-null   object
 7   Tone                        9042 non-null   object
dtypes: int64(2), object(6)
memory usage: 635.8+ KB


In [17]:
def split_locations(location_list):
    """Extract the full location name from each raw location string.

    Every location string is '#'-separated; the full location name is its
    second field.

    Args:
        location_list: Iterable of raw location strings, or a missing value
            (None / float NaN) for a record without locations.

    Returns:
        list: The location names in input order. Strings that have no second
        field (for example an empty string) are skipped. Missing input yields
        an empty list.
    """
    # Missing values arrive as float NaN (or None) rather than as a list.
    if location_list is None or isinstance(location_list, float):
        return []
    split_fields = (location_string.split('#') for location_string in location_list)
    return [fields[1] for fields in split_fields if len(fields) > 1]
# Locations are semicolon (;) separated, and each location string is further
# separated by hash (#). Split them, keep the location names, and collapse each
# row to a set to remove duplicates.
gdelt_df.Locations = (
    gdelt_df.Locations
    .str.split(';')
    .apply(split_locations)
    .map(set)
)

In [18]:
# Keep only tokens shaped like two capitalized words ("Firstname Lastname")
# from the raw Persons text, then de-duplicate them per row. Rows with a
# missing Persons value are left untouched.
person_name_pattern = "[A-Z][a-z]+ [A-Z][a-z]+"
gdelt_df.Persons = (
    gdelt_df.Persons
    .str.findall(pat=person_name_pattern)
    .map(set, na_action='ignore')
)

In [19]:
# Tone is a single comma-separated string; turn it into a list of its fields.
gdelt_df['Tone'] = gdelt_df['Tone'].str.split(',')

In [20]:
# Clean Tone: expose the first four tone fields as columns of their own.
# The values are copied as-is from the split list (they remain strings).
tone_columns = ('AvgTone', 'PosScore', 'NegScore', 'Polarity')
for field_position, column_name in enumerate(tone_columns):
    # The default argument pins the position for this iteration's lambda.
    gdelt_df[column_name] = gdelt_df.Tone.apply(lambda x, i=field_position: x[i])

In [21]:
# Remove the raw columns that are not written to the cleaned output.
gdelt_df = gdelt_df.drop(
    columns=["Tone", "DATE", "SourceCollectionIdentifier", "DocumentIdentifier"]
)

In [22]:
# Spot-check ten random rows of the cleaned frame (no random_state is set, so the rows differ between runs).
gdelt_df.sample(10)

Unnamed: 0,GKGRECORDID,Locations,Persons,Organizations,AvgTone,PosScore,NegScore,Polarity
8182,20220218174500-977,"{Japan, Seoul, Soul-T'ukpyolsi, South Korea, Korea, South Korea, Tokyo, Tokyo, Japan}",,{'tesla': 2},0.0,3.16091954022989,3.16091954022989,6.32183908045977
1623,20220104121500-127,"{Americans, Taiwan, Xinjiang, Jiangxi, China, Beijing, Beijing, China, United States, Massachusetts, United States, Wuhan, Hubei, China, American, Boston, Massachusetts, United States, Salamanca, ...","{Yorker Trisha, Steve Jobs, Liz Wagner, Elizabeth Holmes, Yogananda Pittman, Kent Albright, Elon Musk, Eric Adams, Ibrahim Hooper, Qingyuani Sayno, Justin Voldman, Sokhary Chau, Chuck Wexler}","{'tesla': 1, 'meta platforms': 1, 'netflix': 1, 'johnson johnson': 1}",-3.90731485688323,1.99909132212631,5.90640617900954,7.90549750113585
460,20211224050000-231,"{United Kingdom, Japan, Norway, Poland, Australia, Netherlands, Germany, Iceland, China, America, Singapore}",{Jesse Maida},{'tesla': 3},1.08147080028839,2.45133381398702,1.36986301369863,3.82119682768565
4691,20220218001500-T771,"{United States, Australia, Shenzhen, Guangdong, China, Changsha, Hunan, China, Changan, Shaanxi, China, Guangdong, Jilin, China, China, New Market, Queensland, Australia, Shanghai, Shanghai, China...","{Li Jie, Jin Ji}",{'tesla': 1},-1.13924050632911,1.9620253164557,3.10126582278481,5.06329113924051
10225,20220413130000-876,"{New York, United States, Russia, Highpeak, New York, United States, Russian, China, Ukraine}","{Jim Cramer, Jeff Lawson}","{'tesla': 1, 'devon': 3, 'nortonlifelock': 1, 'coterra': 3}",0.592300098716683,2.36920039486673,1.77690029615005,4.14610069101678
2792,20220127060000-129,{},"{Tesla Cybertruck, Henry Singleton}",{'tesla': 1},0.247524752475247,1.98019801980198,1.73267326732673,3.71287128712871
1068,20211231153000-839,{},,{'tesla': 2},-0.619195046439628,4.95356037151703,5.57275541795666,10.5263157894737
10388,20220328051500-850,{},{Elon Musk},"{'twitter': 6, 'tesla': 1}",0.757575757575757,3.40909090909091,2.65151515151515,6.06060606060606
3020,20220106180000-1599,"{Austria, Japanese, French, Hungarian, Canadian, German, South Korea, Tokyo, Tokyo, Japan}","{Kenichiro Yoshida, Las Vegas, James Dyson, Takaki Nakanishi}","{'thomson reuters': 1, 'tesla': 1, 'panasonic': 3, 'sony': 9, 'apple': 1}",-0.595238095238095,2.5297619047619,3.125,5.65476190476191
6969,20220216161500-1176,{},{Elon Musk},{'tesla': 1},-3.48525469168901,2.14477211796247,5.63002680965147,7.77479892761394


In [23]:
# Write the cleaned frame to this notebook's data product, creating the
# destination folder (and any missing parents) first.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
gdelt_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /root/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv


In [24]:
# Drop the notebook's references to both frames now that the output is saved.
del gdelt_df
del securities_df