In [1]:
# Names of the upstream tasks whose products this notebook consumes as inputs.
upstream = [
    "clean_gdelt_data",
    "normalize_security_names",
]


In [2]:
# Parameters
# `upstream` maps each input task name to its product files: "nb" is a notebook
# path and "data" is the CSV path that later cells read. `product` holds the
# same two keys for this notebook's own outputs; its "data" entry is the CSV
# written by the final cell.
# This assignment rebinds `upstream`, replacing the plain list of task names
# declared in the first cell.
# NOTE(review): these absolute paths look injected by the pipeline runner
# rather than hand-written; confirm before editing them manually.
upstream = {
    "clean_gdelt_data": {
        "nb": "/root/market_watch/output/notebooks/clean_gdelt_data.ipynb",
        "data": "/root/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
    },
    "normalize_security_names": {
        "nb": "/root/market_watch/output/notebooks/normalize_security_names.ipynb",
        "data": "/root/market_watch/output/data/interim/normalized_security_names.csv",
    },
}
product = {
    "nb": "/root/market_watch/output/notebooks/total_org_count.ipynb",
    "data": "/root/market_watch/output/data/interim/total_org_counts.csv",
}


In [3]:
import pandas as pd
from collections import Counter
import json
import ast
from pathlib import Path

In [4]:
# Inputs: the "data" product of each upstream task.
gdelt_file_path = upstream["clean_gdelt_data"]["data"]
security_file_path = upstream["normalize_security_names"]["data"]

# Output: the "data" product this notebook writes.
output_file_path = product["data"]

In [5]:
# Use the first CSV column as the row index for both input tables.
read_options = {"index_col": 0}
gdelt_df = pd.read_csv(gdelt_file_path, **read_options)
security_df = pd.read_csv(security_file_path, **read_options)

In [6]:
# Sanity-check the GDELT load: print its columns, non-null counts and dtypes.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9042 entries, 0 to 10698
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GKGRECORDID    9042 non-null   object 
 1   Locations      9042 non-null   object 
 2   Persons        7460 non-null   object 
 3   Organizations  9042 non-null   object 
 4   AvgTone        9042 non-null   float64
 5   PosScore       9042 non-null   float64
 6   NegScore       9042 non-null   float64
 7   Polarity       9042 non-null   float64
dtypes: float64(4), object(4)
memory usage: 635.8+ KB


In [7]:
# Sanity-check the securities load: print its columns, non-null counts and dtypes.
security_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9213 entries, 0 to 9212
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cik          9213 non-null   int64 
 1   ticker       9213 non-null   object
 2   full_name    9213 non-null   object
 3   former_name  4477 non-null   object
dtypes: int64(1), object(3)
memory usage: 359.9+ KB


In [8]:
# Running tally of organization mentions, accumulated over every GDELT row.
c = Counter()


def update_counter(string):
    """Add the organizations encoded in one ``Organizations`` cell to ``c``.

    Parameters
    ----------
    string : str
        Text of a Python literal as stored in the CSV, e.g.
        ``"{'acme corp': 2}"``. Presumably a mapping of organization name to
        mention count -- TODO confirm against the upstream cleaning task.
        Non-string values (such as the NaN pandas produces for an empty CSV
        field) and texts of five characters or fewer after stripping (such as
        ``"{}"``) are treated as "no organizations" and ignored.

    Returns
    -------
    None
        The function is called for its side effect on the module-level
        counter ``c``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    # An empty CSV field is read back as NaN (a float), which has no .strip();
    # there is nothing to count in that case.
    if not isinstance(string, str):
        return

    # Very short texts cannot hold a real entry (e.g. "{}" or "[]").
    if len(string.strip()) <= 5:
        return

    parsed = ast.literal_eval(string)
    # The JSON round-trip is kept on purpose: it normalizes the parsed literal
    # to plain JSON types (for instance, mapping keys become strings) before
    # the values reach the counter.
    normalized = json.loads(json.dumps(parsed))
    c.update(normalized)
    
# Feed every Organizations cell to update_counter. The call is made only for
# its side effect on the counter `c`; update_counter returns None, so the
# Series displayed below this cell contains nothing but None values.
gdelt_df['Organizations'].apply(update_counter)

0        None
1        None
2        None
6        None
7        None
         ... 
10694    None
10695    None
10696    None
10697    None
10698    None
Name: Organizations, Length: 9042, dtype: object

In [9]:
# Turn the tally into a one-column frame: the counter's keys become the index
# and the totals go into the "count" column.
org_totals = dict(c)
total_org_count_df = pd.DataFrame.from_dict(
    org_totals,
    orient="index",
    columns=["count"],
)

In [10]:
# Match every counted organization to the ticker of the security whose
# `full_name` is exactly equal to the organization name.
full_name_lst = security_df.full_name.to_list()

# Build the name -> ticker lookup once instead of scanning security_df for
# every organization. setdefault keeps the first row when a full_name occurs
# more than once, which is the row the previous per-name filter picked.
ticker_by_full_name = {}
for full_name, ticker in zip(full_name_lst, security_df['ticker'].to_list()):
    ticker_by_full_name.setdefault(full_name, ticker)

# NOTE: despite its name, `cik_lst` holds tickers, not CIK numbers; the name
# is kept because later cells refer to it.
# Organizations without a matching full_name are skipped, so the list can end
# up shorter than the index; the length check in the next cell guards that.
cik_lst = [
    ticker_by_full_name[security]
    for security in total_org_count_df.index
    if security in ticker_by_full_name
]

In [11]:
# Every counted organization must have received a ticker; otherwise the list
# cannot be attached to the frame as a column.
assert len(cik_lst) == len(total_org_count_df.index), (
    f"matched {len(cik_lst)} tickers for {len(total_org_count_df.index)} organizations; "
    "some organization names have no exact full_name match in security_df"
)

In [12]:
# Attach the matched tickers as a new 'ticker' column. The list is assigned
# positionally, so it relies on having been built in index order.
total_org_count_df['ticker'] = cik_lst

In [13]:
# Write the counts (with tickers) to this task's "data" product, creating the
# destination folder first if it does not exist yet.
output_file_path = product["data"]
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
total_org_count_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /root/market_watch/output/data/interim/total_org_counts.csv
