In [1]:
# If this task has dependencies, list them here
# (e.g. upstream = ['some_task']), otherwise leave as None.
# The two names listed are the keys later used to look up each upstream
# task's output paths in the parameters cell.
upstream = ['clean_gdelt_data', 'total_org_count']

# This is a placeholder; leave it as None. The parameters cell below
# rebinds `product` to a dict with the notebook and data output paths.
product = None

In [2]:
# Parameters
# Rebinds the `upstream` list and the `product` placeholder declared in the
# first cell to concrete locations: each "nb" entry is a notebook path and
# each "data" entry is a CSV path.
# NOTE(review): these are absolute, machine-specific paths and the cell looks
# tool-generated (presumably injected by the pipeline runner at execution
# time) — confirm it is not meant to be edited by hand.
upstream = {
    "clean_gdelt_data": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/clean_gdelt_data.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
    },
    "total_org_count": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/total_org_count.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/total_org_counts.csv",
    },
}
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/create_tfidf.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/features/tfidf_vector.csv",
}


In [3]:
import pandas as pd
import numpy as np
import warnings
import json
import ast
from pathlib import Path

# Silences every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" would also hide any deprecation or dtype
# warnings raised by the cells below; consider narrowing the filter to the
# specific category once the offending warning is identified.
warnings.simplefilter("ignore")

In [4]:
# Pick out the CSV ("data") product of each upstream task; the "nb" entries
# are not needed here.
total_count_path = upstream['total_org_count']['data']
gdelt_file_path = upstream['clean_gdelt_data']['data']

In [5]:
# Load both upstream CSVs. index_col=0 turns the first CSV column into the
# row index instead of leaving it as a regular data column.
total_org_count_df = pd.read_csv(total_count_path, index_col=0)
gdelt_df = pd.read_csv(gdelt_file_path, index_col=0)

In [6]:
# Schema check of the cleaned GDELT frame: prints column names, dtypes,
# non-null counts and memory usage.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8912 entries, 0 to 10551
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GKGRECORDID    8912 non-null   object 
 1   Locations      8912 non-null   object 
 2   Persons        7385 non-null   object 
 3   Organizations  8912 non-null   object 
 4   AvgTone        8912 non-null   float64
 5   PosScore       8912 non-null   float64
 6   NegScore       8912 non-null   float64
 7   Polarity       8912 non-null   float64
dtypes: float64(4), object(4)
memory usage: 626.6+ KB


In [7]:
# Schema check of the organization lookup table: prints column names,
# dtypes, non-null counts and memory usage.
total_org_count_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 395 entries, lithium americas to codexis
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   count   395 non-null    int64 
 1   ticker  395 non-null    object
dtypes: int64(1), object(1)
memory usage: 9.3+ KB


In [8]:
def convert_to_dict(string):
    """Turn the text form of a Python dict literal into a pandas Series.

    The text is evaluated with ``ast.literal_eval`` (literals only, no
    arbitrary code) and then passed through a JSON encode/decode round trip,
    so the keys of the resulting mapping are always strings before the
    Series is built.

    Parameters
    ----------
    string : str
        Text of a Python literal, e.g. ``"{'tesla': 2, 'apple': 1}"``.

    Returns
    -------
    pandas.Series
        One entry per key of the parsed mapping, indexed by key.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    parsed_literal = ast.literal_eval(string)
    # The dumps/loads round trip normalises the literal to plain JSON types.
    json_normalised = json.loads(json.dumps(parsed_literal))
    return pd.Series(json_normalised)

# Expand each article's organization counts into one column per organization.
# Rows keep the gdelt_df index; organizations absent from an article are NaN.
tf_df = gdelt_df['Organizations'].apply(convert_to_dict)

# Map organization name -> ticker straight from the lookup table, whose index
# holds the organization names. Reading the 'ticker' column directly avoids a
# row-by-row iterrows() pass and does not depend on reset_index() naming the
# former index column 'index' (which only happens when the index is unnamed).
ticker_name_dict = total_org_count_df['ticker'].to_dict()

# Relabel the organization columns with their tickers; a column whose name is
# missing from the mapping keeps its original label.
# NOTE(review): if two organization names share a ticker this produces
# duplicate column labels — confirm tickers in total_org_count_df are unique.
tf_df = tf_df.rename(columns=ticker_name_dict)

In [9]:
# Term frequency: divide every count by its article's total so that each
# row's non-missing values sum to 1. Missing cells stay NaN.
tf_df = tf_df.divide(tf_df.sum(axis="columns"), axis="index")
tf_df

Unnamed: 0,LAC,RIO,SCCO,TSLA,AVGO,TWTR,FB,AAPL,ARMK,HUBG,...,OEG,KEGS,LAB,CYRN,HRL,KSPN,PRPO,MPLN,LULU,CDXS
0,0.2,0.4,0.2,0.20,,,,,,,...,,,,,,,,,,
1,,,,,1.0,,,,,,...,,,,,,,,,,
2,,0.5,,0.50,,,,,,,...,,,,,,,,,,
3,,,,1.00,,,,,,,...,,,,,,,,,,
6,,,,1.00,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10547,,,,1.00,,,,,,,...,,,,,,,,,,
10548,,,,0.25,,,,,,,...,,,,,,,,,,
10549,,,,1.00,,,,,,,...,,,,,,,,,,
10550,,,,1.00,,,,,,,...,,,,,,,,,,


In [10]:
# Inverse document frequency: log(N / df), where N is the number of rows
# (articles) and df is the number of non-NaN cells in each ticker's column.
idf = np.log(tf_df.shape[0] / tf_df.count(axis="index"))
idf

LAC     6.322565
RIO     4.629246
SCCO    6.792569
TSLA    0.107957
AVGO    6.261941
          ...   
KSPN    9.095154
PRPO    9.095154
MPLN    9.095154
LULU    8.402007
CDXS    9.095154
Length: 395, dtype: float64

In [11]:
# TF-IDF: weight each ticker column by its IDF (the Series is aligned on the
# column labels), then turn the remaining NaN cells into explicit zeros.
tf_idf_df = tf_df.mul(idf, axis="columns").fillna(0)
tf_idf_df

Unnamed: 0,LAC,RIO,SCCO,TSLA,AVGO,TWTR,FB,AAPL,ARMK,HUBG,...,OEG,KEGS,LAB,CYRN,HRL,KSPN,PRPO,MPLN,LULU,CDXS
0,1.264513,1.851698,1.358514,0.021591,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,6.261941,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,2.314623,0.000000,0.053979,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.107957,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.000000,0.000000,0.107957,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10547,0.000000,0.000000,0.000000,0.107957,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10548,0.000000,0.000000,0.000000,0.026989,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10549,0.000000,0.000000,0.000000,0.107957,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10550,0.000000,0.000000,0.000000,0.107957,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Write the TF-IDF matrix to the location declared in `product`, creating the
# destination directory (and any missing parents) beforehand.
output_file_path = product['data']
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
tf_idf_df.to_csv(path_or_buf=output_file_path)
print(f"Saved file {output_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/features/tfidf_vector.csv


In [13]:
# Unbind the two 395-column frames now that the result is on disk, so the
# kernel can reclaim their memory.
del tf_df, tf_idf_df