In [1]:
# If this task has dependencies, list them here
# (e.g. upstream = ['some_task']); otherwise leave as None.
upstream = ['clean_gdelt_data', 'total_org_count']

# This is a placeholder, leave it as None
product = None

In [2]:
# Parameters
# NOTE(review): these values rebind the `upstream` / `product` placeholders
# declared earlier in the notebook. The absolute paths look machine-generated
# (presumably injected by the pipeline runner) — confirm before editing by hand.
upstream = {
    "clean_gdelt_data": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/clean_gdelt_data.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
    },
    "total_org_count": {
        "nb": "/Users/aiujdm2/market_watch/output/notebooks/total_org_count.ipynb",
        "data": "/Users/aiujdm2/market_watch/output/data/interim/total_org_counts.csv",
    },
}
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/create_tfidf.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/features/tfidf_vector.csv",
}


In [3]:
import pandas as pd
import numpy as np
import warnings
import json
import ast
from pathlib import Path

# Suppress every warning category for the rest of the session. This also hides
# deprecation notices from pandas/numpy, so drop this line when debugging.
warnings.simplefilter("ignore")

In [4]:
# Resolve the CSV ("data") product of each of the two upstream tasks.
gdelt_file_path, total_count_path = (
    upstream[task_name]['data']
    for task_name in ('clean_gdelt_data', 'total_org_count')
)

In [5]:
# Both inputs are read with their first CSV column used as the index.
csv_options = {"index_col": 0}
gdelt_df = pd.read_csv(gdelt_file_path, **csv_options)
total_org_count_df = pd.read_csv(total_count_path, **csv_options)

In [6]:
# Show column dtypes, non-null counts and memory usage of the cleaned GDELT frame.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9534 entries, 0 to 11173
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GKGRECORDID    9534 non-null   object 
 1   Locations      9534 non-null   object 
 2   Persons        7953 non-null   object 
 3   Organizations  9534 non-null   object 
 4   AvgTone        9534 non-null   float64
 5   PosScore       9534 non-null   float64
 6   NegScore       9534 non-null   float64
 7   Polarity       9534 non-null   float64
dtypes: float64(4), object(4)
memory usage: 670.4+ KB


In [7]:
# Show column dtypes, non-null counts and memory usage of the organization-count frame.
total_org_count_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, tesla to lincoln electric
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   count   396 non-null    int64 
 1   ticker  396 non-null    object
dtypes: int64(1), object(1)
memory usage: 9.3+ KB


In [8]:
def convert_to_dict(string):
    """Parse a Python-literal dict stored as text and return it as a Series.

    The text is evaluated with ``ast.literal_eval``, round-tripped through
    JSON, and the resulting dictionary is wrapped in a ``pd.Series`` whose
    index is the dictionary's keys.
    """
    parsed_literal = ast.literal_eval(string)
    # The JSON round trip normalises the parsed object (for example,
    # non-string keys come back as strings) before building the Series.
    normalised = json.loads(json.dumps(parsed_literal))
    return pd.Series(normalised)

# Expand each row's 'Organizations' dict string into one column per
# organization; organizations missing from a row are left as NaN.
tf_df = gdelt_df['Organizations'].apply(convert_to_dict)

# Map organization names (the index of total_org_count_df) to their tickers.
# Reading the mapping straight from the 'ticker' column avoids a row-by-row
# iterrows() loop and does not rely on reset_index() naming the former index
# column 'index' (which fails with KeyError when the index has a name).
ticker_name_dict = total_org_count_df['ticker'].to_dict()

# Rebind instead of renaming in place so the cell has no hidden mutation.
tf_df = tf_df.rename(columns=ticker_name_dict)

In [9]:
# Convert raw values into term frequencies: every row is divided by its own total.
row_totals = tf_df.sum(axis=1)
tf_df = tf_df.div(row_totals, axis=0)
tf_df

Unnamed: 0,TSLA,MARA,TDOC,ROKU,NFLX,MS,NDAQ,CCL,GME,MRK,...,EA,D,ECL,HCAT,HQY,ADSK,AIG,CYRN,VTRS,LECO
0,1.00,,,,,,,,,,...,,,,,,,,,,
3,1.00,,,,,,,,,,...,,,,,,,,,,
4,0.50,0.5,,,,,,,,,...,,,,,,,,,,
5,0.25,,0.25,0.25,0.25,,,,,,...,,,,,,,,,,
6,1.00,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,1.00,,,,,,,,,,...,,,,,,,,,,
11169,1.00,,,,,,,,,,...,,,,,,,,,,
11170,1.00,,,,,,,,,,...,,,,,,,,,,
11172,1.00,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Inverse document frequency per column:
# log(number of rows / number of rows where the column is non-null).
n_documents = tf_df.shape[0]
document_frequency = tf_df.notna().sum(axis=0)
idf = np.log(n_documents / document_frequency)
idf

TSLA    0.112918
MARA    4.257345
TDOC    5.830415
ROKU    5.696884
NFLX    3.538602
          ...   
ADSK    9.162620
AIG     9.162620
CYRN    9.162620
VTRS    9.162620
LECO    9.162620
Length: 396, dtype: float64

In [11]:
# Scale each column's term frequency by that column's IDF; cells that were
# NaN (organization absent from the row) become 0.
tf_idf_df = tf_df.mul(idf, axis='columns').fillna(0)
tf_idf_df

Unnamed: 0,TSLA,MARA,TDOC,ROKU,NFLX,MS,NDAQ,CCL,GME,MRK,...,EA,D,ECL,HCAT,HQY,ADSK,AIG,CYRN,VTRS,LECO
0,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.056459,2.128672,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.028229,0.000000,1.457604,1.424221,0.884651,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11169,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11170,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11172,0.112918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Write the TF-IDF matrix to the product path, creating parent folders if missing.
output_file_path = product['data']
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
tf_idf_df.to_csv(output_file_path)
print(f"Saved file {output_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/features/tfidf_vector.csv


In [13]:
# Drop the intermediate frames from the kernel namespace.
del tf_df
del tf_idf_df