In [1]:
# If this task has dependencies, list them here
# (e.g. upstream = ['some_task']), otherwise leave as None.
upstream = ['clean_gdelt_data', 'total_org_count']

# This is a placeholder, leave it as None
product = None

In [2]:
# Parameters
# NOTE(review): this cell rebinds `upstream` and `product` from the
# placeholders declared earlier to concrete notebook/data paths. It looks
# auto-generated by the pipeline runner — confirm before editing by hand.
upstream = {
    "clean_gdelt_data": {
        "nb": "/root/market_watch/output/notebooks/clean_gdelt_data.ipynb",
        "data": "/root/market_watch/output/data/interim/gdelt_gkg_data-cleaned.csv",
    },
    "total_org_count": {
        "nb": "/root/market_watch/output/notebooks/total_org_count.ipynb",
        "data": "/root/market_watch/output/data/interim/total_org_counts.csv",
    },
}
product = {
    "nb": "/root/market_watch/output/notebooks/create_tfidf.ipynb",
    "data": "/root/market_watch/output/data/features/tfidf_vector.csv",
}


In [3]:
import pandas as pd
import numpy as np
import warnings
import json
import ast
from pathlib import Path

# Silence every warning category for the rest of the notebook run.
# NOTE(review): this also hides pandas deprecation warnings — confirm that is intended.
warnings.simplefilter("ignore")

In [4]:
# Pull the 'data' path of each upstream task out of the parameters mapping.
gdelt_file_path, total_count_path = (
    upstream[task_name]['data']
    for task_name in ('clean_gdelt_data', 'total_org_count')
)

In [5]:
# Both CSVs store their row labels in the first column, so use it as the index.
gdelt_df = pd.read_csv(filepath_or_buffer=gdelt_file_path, index_col=0)
total_org_count_df = pd.read_csv(filepath_or_buffer=total_count_path, index_col=0)

In [6]:
# Print column names, dtypes and non-null counts of the loaded GDELT frame.
gdelt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9042 entries, 0 to 10698
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GKGRECORDID    9042 non-null   object 
 1   Locations      9042 non-null   object 
 2   Persons        7460 non-null   object 
 3   Organizations  9042 non-null   object 
 4   AvgTone        9042 non-null   float64
 5   PosScore       9042 non-null   float64
 6   NegScore       9042 non-null   float64
 7   Polarity       9042 non-null   float64
dtypes: float64(4), object(4)
memory usage: 635.8+ KB


In [7]:
# Print column names, dtypes and non-null counts of the organization count table.
total_org_count_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 402 entries, tesla to precipio
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   count   402 non-null    int64 
 1   ticker  402 non-null    object
dtypes: int64(1), object(1)
memory usage: 9.4+ KB


In [8]:
def convert_to_dict(string):
    """Parse a stringified Python dict literal into a pandas Series.

    Despite the name, the return value is a ``pd.Series`` whose index holds
    the dict keys and whose values hold the dict values.

    Parameters
    ----------
    string : str
        Text of a Python literal, e.g. ``"{'tesla': 2, 'twitter': 1}"``.

    Returns
    -------
    pd.Series
        One entry per key of the parsed literal, in the literal's key order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``string`` is not a valid
        Python literal.
    """
    # literal_eval already yields plain Python objects, so the former
    # json.dumps/json.loads round trip changed nothing for string-keyed dicts
    # and only added per-row overhead.
    parsed = ast.literal_eval(string)
    return pd.Series(parsed)

# Expand each row's stringified mapping in 'Organizations' into one column per
# key (presumably organization name -> mention count; verify upstream).
# Keys absent from a given row come out as NaN in that row.
tf_df = gdelt_df['Organizations'].apply(convert_to_dict)

# Build the name -> ticker lookup straight from the count table's index.
# This avoids a row-by-row iterrows() pass and no longer depends on
# reset_index() naming the former index column 'index', which fails with a
# KeyError as soon as the index carries a name.
ticker_name_dict = total_org_count_df['ticker'].to_dict()

# Relabel columns with their tickers; labels missing from the lookup are kept
# unchanged. Reassigning instead of inplace=True keeps the step re-runnable.
tf_df = tf_df.rename(columns=ticker_name_dict)

In [9]:
# Scale every row by its own total so the non-missing entries of a row sum to 1.
tf_df = tf_df.divide(tf_df.sum(axis="columns"), axis="index")
tf_df

Unnamed: 0,TSLA,TWTR,ARMK,HUBG,GPI,LCID,BOX,GTY,SONY,DIS,...,CFG,LIN,SIRI,NLOK,CYRN,UNM,LEE,HRL,KSPN,PRPO
0,1.000000,,,,,,,,,,...,,,,,,,,,,
1,0.500000,0.500000,,,,,,,,,...,,,,,,,,,,
2,1.000000,,,,,,,,,,...,,,,,,,,,,
6,1.000000,,,,,,,,,,...,,,,,,,,,,
7,1.000000,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10694,0.250000,,,,,,,,,,...,,,,,,,,,,
10695,1.000000,,,,,,,,,,...,,,,,,,,,,
10696,1.000000,,,,,,,,,,...,,,,,,,,,,
10697,1.000000,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Per-column inverse document frequency:
# ln(number of rows / number of rows holding a non-missing value in that column).
idf = np.log(tf_df.shape[0] / tf_df.count(axis="index"))
idf

TSLA    0.108413
TWTR    1.401674
ARMK    8.416488
HUBG    9.109636
GPI     7.163726
          ...   
UNM     9.109636
LEE     9.109636
HRL     9.109636
KSPN    9.109636
PRPO    9.109636
Length: 402, dtype: float64

In [11]:
# Weight each column of tf_df by its idf value (aligned on column labels),
# then turn the remaining missing cells into 0.
tf_idf_df = tf_df.mul(idf, axis="columns").fillna(0)
tf_idf_df

Unnamed: 0,TSLA,TWTR,ARMK,HUBG,GPI,LCID,BOX,GTY,SONY,DIS,...,CFG,LIN,SIRI,NLOK,CYRN,UNM,LEE,HRL,KSPN,PRPO
0,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.054206,0.700837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10694,0.027103,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10695,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10696,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10697,0.108413,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Write the TF-IDF frame to the path given by product['data'], creating the
# destination folder (and any missing parents) first.
output_file_path = product['data']
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
tf_idf_df.to_csv(path_or_buf=output_file_path)
print(f"Saved file {output_file_path}")

Saved file /root/market_watch/output/data/features/tfidf_vector.csv


In [13]:
# Drop the notebook's references to the two large intermediate frames.
del tf_df
del tf_idf_df