In [1]:
# Declare the list of tasks whose products you want to use as inputs.
# It is set to None here, so this notebook declares no upstream products.
upstream = None

In [2]:
# Parameters
# NOTE(review): these values look like they are injected by the pipeline runner
# rather than edited by hand — confirm before changing them in place.
# query_params drives the BigQuery request built further down:
#   rolling_window - passed to utils.get_start_date (unit not visible here;
#                    presumably days — TODO confirm)
#   search_term    - substring matched against the lower-cased Organizations column
#   bq_table_name  - back-quoted table identifier interpolated into the FROM clause
query_params = {
    "rolling_window": 120,
    "search_term": "tesla",
    "bq_table_name": "`gdelt-bq.gdeltv2.gkg`",
}
# product["data"] is the CSV path the fetched records are written to.
# product["nb"] is not read anywhere in this notebook; presumably the path of
# the executed notebook copy — verify against the pipeline definition.
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
}


In [3]:
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from google.cloud import bigquery
from tqdm import tqdm
from src import utils

import re
import collections
import pandas as pd
import datetime
import json
import os
import sys
import warnings

# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the kernel session, including deprecation warnings from the libraries
# imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /Users/aiujdm2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiujdm2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/aiujdm2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Locate the file named market_watch.env and load the variables it defines
# into the process environment (the call returned True in the recorded run).
load_dotenv(find_dotenv('market_watch.env'))

True

Load the Google API key. This will enable us to query GDELT's BigQuery dataset.

In [5]:
# Debug: show which Python interpreter (and therefore which environment) is
# running this kernel.
print(sys.executable)
# Raises KeyError if GOOGLE_APPLICATION_CREDENTIALS is not set in the environment.
api_key_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
# Debug: show the value of GOOGLE_APPLICATION_CREDENTIALS. In the recorded run
# this is the path to a JSON credentials file, not the key material itself.
print(api_key_file)

/Users/aiujdm2/.local/share/virtualenvs/market_watch-dtlP-L11/bin/python
/Users/aiujdm2/market_watch/../.ssh/vgaurav-4d0e95d3663a.json


In [6]:
def build_gdelt_query(table_name, search_term, start_date):
    """Build the SQL text that selects GKG records mentioning an organization.

    Args:
        table_name: Table identifier interpolated verbatim into the FROM
            clause (the caller supplies any back-quoting).
        search_term: Substring to look for in the organizations column. It is
            lower-cased before interpolation because the column is compared
            through LOWER(...); otherwise a term containing any upper-case
            letter could never match. Backslashes and double quotes are
            escaped so the term cannot terminate the SQL string literal.
            LIKE wildcards (% and _) are passed through unchanged.
        start_date: Lower bound (exclusive) for the DATE column, interpolated
            verbatim; expected to already be in the format the table uses.

    Returns:
        The query as a string.
    """
    # The comparison target is LOWER(Organizations), so the pattern itself
    # must be lower-case for the match to be case-insensitive.
    needle = str(search_term).lower()
    # Escape for a double-quoted SQL string literal: backslash first, then the
    # quote, so the backslashes added for quotes are not doubled again.
    needle = needle.replace("\\", "\\\\").replace('"', '\\"')
    # NOTE(review): the WHERE clause names `Organizations`, which is also the
    # alias given to V2Organizations in the SELECT list. If the table has its
    # own Organizations column the filter presumably applies to that column,
    # not to V2Organizations — confirm this is intended.
    query_string = f"""
        SELECT
          GKGRECORDID,
          DATE,
          SourceCollectionIdentifier,
          DocumentIdentifier,
          V2Locations AS Locations,
          V2Persons AS Persons,
          V2Organizations AS Organizations,
          V2Tone AS Tone
        FROM
          {table_name}
        WHERE
          LOWER(Organizations) LIKE "%{needle}%"
          AND DATE > {start_date}
    """
    return query_string

In [7]:
def fetch_data(bqclient, query_string):
    """Run a query through the given BigQuery client and return a DataFrame.

    Args:
        bqclient: Client object exposing ``query(sql)``; the returned job's
            ``result()`` must provide ``to_dataframe``.
        query_string: SQL text to execute.

    Returns:
        Whatever ``to_dataframe`` returns for the finished query.
    """
    query_job = bqclient.query(query_string)
    # result() blocks until the job has finished and yields the row iterator.
    row_iterator = query_job.result()
    # The BigQuery Storage API is requested explicitly. As of
    # google-cloud-bigquery 1.26.0 it is already the default, so the flag is
    # only spelled out for clarity.
    return row_iterator.to_dataframe(create_bqstorage_client=True)

In [8]:
# Unpack the injected parameters into the names the following cells use.
rolling_window = query_params["rolling_window"]
table_name = query_params["bq_table_name"]
search_term = query_params["search_term"]
# Destination of the raw CSV written at the end of the notebook.
gkg_file_path = product["data"]

In [9]:
# Derive the start date from the rolling window, convert it with
# utils.gdelt_date_format, and build the query text from it.
start_date = utils.gdelt_date_format(utils.get_start_date(rolling_window))
gkg_query = build_gdelt_query(
    table_name=table_name,
    search_term=search_term,
    start_date=start_date,
)

In [10]:
# Client() is created without arguments, so project and credentials are taken
# from the environment (presumably via GOOGLE_APPLICATION_CREDENTIALS, which
# an earlier cell reads — confirm).
client = bigquery.Client()
# Execute the query and materialize every matching row as a DataFrame.
data_df = fetch_data(client, gkg_query)
# Report how many rows the query returned.
print(f"Processed file with {len(data_df)} records")

Processed file with 11174 records


In [11]:
# Preview the first rows of the fetched records.
data_df.head()

Unnamed: 0,GKGRECORDID,DATE,SourceCollectionIdentifier,DocumentIdentifier,Locations,Persons,Organizations,Tone
0,20220209144500-T765,20220209144500,1,https://www.latina.pe/noticias/latina-noticias...,1#Americans#US#US##39.828175#-98.5795#US#450;1...,,"Reuters,141;Tesla Inc,153;United States,125;Un...","-5.64516129032258,2.01612903225806,7.661290322..."
1,20220424150000-T799,20220424150000,1,https://news.sina.com.tw/article/20220424/4167...,"4#Hongguang, Heilongjiang, China#CH#CH08#13176...","Black Cat,982;Black Cat,1605;Ben Ben,701","Junior,61;Junior,230;Junior,636;Junior,1529;Ju...","-1.25588697017268,2.04081632653061,3.296703296..."
2,20220424143000-T1021,20220424143000,1,http://www.stcn.com/./xw/sd/202204/t20220424_4...,"4#Hongguang, Heilongjiang, China#CH#CH08#13176...","Ben Ben,818","Junior,72;Junior,233;Junior,753;Junior,1732;Ju...","-1.54929577464789,2.11267605633803,3.661971830..."
3,20220424153000-T487,20220424153000,1,http://www.taringa.net/+offtopic/nicolas-cage-...,,,"Corporation Tesla,737","-1.875,1.875,3.75,5.625,19.375,1.875,141"
4,20220424141500-545,20220424141500,1,https://www.arkansasonline.com/news/2022/apr/2...,"2#Texas, United States#US#USTX##31.106#-97.647...","Yvonne Taylor,4233;Elon Musk,3562;Fred Thiel,6...","Montana Environmental Information Center,4377;...","-1.41043723554302,2.39774330042313,3.808180535..."


In [12]:
# Save GKG records
Path(gkg_file_path).parent.mkdir(exist_ok=True, parents=True)
data_df.to_csv(gkg_file_path)
print(f"Saved file {gkg_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv


In [13]:
# Drop the notebook's reference to the fetched DataFrame now that it is saved.
del data_df