In [1]:
# Declare the list of tasks whose products you want to use as inputs.
# None means this notebook declares no upstream tasks.
upstream = None

In [2]:
# Parameters
# NOTE(review): this cell looks like it is injected by the pipeline runner —
# confirm before editing values by hand.
# query_params: inputs used below to build the BigQuery query
#   (rolling window, search term, and the fully-qualified table name).
query_params = {
    "rolling_window": 120,
    "search_term": "tesla",
    "bq_table_name": "`gdelt-bq.gdeltv2.gkg`",
}
# product: output locations; "data" is the CSV path the fetched records are
# written to further down in this notebook.
product = {
    "nb": "/Users/aiujdm2/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
    "data": "/Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
}


In [3]:
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from google.cloud import bigquery
from tqdm import tqdm
from src import utils


import re
import collections
import pandas as pd
import datetime
import json
import os
import sys
import warnings

# Silence every warning raised after this point to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from the libraries used here.
warnings.filterwarnings('ignore')

In [4]:
# Locate the market_watch.env file with find_dotenv and load its variables
# into the process environment; the call's return value is shown as cell output.
load_dotenv(find_dotenv('market_watch.env'))

True

Load the Google API credentials. This enables us to query GDELT's BigQuery dataset.

In [5]:
# Debug aid: print the path of the Python interpreter running this notebook.
print(sys.executable)
# Read the credentials file path from the environment; raises KeyError if
# GOOGLE_APPLICATION_CREDENTIALS is not set.
api_key_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
# Debug aid: print the credentials path that was read above.
print(api_key_file)

/Users/aiujdm2/.local/share/virtualenvs/market_watch-dtlP-L11/bin/python
/Users/aiujdm2/market_watch/../.ssh/vgaurav-4d0e95d3663a.json


In [6]:
def build_gdelt_query(table_name, search_term, start_date):
    """Build the SQL text that selects GKG records mentioning ``search_term``.

    Args:
        table_name: Table reference, interpolated verbatim after ``FROM``.
        search_term: Substring to look for in the organizations column. It is
            lower-cased here because the query compares it against
            ``LOWER(Organizations)``; a mixed-case term would otherwise never
            match.
        start_date: Exclusive lower bound, interpolated verbatim into
            ``DATE > ...``.

    Returns:
        str: The query text.

    Note:
        All three values are interpolated without escaping, so only pass
        trusted, pipeline-controlled values.
    """
    # The column side of the comparison is lower-cased in SQL, so normalise the
    # term the same way to keep the match case-insensitive.
    search_term = str(search_term).lower()
    # NOTE(review): the WHERE clause names `Organizations`, while the SELECT
    # list exposes `V2Organizations AS Organizations`. SELECT aliases are
    # presumably not visible in WHERE, so the filter would apply to the table's
    # own `Organizations` column — confirm that this is intended.
    query_string = f"""
        SELECT
          GKGRECORDID,
          DATE,
          SourceCollectionIdentifier,
          DocumentIdentifier,
          V2Locations AS Locations,
          V2Persons AS Persons,
          V2Organizations AS Organizations,
          V2Tone AS Tone
        FROM
          {table_name}
        WHERE
          LOWER(Organizations) LIKE "%{search_term}%"
          AND DATE > {start_date}
    """
    return query_string

In [7]:
def fetch_data(bqclient, query_string):
    """Run ``query_string`` through ``bqclient`` and return the rows as a DataFrame.

    Args:
        bqclient: Client object exposing ``query(...)``.
        query_string: SQL text to execute.

    Returns:
        Whatever ``to_dataframe`` returns for the finished query result.
    """
    query_job = bqclient.query(query_string)
    result_rows = query_job.result()
    # Explicitly ask for the BigQuery Storage API when materialising the rows.
    # From google-cloud-bigquery 1.26.0 onwards it is already the default.
    return result_rows.to_dataframe(create_bqstorage_client=True)

In [8]:
# Unpack the notebook parameters used by the cells below.
gkg_file_path = product["data"]
search_term = query_params["search_term"]
table_name = query_params["bq_table_name"]
rolling_window = query_params["rolling_window"]

# Derive the start date from the rolling window, pass it through
# utils.gdelt_date_format, then assemble the query text.
start_date = utils.gdelt_date_format(utils.get_start_date(rolling_window))
gkg_query = build_gdelt_query(
    table_name=table_name,
    search_term=search_term,
    start_date=start_date,
)

In [9]:
# Create the BigQuery client, run the prepared query and report the row count.
client = bigquery.Client()
data_df = fetch_data(bqclient=client, query_string=gkg_query)
record_count = len(data_df)
print(f"Processed file with {record_count} records")

Processed file with 10553 records


In [10]:
# Preview the first rows of the fetched records.
data_df.head()

Unnamed: 0,GKGRECORDID,DATE,SourceCollectionIdentifier,DocumentIdentifier,Locations,Persons,Organizations,Tone
0,20211219053000-283,20211219053000,1,https://www.moneyweb.co.za/mineweb/the-world-w...,1#Serbian#RB#RB####RB#2122;1#Serbian#RB#RB####...,"Mark Cutifani,5182;Ben Davis,1979;Aleksandar V...","Bloomberg,5709;Lithium Americas Corp,1370;Rio ...","-3.60824742268041,1.54639175257732,5.154639175..."
1,20211219054500-T653,20211219054500,1,http://hk.eastmoney.com/a/202112192218549712.html,1#Chile#CI#CI##-30#-71#CI#9804;1#Chile#CI#CI##...,"Broadcom Si,4001;Ali Baba,11124;Tesla Xiaopeng...","Landing Division,7339;Landing Hong Kong Exchan...","0.317796610169492,2.22457627118644,1.906779661..."
2,20211219064500-103,20211219064500,1,https://www.afr.com/companies/mining/world-wan...,1#Serbian#RB#RB####RB#440;1#Serbian#RB#RB####R...,"Mark Cutifani,3485;Ben Davis,297;Aleksandar Vu...","Bloomberg,4004;Bloomberg,4021;Rio Tinto,1238;T...","-2.84090909090909,1.5625,4.40340909090909,5.96..."
3,20211219090000-141,20211219090000,1,https://www.torquenews.com/comment/reply/43229,"1#Taiwan#TW#TW##24#121#TW#3025;4#Delft, Zuid-H...","Elon Musk,3542;Nico Caballero,4238;Tesla Cyber...","Samsung,42;Samsung,414;Samsung,943;Samsung,226...","1.84453227931489,2.23978919631094,0.3952569169..."
4,20211219101500-T57,20211219101500,1,https://www.tomsguide.fr/le-developpement-de-g...,1#Germany#GM#GM##51.5#10.5#GM#1991;1#American#...,"Willem Dafoe,558;Willem Dafoe,2947;Willem Dafo...","Forum Of Tesla,1598;Klipsch,4578;Klipsch,4745;...","1.06295993458708,4.82420278004906,3.7612428454..."


In [11]:
# Persist the GKG records as CSV, creating the destination folder
# (and any missing parents) first.
output_dir = Path(gkg_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
data_df.to_csv(gkg_file_path)
print(f"Saved file {gkg_file_path}")

Saved file /Users/aiujdm2/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv


In [12]:
# Remove the data_df name from the notebook namespace; the frame has already
# been written to CSV by the save cell.
del data_df