In [1]:
# Declare the list of tasks whose products you want to use as inputs.
# None here: this notebook declares no upstream tasks.
upstream = None

In [2]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by the pipeline runner;
# if so, manual edits here may be overwritten on the next run — confirm.
# query_params: settings read further down to build the BigQuery query.
query_params = {
    "rolling_window": 120,  # passed to utils.get_start_date; unit not visible here (presumably days — confirm)
    "search_term": "tesla",  # substring matched against the lower-cased Organizations column
    "bq_table_name": "`gdelt-bq.gdeltv2.gkg`",  # already back-tick quoted for direct interpolation into SQL
}
# product: output locations for this task; "data" is where the fetched records are saved as CSV.
product = {
    "nb": "/root/market_watch/output/notebooks/fetch_n_filter_gdelt_bq.ipynb",
    "data": "/root/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv",
}


In [3]:
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from google.cloud import bigquery
from tqdm import tqdm
from src import utils

import re
import collections
import pandas as pd
import datetime
import json
import os
import sys
import warnings

# Blanket filter: suppresses every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the BigQuery/pandas stack.
warnings.filterwarnings('ignore')

In [4]:
# Locate market_watch.env and load the variables it defines into the process environment.
load_dotenv(find_dotenv('market_watch.env'))

True

Load the path to the Google service-account key from the environment. This lets us query GDELT's BigQuery dataset.

In [5]:
# Debug: show which Python interpreter this kernel is running on.
print(sys.executable)
# Read the credentials file path; raises KeyError if GOOGLE_APPLICATION_CREDENTIALS is unset.
api_key_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
# Debug: confirm the credentials path was loaded (prints the path only, not the key contents).
print(api_key_file)

/usr/local/bin/python
/root/market_watch/../.ssh/vgaurav-4d0e95d3663a.json


In [6]:
def build_gdelt_query(table_name, search_term, start_date):
    """Build the SQL text that selects GDELT GKG records mentioning a term.

    The query filters rows whose lower-cased ``Organizations`` value contains
    ``search_term`` and whose ``DATE`` is strictly greater than ``start_date``.

    Args:
        table_name: Table reference interpolated verbatim into the FROM clause;
            it must already be quoted as BigQuery requires (e.g. with
            back-ticks). It is NOT escaped, so it must come from trusted config.
        search_term: Substring to look for. It is lower-cased here because the
            column side of the comparison is lower-cased, and backslashes and
            double quotes are escaped so the term cannot terminate the string
            literal. LIKE wildcards (``%``, ``_``) inside the term are left
            untouched.
        start_date: Lower bound (exclusive) for ``DATE``, interpolated unquoted.
            Presumably an integer-like YYYYMMDDHHMMSS value as produced by
            ``utils.gdelt_date_format`` — confirm against that helper.

    Returns:
        The query as a string.

    NOTE(review): the WHERE clause names ``Organizations``, while the SELECT
    list aliases ``V2Organizations`` AS ``Organizations``. SELECT aliases are
    not visible in WHERE, so the filter presumably applies to the table's own
    ``Organizations`` column rather than ``V2Organizations`` — confirm this is
    intended.
    """
    # The column is wrapped in LOWER(), so a term containing uppercase letters
    # could never match unless it is lower-cased as well.
    needle = str(search_term).lower()
    # Escape backslash first, then the double quote that delimits the literal.
    needle = needle.replace("\\", "\\\\").replace('"', '\\"')
    query_string = f"""
        SELECT
          GKGRECORDID,
          DATE,
          SourceCollectionIdentifier,
          DocumentIdentifier,
          V2Locations AS Locations,
          V2Persons AS Persons,
          V2Organizations AS Organizations,
          V2Tone AS Tone
        FROM
          {table_name}
        WHERE
          LOWER(Organizations) LIKE "%{needle}%"
          AND DATE > {start_date}
    """
    return query_string

In [7]:
def fetch_data(bqclient, query_string):
    """Execute ``query_string`` through ``bqclient`` and return a DataFrame.

    Args:
        bqclient: Client object exposing ``query(sql)``; the returned job must
            provide ``result()``, whose value provides ``to_dataframe()``.
        query_string: SQL text to run.

    Returns:
        Whatever ``to_dataframe`` produces for the query result.
    """
    query_job = bqclient.query(query_string)
    rows = query_job.result()
    # Ask explicitly for the BigQuery Storage API. From google-cloud-bigquery
    # 1.26.0 onward it is already the default, so this only spells it out.
    return rows.to_dataframe(create_bqstorage_client=True)

In [8]:
# Pull the individual settings out of the parameter dictionaries.
rolling_window, table_name, search_term = (
    query_params[key]
    for key in ("rolling_window", "bq_table_name", "search_term")
)
gkg_file_path = product["data"]

In [9]:
# Derive the start of the rolling window, convert it with utils.gdelt_date_format,
# then assemble the query text.
start_date = utils.gdelt_date_format(utils.get_start_date(rolling_window))
gkg_query = build_gdelt_query(
    table_name=table_name,
    search_term=search_term,
    start_date=start_date,
)

In [10]:
# Create a BigQuery client (no explicit credentials passed, so the library's
# default lookup applies), run the query, and report how many rows came back.
client = bigquery.Client()
data_df = fetch_data(bqclient=client, query_string=gkg_query)
print(f"Processed file with {len(data_df)} records")

Processed file with 10700 records


In [11]:
# Preview the first rows of the fetched records.
data_df.head()

Unnamed: 0,GKGRECORDID,DATE,SourceCollectionIdentifier,DocumentIdentifier,Locations,Persons,Organizations,Tone
0,20211221054500-326,20211221054500,1,https://www.thehindu.com/sci-tech/technology/r...,"3#Boston, Massachusetts, United States#US#USMA...","Nathaniel Mendell,2375;Igor Sladkov,2004;Nikol...","Exchange Commission,1128;Democratic Party,1732...","-4.69135802469136,1.97530864197531,6.666666666..."
1,20211221054500-436,20211221054500,1,https://news.am/eng/news/678328.html,1#Americans#US#US##39.828175#-98.5795#US#883;1...,"Elon Musk,28;Elon Musk,349","Space Exploration Technologies Corp,586;Twitte...","-0.921658986175115,2.30414746543779,3.22580645..."
2,20211221040000-T864,20211221040000,1,https://tech.ifeng.com/c/8C9OfDFwt8Q,"1#Finland#FI#FI##64#26#FI#242;4#Beijing, Beiji...",,"Tesla Shanghai,2306;Technology Science,4028","-2.9940119760479,2.54491017964072,5.5389221556..."
3,20211221043000-T739,20211221043000,1,https://www.chinatimes.com/realtimenews/202112...,"2#Texas, United States#US#USTX##31.106#-97.647...","Tesla A Tesla,1816","Samsung,7;Samsung,98;Samsung,473;Samsung,527;S...","-1.36054421768708,0.340136054421769,1.70068027..."
4,20211221044500-T834,20211221044500,1,https://digi.china.com/digi/20211221/202112219...,"3#Houston, Texas, United States#US#USTX#TX201#...","Tesla Roewe,58","Technology Tesla,1852","-5.24781341107872,1.45772594752187,6.705539358..."


In [12]:
# Persist the fetched GKG records as CSV, creating the target folder if needed.
output_path = Path(gkg_file_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# The DataFrame index is written too (to_csv default), as an unnamed first column.
data_df.to_csv(output_path)
print(f"Saved file {gkg_file_path}")

Saved file /root/market_watch/output/data/raw/gdelt_gkg_bqdata-raw.csv


In [13]:
# Drop the notebook's reference to the DataFrame now that it has been written to disk.
del data_df