In [84]:
import numpy as np
import pandas as pd
import os, requests, zipfile, warnings, time, random
from io import BytesIO
import datetime
import tqdm
import concurrent.futures
from threading import Lock
warnings.filterwarnings('ignore')

In [2]:
test_url = 'http://data.gdeltproject.org/gdeltv2/20250219000000.gkg.csv.zip'
base_url = 'http://data.gdeltproject.org/gdeltv2/'

In [None]:
# GCAM sentiment variable dictionary
doc = pd.read_csv(r"..\Data\GCAM-MASTER-CODEBOOK.TXT",
        delimiter='\t',
        encoding='latin',
        low_memory=False,
        usecols=['Variable', 'Type', 'DimensionHumanName'],
    )
doc['new_name'] = doc['Variable'] + '; ' + doc['Type'] + '; ' + doc['DimensionHumanName']
doc_dict = doc.to_dict(orient='list')

In [3]:
def parse_data(data):
    df = pd.read_csv(
        data,
        delimiter='\t',
        encoding='latin',
        low_memory=False,
        header=None,
        usecols=[0, 1, 2, 3, 4, 7, 9, 13, 15, 17, 25, 26],
        names=[
            'GKGRECORDID',
            'V2.1DATE',
            'V2SOURCECOLLECTIONIDENTIFIER',
            'V2SOURCECOMMONNAME',
            'V2DOCUMENTIDENTIFIER',
            'V1THEMES',
            'V1LOCATIONS',
            'V1ORGANIZATIONS',
            'V1.5TONE',
            'V2GCAM',
            'V2.1TRANSLATIONINFO',
            'V2EXTRASXML'
        ]
    )
    return(df)

In [4]:
def download_unzip(url):
    """
    Downloads a zipped CSV file from the given URL, extracts the first file in the archive,
    and parses it into a pandas DataFrame using the parse_data function.

    Args:
        url (str): The URL to the .zip file containing the CSV.

    Returns:
        tuple: (DataFrame, status_code)
            DataFrame: The parsed data as a pandas DataFrame.
            status_code: The HTTP status code from the download request.
    """
    response = requests.get(url)
    status_code = response.status_code
    
    # Check if request was successful before proceeding
    if status_code != 200:
        return None, status_code
    
    else:
        # Create a BytesIO object from the downloaded content
        zip_bytes = BytesIO(response.content)

        # Open the zip file in memory
        with zipfile.ZipFile(zip_bytes, 'r') as zip_file:
            # Get the first file in the zip
            first_file = zip_file.namelist()[0]
            
            # Read the file content directly into memory
            with zip_file.open(first_file) as file:
                return(parse_data(file), status_code)

In [96]:
df,code=download_unzip(test_url)
print(code)

200


In [95]:
# testing a bad link
df, code = download_unzip('http://data.gdeltproject.org/gdeltv2/2029999.gkg.csv.zip')
print(code)

404


In [5]:
def generate_url_list(start_dt, end_dt, increment_minutes=15):
    url_list = []
    current_dt = start_dt
    
    while current_dt <= end_dt:
        # Format as YYYYMMDDHHMMSS
        datetime_str = current_dt.strftime('%Y%m%d%H%M%S')
        url_list.append(f'{base_url}{datetime_str}.gkg.csv.zip')
        
        # Add increment
        current_dt += datetime.timedelta(minutes=increment_minutes)
    
    return url_list

In [47]:
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)
print(urls[0:10])

['http://data.gdeltproject.org/gdeltv2/20180101000000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101001500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101003000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101004500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101010000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101011500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101013000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101014500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101020000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101021500.gkg.csv.zip']


In [6]:
# function to chunk datetime_strings into groups of 100
def chunk_datetime_strings(urls, chunk_size=100):
    return [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]

In [22]:
def download_chunk_parallel(chunk, max_workers=10):
    """
    Downloads and processes multiple GDELT CSV files in parallel using threading.
    
    This function downloads zipped CSV files from URLs, extracts and parses them into
    pandas DataFrames, and combines them into a single DataFrame. It includes error
    handling, logging of failed downloads, and automatic stopping if too many
    consecutive downloads fail.
    
    Args:
        chunk (list): List of URLs pointing to .zip files containing CSV data.
        max_workers (int, optional): Maximum number of concurrent threads for downloading.
            Defaults to 10.
    
    Returns:
        pandas.DataFrame or None: Combined DataFrame containing data from all successful
            downloads, or None if no downloads succeeded.
    
    Features:
        - Parallel downloading using ThreadPoolExecutor
        - Thread-safe logging of failed downloads to timestamped log file
        - Progress bar showing download status
        - Automatic stopping if last 10 consecutive downloads fail
        - Error handling for network issues and parsing problems
    
    Error Handling:
        - HTTP errors (non-200 status codes) are logged
        - Network exceptions are caught and logged
        - Failed downloads are written to 'failed_downloads_YYYYMMDD_HHMMSS.txt'
        - Function stops early if 10 consecutive downloads fail to prevent wasted resources
    
    Note:
        Uses the existing download_unzip() and parse_data() functions for individual file processing.
    """
    df_list = []
    response_codes = []
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"../logs/failed_downloads_{current_time}.txt"
    lock = Lock()  # For thread-safe file writing
    
    def download_single_url(url):
        try:
            df, code = download_unzip(url)
            
            # Thread-safe logging of response codes
            with lock:
                response_codes.append(code)
                
                if code != 200:
                    with open(log_file, "a") as f:
                        f.write(f"{url} - Status Code: {code}\n")
                else:
                    return df
                    
                # Check last 10 response codes
                if len(response_codes) >= 10 and all(c != 200 for c in response_codes[-10:]):
                    raise Exception("Error: Last 10 downloads failed (status code != 200). Stopping.")

        except Exception as e:
            with lock:
                with open(log_file, "a") as f:
                    f.write(f"{url} - Error: {str(e)}\n")
            print(f"Failed to download or parse {url}: {str(e)}")
            return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks
        future_to_url = {executor.submit(download_single_url, url): url for url in chunk}
        
        # Collect results with progress bar
        for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_url), 
                               total=len(chunk), desc="Downloading", unit="file"):
            try:
                result = future.result()
                if result is not None:
                    df_list.append(result)
            except Exception as e:
                print(f"Error in thread execution: {str(e)}")
                break

            time.sleep(0.1)
    
    if df_list:
        return pd.concat(df_list, ignore_index=True)
    else:
        print(f"None of the URLs worked.")
        return None

In [100]:
test=urls[0:10]
df=download_chunk_parallel(test, max_workers=2)
df

Downloading: 100%|██████████| 10/10 [00:03<00:00,  2.82file/s]


Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
0,20180101000000-0,20180101000000,1,stamfordadvocate.com,http://www.stamfordadvocate.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_CHIEF;WB_696_PUBLIC_SECT...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.64583333333333,3.38541666666667,7.03125,10....","wc:350,c12.1:42,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_PRECISEPUBTIMESTAMP>20171231231600</PAGE...
1,20180101000000-1,20180101000000,1,thestar.com.my,https://www.thestar.com.my/business/business-n...,,1#Malaysia#MY#MY#2.5#112.5#MY;1#Japan#JA#JA#36...,public investment bank;zhejiang geely holding ...,"1.51098901098901,2.47252747252747,0.9615384615...","wc:671,c1.2:9,c1.3:1,c12.1:18,c12.10:59,c12.11...",,<PAGE_LINKS>https://www.thestar.com.my/Search<...
2,20180101000000-2,20180101000000,1,radaronline.com,https://radaronline.com/celebrity-news/bow-wow...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;TAX_WORLD...,1#United States#US#US#39.828175#-98.5795#US,,"4.66101694915254,5.50847457627119,0.8474576271...","wc:206,c12.1:31,c12.10:21,c12.12:6,c12.13:8,c1...",,<PAGE_LINKS>http://radaronline.com/tag/bow-wow...
3,20180101000000-3,20180101000000,1,sanantoniopost.com,http://www.sanantoniopost.com/news/255927392/g...,,"3#Houston, Texas, United States#US#USTX#29.763...",,"-0.704225352112676,1.93661971830986,2.64084507...","wc:527,c12.1:24,c12.10:43,c12.12:19,c12.13:7,c...",,<PAGE_LINKS>http://www.sanantoniopost.com/revi...
4,20180101000000-4,20180101000000,1,complex.com,http://www.complex.com/life/2017/12/dinosaur-e...,,"1#China#CH#CH#35#105#CH;4#Ganzhou, Jiangxi, Ch...",,"-0.495049504950495,1.48514851485149,1.98019801...","wc:187,c1.3:1,c12.1:9,c12.10:10,c12.12:4,c12.1...",,<PAGE_LINKS>http://www.cnn.com/2015/04/21/asia...
...,...,...,...,...,...,...,...,...,...,...,...,...
13853,20180101021500-1303,20180101021500,1,nintendo-insider.com,https://www.nintendo-insider.com/farming-simul...,,,giants software,"1.40646976090014,4.36005625879044,2.9535864978...","wc:667,c1.2:1,c1.3:1,c12.1:59,c12.10:70,c12.11...",,<PAGE_LINKS>https://www.nintendo-insider.com/f...
13854,20180101021500-1304,20180101021500,1,onlinenigeria.com,https://news2.onlinenigeria.com/news/general/6...,TAX_FNCACT;TAX_FNCACT_HOUSEMAID;TAX_FNCACT_WOM...,"1#Nigeria#NI#NI#10#8#NI;4#Joro, Borno, Nigeria...",,"-1.33779264214047,2.67558528428094,4.013377926...","wc:268,c12.1:16,c12.10:22,c12.12:13,c12.13:6,c...",,<PAGE_LINKS>https://news2.onlinenigeria.com/;h...
13855,20180101021500-1305,20180101021500,1,indiatimes.com,https://economictimes.indiatimes.com/news/inte...,RAPE;USPEC_POLICY1;TAX_FNCACT;TAX_FNCACT_CHIEF...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.8265306122449,3.31632653061224,7.1428571428...","wc:356,c12.1:43,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_LINKS>https://m.economictimes.com/topic/...
13856,20180101021500-1306,20180101021500,1,waateanews.com,https://www.waateanews.com/waateanews/x_news/M...,EDUCATION;ELECTION;,"4#Mangere, New Zealand (General), New Zealand#...",family support services mangere labour;televis...,"1.66666666666667,4.83333333333333,3.1666666666...","wc:551,c1.3:1,c12.1:34,c12.10:37,c12.12:12,c12...",,<PAGE_LINKS>http://www.waateanews.com;http://w...


In [101]:
df.columns

Index(['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
       'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES', 'V1LOCATIONS',
       'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM', 'V2.1TRANSLATIONINFO',
       'V2EXTRASXML'],
      dtype='object')

In [None]:
def first_pass_clean(df):
    # Limit to include United States
    df = df[df['V1LOCATIONS'].str.contains('united states', case=False, na=False)]

    # Limit to include airlines
    df = df[df['V1ORGANIZATIONS'].str.contains(
        "Alaska Airlines|American Airlines|Delta Air Lines|Frontier Airlines|Hawaiian Airlines|JetBlue|Southwest Airlines|Spirit Airlines|Sun Country Airlines|United Airlines|Allegiant Air"
        , case=False, na=False, regex=True)]
    
    # Drop if missing fields
    df = df.dropna(
        subset=['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
       'V2DOCUMENTIDENTIFIER', 'V1LOCATIONS',
       'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM']
        )
    
    # reset the index
    df = df.reset_index(drop=True)

    return(df)

In [94]:
def second_pass_clean(df):
    df['V2SOURCECOLLECTIONIDENTIFIER'] = df['V2SOURCECOLLECTIONIDENTIFIER'].astype(np.int8)

    # dates
    df['datetime'] = pd.to_datetime(df['V2.1DATE'], format='%Y%m%d%H%M%S', errors='coerce')
    df['date'] = df['datetime'].dt.date

    # company Dummies
    to_check= ["airplane","airline","airport","Alaska Airlines","American Airlines","Delta Air Lines","Frontier Airlines","Hawaiian Airlines","JetBlue","Southwest Airlines","Spirit Airlines","Sun Country Airlines","United Airlines","Allegiant Air"]
    for word in to_check:
        df[word] = df['V1ORGANIZATIONS'].str.contains(word, case=False, na=False).astype(np.int8)

    # Extract the article title from the V2EXTRASXML column, which is between <PAGE_TITLE> and </PAGE_TITLE> 
    df['article_title'] = df['V2EXTRASXML'].str.extract(r'<PAGE_TITLE>(.*?)</PAGE_TITLE>', expand=False)

    # Split V1.5TONE into multiple columns using , as the delimiter
    df[['Tone','Positive Score','Negative Score',
        'Polarity','Activity Reference Density',
        'Self/Group Reference Density','Word Count']] = df['V1.5TONE'].str.split(',', expand=True)
    # Convert the tone columns to numeric, coercing errors to 0
    df[['Tone','Positive Score','Negative Score','Polarity','Activity Reference Density',
        'Self/Group Reference Density','Word Count']] = df[[
        'Tone','Positive Score','Negative Score','Polarity','Activity Reference Density',
        'Self/Group Reference Density','Word Count']].apply(pd.to_numeric, downcast="integer", errors='coerce').fillna(0)

    df['V2GCAM'] = df['V2GCAM'].str.split(',')

    # Keep the first time that a V2DOCUMENTIDENTIFIER value appears
    df = df.sort_values(by=['V2DOCUMENTIDENTIFIER', 'datetime']).drop_duplicates(subset='V2DOCUMENTIDENTIFIER', keep='first')

    df.drop(columns=['V2.1DATE','V1LOCATIONS','V2EXTRASXML','V1.5TONE','V1ORGANIZATIONS'], inplace=True)

    # Handling GCAM attributes
    def list_to_dict(list):
        return {item.split(':')[0]: item.split(':')[1] for item in list if ':' in item}
    df['GCAM'] = df['V2GCAM'].apply(list_to_dict)

    # Now create the columns. Documentation says if the value is missing, it should be 0
    for i in range(len(doc_dict['Variable'])):
        key = doc_dict['Variable'][i]
        column_name = doc_dict['new_name'][i]
        df[column_name] = df['GCAM'].apply(lambda x: x.get(key) if x.get(key) is not None else 0)
        # Convert to numeric
        df[column_name] = pd.to_numeric(df[column_name], downcast="integer", errors='coerce').fillna(0)

    df.drop(columns=['V2GCAM','GCAM'], inplace=True)

    return(df)

In [None]:
def gdelt_wrapper(output_path = '../data/processed/',
        start = datetime.datetime(2018, 1, 1, 0, 0, 0),
        end = datetime.datetime(2025, 5, 1, 0, 0, 0),
        chunk_size = 100,
        num_chunks = 1,
        max_workers = 10,
        seed = 1234
        ):
    
    urls = generate_url_list(start, end, 15)
    # Randomly shuffle the urls
    random.seed(seed)
    random.shuffle(urls)

    chunked_datetime_strings = chunk_datetime_strings(urls, chunk_size)
    chunks = chunked_datetime_strings[0:num_chunks]

    for i in range(num_chunks):
        df = download_chunk_parallel(chunks[i], max_workers)
        df = first_pass_clean(df)
        df = second_pass_clean(df)

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"../data/processed/gdelt_cleaned_chunk{i}_of_{num_chunks}_{timestamp}.csv"
        df.to_csv(output_file, index=False)

In [24]:
# try downloading, cleaning, and saving the first chunk
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)

# Randomly shuffle the urls
random.seed(1234)  # For reproducibility
random.shuffle(urls)

chunked_datetime_strings = chunk_datetime_strings(urls, 10)
chunks = chunked_datetime_strings[0:2]

for chunk in chunks:
    df = download_chunk_parallel(chunk, max_workers=10)
    df = first_pass_clean(df)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"../data/processed/gdelt_cleaned_{timestamp}.csv"
    df.to_csv(output_file, index=False)


Downloading: 100%|██████████| 10/10 [00:03<00:00,  3.26file/s]
Downloading: 100%|██████████| 10/10 [00:03<00:00,  2.75file/s]


In [92]:
# More extensive cleaning
df2=pd.read_csv(r"../data/processed/gdelt_cleaned_20250529_142718.csv")

df2['V2SOURCECOLLECTIONIDENTIFIER'] = df2['V2SOURCECOLLECTIONIDENTIFIER'].astype(np.int8)

# dates
df2['datetime'] = pd.to_datetime(df2['V2.1DATE'], format='%Y%m%d%H%M%S', errors='coerce')
df2['date'] = df2['datetime'].dt.date

# company Dummies
to_check= ["airplane","airline","airport","Alaska Airlines","American Airlines","Delta Air Lines","Frontier Airlines","Hawaiian Airlines","JetBlue","Southwest Airlines","Spirit Airlines","Sun Country Airlines","United Airlines","Allegiant Air"]
for word in to_check:
    df2[word] = df2['V1ORGANIZATIONS'].str.contains(word, case=False, na=False).astype(np.int8)

# Extract the article title from the V2EXTRASXML column, which is between <PAGE_TITLE> and </PAGE_TITLE> 
df2['article_title'] = df2['V2EXTRASXML'].str.extract(r'<PAGE_TITLE>(.*?)</PAGE_TITLE>', expand=False)

# Split V1.5TONE into multiple columns using , as the delimiter
df2[['Tone','Positive Score','Negative Score',
     'Polarity','Activity Reference Density',
     'Self/Group Reference Density','Word Count']] = df2['V1.5TONE'].str.split(',', expand=True)
# Convert the tone columns to numeric, coercing errors to 0
df2[['Tone','Positive Score','Negative Score','Polarity','Activity Reference Density',
     'Self/Group Reference Density','Word Count']] = df2[[
     'Tone','Positive Score','Negative Score','Polarity','Activity Reference Density',
     'Self/Group Reference Density','Word Count']].apply(pd.to_numeric, downcast="integer", errors='coerce').fillna(0)

df2['V2GCAM'] = df2['V2GCAM'].str.split(',')

# Keep the first time that a V2DOCUMENTIDENTIFIER value appears
df2 = df2.sort_values(by=['V2DOCUMENTIDENTIFIER', 'datetime']).drop_duplicates(subset='V2DOCUMENTIDENTIFIER', keep='first')

df2.drop(columns=['V2.1DATE','V1LOCATIONS','V2EXTRASXML','V1.5TONE','V1ORGANIZATIONS'], inplace=True)

In [93]:
# Handling GCAM attributes
def list_to_dict(list):
    return {item.split(':')[0]: item.split(':')[1] for item in list if ':' in item}

df2['GCAM'] = df2['V2GCAM'].apply(list_to_dict)
#df2['GCAM']

# Now create the columns. Documentation says if the value is missing, it should be 0
for i in range(len(doc_dict['Variable'])):
    key = doc_dict['Variable'][i]
    column_name = doc_dict['new_name'][i]
    df2[column_name] = df2['GCAM'].apply(lambda x: x.get(key) if x.get(key) is not None else 0)
    # Convert to numeric
    df2[column_name] = pd.to_numeric(df2[column_name], downcast="integer", errors='coerce').fillna(0)

df2.drop(columns=['V2GCAM','GCAM'], inplace=True)

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"../data/processed/example_cleaned_{timestamp}.csv"
df2.to_csv(output_file, index=False)

In [83]:

df2

Unnamed: 0,GKGRECORDID,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V2.1TRANSLATIONINFO,datetime,date,airplane,airline,...,v42.2; SCOREDVALUE; care_p,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent
5492,20190731204500-1571,1.0,1310kfka.com,http://1310kfka.com/abc-news/u-s-news/250886/,ARREST;SOC_POINTSOFINTEREST;SOC_POINTSOFINTERE...,,2019-07-31 20:45:00,2019-07-31,False,True,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8006,20190306021500-1727,1.0,1310kfka.com,http://1310kfka.com/tsa-confiscates-rocket-pro...,WB_135_TRANSPORT;CRISISLEX_C07_SAFETY;SOC_USSE...,,2019-03-06 02:15:00,2019-03-06,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
431,20180622110000-1500,1.0,13wham.com,http://13wham.com/news/nation-world/confusion-...,SOC_POINTSOFINTEREST;SOC_POINTSOFINTEREST_AIRP...,,2018-06-22 11:00:00,2018-06-22,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1551,20180110100000-1052,1.0,234vibesnews.com,http://234vibesnews.com/2018/01/10/judge-ends-...,TAX_FNCACT;TAX_FNCACT_RANCHER;,,2018-01-10 10:00:00,2018-01-10,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1545,20180110100000-928,1.0,234vibesnews.com,http://234vibesnews.com/2018/01/10/norwegian-c...,UNGP_FORESTS_RIVERS_OCEANS;MARITIME_INCIDENT;M...,,2018-01-10 10:00:00,2018-01-10,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,20220225100000-710,1.0,india.com,https://zeenews.india.com/aviation/tata-to-upg...,TAX_FNCACT;TAX_FNCACT_EXECUTIVES;EPU_ECONOMY_H...,,2022-02-25 10:00:00,2022-02-25,False,True,...,0.096344,0.086119,0.079137,0.080076,0.071403,-0.106923,-0.087027,-0.034513,-0.052951,-0.131347
4384,20231023053000-741,1.0,india.com,https://zeenews.india.com/aviation/us-state-de...,GENERAL_GOVERNMENT;EPU_POLICY;EPU_POLICY_GOVER...,,2023-10-23 05:30:00,2023-10-23,False,False,...,0.123652,0.103561,0.098676,0.112102,0.086362,-0.170148,-0.094166,-0.098040,-0.114528,-0.147740
7895,20200716131500-742,1.0,india.com,https://zeenews.india.com/international-busine...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;TAX_FNCAC...,,2020-07-16 13:15:00,2020-07-16,False,True,...,0.108653,0.106998,0.091408,0.095910,0.074126,-0.112844,-0.029000,-0.016205,-0.043743,-0.059587
6832,20230605053000-1062,1.0,india.com,https://zeenews.india.com:443/aviation/us-figh...,ARMEDCONFLICT;MANMADE_DISASTER_IMPLIED;TAX_FNC...,,2023-06-05 05:30:00,2023-06-05,False,True,...,0.085124,0.081160,0.072742,0.079976,0.056046,-0.106963,-0.056748,-0.056214,-0.029065,-0.069301


In [68]:
# Split the V1THEMES field by semicolon, explode the data, and show value counts
themes = df2['V1THEMES'].str.split(';').explode().str.strip()
themes_counts = themes.value_counts().reset_index()
themes_counts

Unnamed: 0,V1THEMES,count
0,,99
1,TAX_FNCACT,87
2,WB_135_TRANSPORT,81
3,SOC_POINTSOFINTEREST,72
4,WB_1803_TRANSPORT_INFRASTRUCTURE,71
...,...,...
684,TAX_FNCACT_PROSECUTORS,1
685,WB_167_PORTS,1
686,TAX_FNCACT_AUTHORITIES,1
687,EPU_POLICY_AUTHORITIES,1


In [70]:
themes = pd.read_excel(r"C:\Users\Steven\Downloads\GDELT-Global_Knowledge_Graph_CategoryList.xlsx")
themes

Unnamed: 0,Type,Name,Date Added,Modified History,Description
0,Theme,ACT_FORCEPOSTURE,2014-10-01,2014-10-01,Actions relating to changes in force posture
1,Theme,ACT_HARMTHREATEN,2014-10-01,2014-10-01,Actions relating to harming or threatening
2,Theme,ACT_MAKESTATEMENT,2014-10-01,2014-10-01,Actions relating to making a statement
3,Theme,ACT_YIELD,2014-10-01,2014-10-01,Actions relating to yielding
4,Count,AFFECT,2013-10-13,2013-10-13,This broad category captures everything from b...
...,...,...,...,...,...
278,Theme,VIOLENT_UNREST,2013-10-13,2013-10-13,"Discussion of violent unrest, from rubber bull..."
279,Theme,WATER_SECURITY,2013-10-13,2013-10-13,"Drought, water access, lack of rain, clean wat..."
280,Theme,WHISTLEBLOWER,2013-10-13,2013-10-13,"Whistleblowers, document leaks, etc"
281,Theme,WMD,2013-10-13,2013-10-13,"Weapons of mass distruction, from nuclear to b..."
