In [2]:
#run if gdeltPyR is not installed yet
pip install gdelt

here


In [1]:
import pandas as pd
import gdelt
import datetime
import time

here


In [2]:
def gd_crawler(start_date, end_date):
    """Download GDELT 2.0 events day by day, filter them, and append them to CSV files.

    For every calendar day from ``start_date`` to ``end_date`` (both inclusive)
    the function queries the GDELT ``events`` table, applies the filters in the
    marked section below, and appends

    * the matching rows to ``'{start_date} - {end_date}.csv'`` and
    * one summary row (date, total entries, matches, match percentage) to
      ``'{start_date} - {end_date} - control.csv'``.

    Progress and errors are printed and appended to ``gd_crawler_log.txt``.
    A day that raises an exception is logged and skipped; the remaining days
    are still processed. Existing output files are appended to, so running the
    same date range twice stores the same days twice.

    Parameters
    ----------
    start_date, end_date : str
        First and last day to download, formatted as ``'%Y %m %d'``
        (for example ``'2022 04 15'``).

    Raises
    ------
    ValueError
        If either date string does not match ``'%Y %m %d'``.
    """
    # Initialize GDELT database connection
    gd2 = gdelt.gdelt(version=2)

    # Convert start and end date strings to datetime objects
    start_datetime = datetime.datetime.strptime(start_date, '%Y %m %d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y %m %d')

    # Number of days to process, including the end date
    num_days = (end_datetime - start_datetime).days + 1

    print(f"Starting download for {num_days} days now...")

    # The output paths do not change between days, so build them once
    results_file_path = f'{start_date} - {end_date}.csv'
    control_file_path = f'{start_date} - {end_date} - control.csv'

    # The context manager closes the log file even when the run is interrupted
    # (e.g. KeyboardInterrupt, which `except Exception` below does not catch).
    with open('gd_crawler_log.txt', 'a') as log_file:
        print("Log-file created!")

        for i in range(num_days):
            # Time each day on its own so a failed day does not inflate the next one
            day_start_time = time.time()

            # Calculate the current date in the loop
            current_date = start_datetime + datetime.timedelta(days=i)
            current_date_str = current_date.strftime('%Y %m %d')

            try:
                # Perform the GDELT search for the current date
                results = gd2.Search([current_date_str], table='events', coverage=True)

                # Count the total number of entries for that day
                entries = len(results)

                # Apply filters here ########################################################################
                results_filtered = results.dropna(subset=['Actor1CountryCode', 'QuadClass'])
                results_filtered = results_filtered[results_filtered['Actor1CountryCode'].isin(['CHN', 'TWN'])]
                results_filtered = results_filtered[results_filtered['QuadClass'].isin([3, 4])]

                #######################################################################################

                # Count the number of matching entries and the percentage;
                # a day without any entries counts as 0 % instead of dividing by zero.
                match = len(results_filtered)
                match_percent = match / entries * 100 if entries else 0.0

                # Create control DataFrame
                control = pd.DataFrame({
                    'Date': [current_date],
                    'Date String': [current_date_str],
                    'Entries': [entries],
                    'Matches': [match],
                    'Match Percentage': [match_percent]
                })

                # Append the summary row instead of re-reading and rewriting the whole file
                _append_to_csv(control, control_file_path)

                # Store SQLDATE as a real date for every day. Re-reading the file with
                # parse_dates and concatenating the raw YYYYMMDD integers used to leave
                # mixed date formats in the final CSV.
                if 'SQLDATE' in results_filtered.columns:
                    results_filtered = results_filtered.assign(
                        SQLDATE=pd.to_datetime(results_filtered['SQLDATE'].astype(str), format='%Y%m%d')
                    )

                # Append the filtered rows of this day to the results file
                _append_to_csv(results_filtered, results_file_path, date_format='%Y-%m-%d %H:%M:%S')

                # Print progress information
                elapsed_time = time.time() - day_start_time
                print(f"Processed date: {current_date_str}, {i+1}/{num_days} days done. Time taken = {elapsed_time:.2f} seconds")
                log_file.write(f"Processed date: {current_date_str}, {i+1}/{num_days} days done. Time taken = {elapsed_time:.2f} seconds\n")

            except Exception as e:
                # Best effort: report the failed day and carry on with the next date
                print(f"Error processing date {current_date_str}: {e}")
                log_file.write(f"Error processing date {current_date_str}: {e}\n")


def _append_to_csv(frame, path, **to_csv_kwargs):
    """Append ``frame`` to the CSV at ``path``, writing the header only if the file is new or empty."""
    import os  # local import: keeps this cell independent of the notebook's import cell

    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode='a', header=write_header, index=False, **to_csv_kwargs)


In [3]:
# Adjust the filter section inside gd_crawler above first, then call it with
# the start and end date given as 'YYYY MM DD' strings:
gd_crawler(start_date='2022 04 15', end_date='2022 04 18')

Starting download for 4 days now...
Log-file created!
Processed date: 2022 04 15, 1/4 days done. Time taken = 12.25 seconds
Processed date: 2022 04 16, 2/4 days done. Time taken = 10.92 seconds
Processed date: 2022 04 17, 3/4 days done. Time taken = 9.94 seconds
Processed date: 2022 04 18, 4/4 days done. Time taken = 10.57 seconds


In [5]:
# Load the filtered events written by gd_crawler and preview the first five rows
example = pd.read_csv(
    "2022 04 15 - 2022 04 18.csv"
)
example.head(n=5)

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_ADM2Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,1039489269,2022-04-15 00:00:00,202204,2022,2022.2877,CHN,CHINA,CHN,,,...,4,"Beijing, Beijing, China",CH,CH22,13001,39.9289,116.388,-1898541,20220415011500,https://menafn.com/1104017927/From-handshakes-...
1,1039489272,2022-04-15 00:00:00,202204,2022,2022.2877,CHN,CHINESE,CHN,,,...,4,"Shenzhen, Guangdong, China",CH,CH30,13036,22.2,111.117,-1925267,20220415011500,https://www.caixinglobal.com/2022-04-15/cx-dai...
2,1039489273,2022-04-15 00:00:00,202204,2022,2022.2877,CHN,CHINA,CHN,,,...,4,"Ningde, Xizang, China",CH,CH14,13300,31.1,91.0,11275278,20220415011500,https://www.register-herald.com/region/anti-vi...
3,1039489277,2022-04-15 00:00:00,202204,2022,2022.2877,CHN,CHINA,CHN,,,...,4,"Beijing, Beijing, China",CH,CH22,13001,39.9289,116.388,-1898541,20220415011500,https://menafn.com/1104017927/From-handshakes-...
4,1039480582,2022-04-15 00:00:00,202204,2022,2022.2877,CHN,CHINA,CHN,,,...,4,"Sydney, New South Wales, Australia",AS,AS02,154637,-33.8833,151.217,-1603135,20220415001500,https://www.9news.com.au/national/large-house-...


In [6]:
# Load the per-day control summary written by gd_crawler and preview the first five rows
metrics = pd.read_csv(
    "2022 04 15 - 2022 04 18 - control.csv"
)
metrics.head(n=5)

Unnamed: 0,Date,Date String,Entries,Matches,Match Percentage
0,2022-04-15,2022 04 15,104033,658,0.632492
1,2022-04-16,2022 04 16,70136,319,0.454831
2,2022-04-17,2022 04 17,63671,200,0.314115
3,2022-04-18,2022 04 18,95848,351,0.366205
