<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1">Load Data</a></span></li><li><span><a href="#Clean-data" data-toc-modified-id="Clean-data-2">Clean data</a></span></li><li><span><a href="#Test-Run" data-toc-modified-id="Test-Run-3">Test Run</a></span></li></ul></div>

## Load Data

In [1]:
import datetime
import multiprocessing as mp
import os
import time
from urllib.error import HTTPError

import pandas as pd

def get_file_list(start, end):
    """Build the GDELT v2 GKG download URLs for a time window.

    Parameters
    ----------
    start, end : str or int
        Timestamps in ``YYYYMMDDHHMMSS`` form (anything ``int()`` accepts).

    Returns
    -------
    list of str
        One URL per 15-minute step, beginning at ``start`` and stopping
        before ``end`` (``end`` itself is excluded).
    """
    stamp_format = "%Y%m%d%H%M%S"
    prefix = "http://data.gdeltproject.org/gdeltv2/"  # common head of every link
    suffix = ".gkg.csv.zip"  # file type
    step = datetime.timedelta(minutes=15)  # spacing between consecutive files

    stop = pd.to_datetime(int(end), format=stamp_format)
    cursor = pd.to_datetime(int(start), format=stamp_format)

    urls = []
    while cursor < stop:
        urls.append(f"{prefix}{cursor.strftime(stamp_format)}{suffix}")
        cursor = cursor + step
    return urls
    
def _read_gkg_file(filename, usecols, encodings=("ISO-8859-1", "unicode_escape", None)):
    """Read one tab-separated, header-less file, trying each encoding in order.

    Returns the first DataFrame that decodes; if every encoding raises
    ``UnicodeDecodeError`` the last such error is re-raised. ``None`` means
    pandas' default encoding.
    """
    # NOTE(review): ISO-8859-1 maps every byte value, so the later fallbacks
    # are unlikely to be reached in practice; they are kept to preserve the
    # original intent of the retry chain.
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(
                filename,
                sep="\t",
                header=None,
                on_bad_lines="warn",
                encoding=encoding,
                usecols=usecols,
            )
        except UnicodeDecodeError as err:
            last_error = err
    raise last_error


def load_all_files(file_list):
    """Load columns 1, 3 and 4 of every file in ``file_list`` into one frame.

    Parameters
    ----------
    file_list : iterable of str
        Paths or URLs readable by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (fresh 0..n-1 index) of all files that could be
        read. Files that raise ``HTTPError`` are reported and skipped. When
        nothing could be loaded an empty frame with columns ``[1, 3, 4]`` is
        returned instead of letting ``pd.concat`` fail on an empty list.

    Raises
    ------
    UnicodeDecodeError
        If a file cannot be decoded with any of the fallback encodings.
    """
    good_cols = [1, 3, 4]  # the only columns the cleaning step uses
    list_df = []

    for filename in file_list:
        print(filename)
        try:
            list_df.append(_read_gkg_file(filename, good_cols))
        except HTTPError as err:
            # Some files in the window are not available; skip, but say so.
            print(f"Skipping {filename}: {err}")
            continue

    if not list_df:
        return pd.DataFrame(columns=good_cols)

    # combine list of dataframes
    return pd.concat(list_df, axis=0, ignore_index=True)

## Clean data

In [2]:
def clean_data(raw_data, output_dir='../00-data/gdelt_data/2021/',
               website_list=('reuters.com', 'yahoo.com', 'marketwatch.com', 'prnewswire.com')):
    """Filter raw GKG rows to selected websites and write one CSV per month.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame with integer column labels 1 (``YYYYMMDDHHMMSS`` timestamp),
        3 (website) and 4 (url), as produced by ``load_all_files``.
    output_dir : str, optional
        Directory the CSV files are written to; it must already exist.
        NOTE(review): the default path ends in ``2021`` regardless of the
        year found in the data -- confirm that is intended.
    website_list : iterable of str, optional
        Websites to keep; rows from any other website are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with columns ``year, month, day, website, url``.
        ``raw_data`` itself is not modified.

    Notes
    -----
    A file named ``gdelt_<mon><year>.csv`` (e.g. ``gdelt_feb2022.csv``) is
    written for every (year, month) present after filtering. If no rows
    remain, nothing is written.
    """
    # .copy() so the column work below acts on an independent frame rather
    # than a slice of raw_data (avoids SettingWithCopyWarning).
    df = raw_data[raw_data[1].notna()].copy()  # remove NA rows

    # Timestamps may arrive as floats (NaNs force a float column); go through
    # float -> int64 -> str so "20220201000000.0" becomes "20220201000000".
    stamps = df[1].astype(float).astype("int64").astype(str)
    df.insert(0, 'year', stamps.str[0:4].astype(int))
    df.insert(1, 'month', stamps.str[4:6].astype(int))
    df.insert(2, 'day', stamps.str[6:8].astype(int))

    df = df.rename(columns={3: "website", 4: 'url'})  # rename columns
    df = df.drop(columns=[1])  # drop old date time stamp
    df = df[df["website"].isin(list(website_list))].reset_index(drop=True)

    if df.empty:
        print("No rows matched the website list; nothing written.")
        return df

    # One output file per calendar month, so input spanning several months
    # (or years) no longer fails on a single-value lookup.
    for (year_number, month_number), month_df in df.groupby(['year', 'month']):
        month_name = datetime.datetime.strptime(str(month_number), "%m").strftime("%b").lower()
        file_name = 'gdelt_' + month_name + str(year_number) + '.csv'
        month_df.reset_index(drop=True).to_csv(os.path.join(output_dir, file_name))

    return df

## Test Run

In [4]:
# Smoke test on a 30-minute window (two 15-minute files): time the download
# step and the cleaning step separately.
tic = time.perf_counter()
all_files = get_file_list("20220201000000", "20220201003000")
raw_df = load_all_files(all_files)  # no placeholder list first: raw_df is always a DataFrame
toc = time.perf_counter()
print(f"Downloaded and appended in {(toc - tic)/60:0.2f} minutes")

tic = time.perf_counter()
clean_data(raw_df)
toc = time.perf_counter()
print(f"Cleaned in {(toc - tic)/60:0.2f} minutes")

http://data.gdeltproject.org/gdeltv2/20220201000000.gkg.csv.zip
http://data.gdeltproject.org/gdeltv2/20220201001500.gkg.csv.zip
Downloaded and appended in 0.02 minutes
Cleaned in 0.00 minutes
