In [2]:
import pandas as pd
import time, datetime
import multiprocessing as mp

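- Function that builds the list of GDELT GKG file URLs, one per 15-minute step between a start and an end timestamp (both in `YYYYMMDDHHMMSS` form)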

In [3]:
def get_file_list(start, end):
    """Build the GDELT v2 GKG download URLs between two timestamps.

    One URL is produced per 15-minute step, from ``start`` up to and
    including ``end``.

    Parameters
    ----------
    start, end : str or int
        Timestamps in ``YYYYMMDDHHMMSS`` form, e.g. ``"20210101000000"``.

    Returns
    -------
    list of str
        URLs in chronological order. Empty when ``start`` is later than
        ``end``.

    Raises
    ------
    ValueError
        If either timestamp is not numeric or does not match
        ``YYYYMMDDHHMMSS``.

    Notes
    -----
    File names on the feed fall on 15-minute boundaries (``...000000``,
    ``...001500``, ...). A ``start`` that is off-boundary is therefore rounded
    up to the next boundary; otherwise every generated URL would be
    misaligned. Aligned inputs give exactly the same list as before.
    """
    url_prefix = "http://data.gdeltproject.org/gdeltv2/"  # first part of gdelt data link
    url_suffix = ".gkg.csv.zip"  # file type
    stamp_format = "%Y%m%d%H%M%S"
    step = "15min"

    # int() keeps the original contract (purely numeric stamps, str or int);
    # str() then hands pandas an unambiguous string to parse.
    first = pd.to_datetime(str(int(start)), format=stamp_format).ceil(step)
    last = pd.to_datetime(str(int(end)), format=stamp_format)

    # date_range includes both endpoints and is empty when first > last,
    # matching the original `while date_time <= current_time` loop.
    return [
        url_prefix + stamp.strftime(stamp_format) + url_suffix
        for stamp in pd.date_range(first, last, freq=step)
    ]

- Function that reads every file in the list returned by `get_file_list` and combines them into a single DataFrame

In [4]:
def load_all_files(file_list):
    """Read every tab-separated file in ``file_list`` into one DataFrame.

    Files are read one after another (no parallelism) and stacked row-wise
    in the order given.

    Parameters
    ----------
    file_list : iterable of str
        Paths or URLs accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All rows from all files with a fresh 0..n-1 index. Columns are
        integer-labelled because the files are read with ``header=None``.
        An empty DataFrame is returned when ``file_list`` is empty.
    """
    # Materialise once so generators are supported and emptiness can be checked.
    sources = list(file_list)

    # pd.concat raises ValueError on an empty sequence; an empty frame is a
    # friendlier result, e.g. when get_file_list yields no URLs.
    if not sources:
        return pd.DataFrame()

    # Malformed lines are reported as warnings rather than aborting the load.
    # NOTE(review): 'unicode_escape' also interprets backslash sequences found
    # in the data; kept unchanged here -- confirm this is intended.
    frames = [
        pd.read_csv(filename, sep="\t", header=None, on_bad_lines="warn", encoding='unicode_escape')
        for filename in sources
    ]

    return pd.concat(frames, axis=0, ignore_index=True)

- produce a simple list of 2 files

In [5]:
# Two consecutive 15-minute slots on 2021-01-01 (00:00:00 and 00:15:00).
all_files = get_file_list(start="20210101000000", end="20210101001500")
all_files

['http://data.gdeltproject.org/gdeltv2/20210101000000.gkg.csv.zip',
 'http://data.gdeltproject.org/gdeltv2/20210101001500.gkg.csv.zip']

- load data and produce data frame without parallel processing

In [6]:
# Time the sequential (non-parallel) load of every URL in all_files.
tic = time.perf_counter()
# Read the files and combine them into one DataFrame.
df1 = load_all_files(all_files)
# perf_counter() returns seconds; the difference is divided by 60 to report minutes.
toc = time.perf_counter()
print(f"Downloaded and appended in {(toc - tic)/60:0.2f} minutes")

Downloaded and appended in 0.02 minutes


In [7]:
# Show the combined frame; the notebook display elides the middle rows and columns.
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,20210101000000-0,2.021010e+13,1,10news.com,https://www.10news.com/news/national/coronavir...,,,TAX_ETHNICITY;TAX_ETHNICITY_CANADIAN;GENERAL_G...,"GENERAL_HEALTH,307;MEDICAL,307;GENERAL_GOVERNM...","1#Canada#CA#CA#60#-96#CA;4#Toronto, Ontario, C...",...,"wc:191,c12.1:10,c12.10:20,c12.12:6,c12.13:3,c1...",https://ewscripps.brightspotcdn.com/80/6b/c73c...,,,https://youtube.com/user/10newsvideos;,,"Affairs Minister Dominic LeBlanc,208;Bill Blai...",,,<PAGE_LINKS>https://apnews.com/article/dominic...
1,20210101000000-1,2.021010e+13,1,ketchikandailynews.com,https://www.ketchikandailynews.com/news/local/...,"KILL#2#new virusrelated#2#Alaska, United State...","KILL#2#new virusrelated#2#Alaska, United State...",CRISISLEX_CRISISLEXREC;CRISISLEX_O02_RESPONSEA...,"KILL,2569;KILL,2595;KILL,2809;KILL,2888;KILL,4...","3#Eagle River, Alaska, United States#US#USAK#6...",...,"wc:775,c12.1:26,c12.10:32,c12.12:6,c12.13:16,c...",https://bloximages.newyork1.vip.townnews.com/k...,,,,,"Emergency Operations,124;Alaska Department,175...","7,cases on Wednesday,54;6,cases involved Ketch...",,<PAGE_PRECISEPUBTIMESTAMP>20201231185400</PAGE...
2,20210101000000-2,2.021010e+13,1,grandforksherald.com,https://www.grandforksherald.com/news/accident...,,,TAX_FNCACT;TAX_FNCACT_CHILDREN;TAX_FNCACT_MAN;...,"TAX_FNCACT_DEPUTY,596;TAX_FNCACT_SPOKESPERSON,...","2#Minnesota, United States#US#USMN#45.7326#-93...",...,"wc:262,c12.1:15,c12.10:17,c12.12:8,c12.13:4,c1...",https://www.duluthnewstribune.com/incoming/474...,,,,,"Hill City,539;Aitkin County Sheriff,597;Hill C...","9,children,85;700,Airport Road,390;3,dies,709;...",,<PAGE_LINKS>https://www.duluthnewstribune.com/...
3,20210101000000-3,2.021010e+13,1,montrealgazette.com,https://montrealgazette.com/news/local-news/pr...,,,GENERAL_HEALTH;MEDICAL;TAX_FNCACT;TAX_FNCACT_M...,"GENERAL_HEALTH,24;GENERAL_HEALTH,484;GENERAL_H...","4#Quebec, Quebec, Canada#CA#CA10#47.5#-72#-571850",...,"wc:196,c12.1:9,c12.10:17,c12.12:3,c12.13:6,c12...",https://smartcdn.prod.postmedia.digital/montre...,,,https://youtube.com/user/themontrealgazette;ht...,,"Minister Christian Dub,47","2,doses,266;2,shots have been administered,311...",,<PAGE_LINKS>https://montrealgazette.com/news/l...
4,20210101000000-4,2.021010e+13,1,energeticcity.ca,https://energeticcity.ca/2020/12/31/elderly-co...,,,AFFECT;POVERTY;CRISISLEX_C05_NEED_OF_SHELTERS;...,"TAX_WORLDMAMMALS_CAT,578;WB_1609_FOOD_AND_IN_K...",,...,"wc:99,c12.1:3,c12.10:4,c12.12:1,c12.13:1,c12.1...",https://media.socastsrm.com/wordpress/wp-conte...,,,https://youtube.com/channel/UCj6EKBe990fanr4MS...,,"Red Creek Road,74;Annette Alexander,191",,,<PAGE_AUTHORS>Scott Brooks</PAGE_AUTHORS><PAGE...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2644,20210101001500-1282,2.021010e+13,1,kfsk.org,https://www.kfsk.org/2020/12/31/petersburg-bor...,,,,,,...,"wc:30,c16.1:1,c16.100:2,c16.106:2,c16.109:2,c1...",,,,https://youtube.com/watch?v=AtxyJpaFQN4;,,"Regular Borough Assembly,25;Assembly Chambers,64","6000000,respectively,107;",,<PAGE_LINKS>https://www.kfsk.org/borough-assem...
2645,20210101001500-1283,2.021010e+13,1,washingtontimes.com,https://www.washingtontimes.com/news/2020/dec/...,,,,,"2#New York, United States#US#USNY#42.1497#-74....",...,"wc:916,c1.1:1,c1.2:1,c12.1:58,c12.10:105,c12.1...",,,,,,"Dallas Cowboys,99;New York Giants,391;New York...","3,teams,793;10,takeaways,1052;12,games,1098;3,...",,<PAGE_LINKS>https://www.washingtontimes.com/co...
2646,20210101001500-1284,2.021010e+13,1,jtv.com,https://www.jtv.com/product/multi-gemstone-rho...,,,UNGP_FORESTS_RIVERS_OCEANS;MEDIA_MSM;SLFID_NAT...,"MEDIA_MSM,63;UNGP_FORESTS_RIVERS_OCEANS,7;UNGP...","1#China#CH#CH#35#105#CH;4#Manchuria, Nei Mongo...",...,"wc:175,c1.1:5,c1.4:1,c12.1:23,c12.10:17,c12.12...",https://images.jtv.com/jewelry/JTV-AEH119-1-me...,,,https://youtube.com/channel/UC9depPiZijkug50cD...,,"Jewelry Television,66;Arctic Ocean,124;Glacier...",,,<PAGE_TITLE>Multi-gemstone rhodium over silver...
2647,20210101001500-1285,2.021010e+13,1,bakersfield.com,https://www.bakersfield.com/columnists/herb-be...,,,TAX_ETHNICITY;TAX_ETHNICITY_GREEK;TAX_WORLDLAN...,"TAX_FNCACT_ENGINEER,1836;EPU_ECONOMY_HISTORIC,...",1#Greece#GR#GR#39#22#GR;1#Morocco#MO#MO#32#-5#...,...,"wc:504,c12.1:36,c12.10:51,c12.12:15,c12.13:22,...",https://bloximages.newyork1.vip.townnews.com/b...,,,,1136|27||Would you like to sit down?,"Eastern European,249;Cafe Smitten,304;Culver C...","2,couples,141;2,couples looked lost,363;100000...",,<PAGE_AUTHORS>HERB BENHAM hbenham@bakersfield....
