In [2]:
import pandas as pd
import numpy as np
import os
import glob
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil import parser
import concurrent.futures
import time
import csv
import re
import os
# Column names written as the first row of every merged CSV, in file order.
# The groups below are split by naming pattern only; what each group means in
# the upstream data is not shown in this notebook — confirm against the
# extraction code before relying on the labels.
headers = (
    # identifier / provenance columns
    ['gkg_id', 'date', 'source', 'source_name', 'doc_id']
    # entity-list columns
    + ['themes', 'locations', 'persons', 'orgs']
    # tone-related numeric columns
    + ['tone', 'pos', 'neg', 'polarity', 'ard', 'srd']
    + ['wc']
    # c3.*
    + ['lexicode_neg', 'lexicode_pos']
    # upper-case topic columns
    + [
        'MACROECONOMICS', 'ENERGY', 'FISHERIES', 'TRANSPORTATION',
        'CRIME', 'SOCIAL_WELFARE', 'HOUSING', 'FINANCE',
        'DEFENCE', 'SSTC', 'FOREIGN_TRADE', 'CIVIL_RIGHTS',
        'INTL_AFFAIRS', 'GOVERNMENT_OPS', 'LAND-WATER-MANAGEMENT', 'CULTURE',
        'PROV_LOCAL', 'INTERGOVERNMENTAL', 'CONSTITUTIONAL_NATL_UNITY', 'ABORIGINAL',
        'RELIGION', 'HEALTHCARE', 'AGRICULTURE', 'FORESTRY',
        'LABOUR', 'IMMIGRATION', 'EDUCATION', 'ENVIRONMENT',
    ]
    # finstab_* / finsent_* columns
    + ['finstab_pos', 'finstab_neg', 'finstab_neutral']
    + ['finsent_neg', 'finsent_pos', 'finsent_unc']
    # opin_* / sent_* columns
    + ['opin_neg', 'opin_pos']
    + ['sent_pos', 'sent_neg', 'sent_pol']
)

In [3]:
def merge_csv(f, y):
    """Merge one dataset's per-period CSV files into a single CSV with a header row.

    Every file matching ``../gdelt-{f}/{y}*.csv`` is read with the ``csv``
    module and its rows are appended, unchanged, to
    ``../gdelt_csv/gdelt_{f}.csv``. The first row of the output is the
    module-level ``headers`` list. Pure-Python ``csv`` is used instead of
    pandas because the pandas-based merge was found to be slower.

    Rows are copied verbatim, so this assumes the input files carry no header
    row of their own — TODO confirm against the files on disk.

    Parameters
    ----------
    f : str
        Dataset code used in both the input directory name and the output
        file name (the notebook calls this with 'nz', 'au', 'uk', 'us').
    y : int or str
        File-name prefix selecting which input files are merged (the
        notebook passes 202).

    Returns
    -------
    None
        A one-line summary (file count and elapsed seconds) is printed.
    """
    start = time.time()

    out_path = f'../gdelt_csv/gdelt_{f}.csv'
    out_name = os.path.basename(out_path)

    # glob returns paths in arbitrary order; sort so the merged row order is
    # reproducible between runs. The output file is excluded by base name
    # (comparing a full glob path with a bare file name never matches).
    files = sorted(
        x for x in glob.glob(f"../gdelt-{f}/{y}*.csv")
        if os.path.basename(x) != out_name
    )

    # A fresh checkout may not have the output directory yet.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(out_path, 'w', newline="") as fout:
        wout = csv.writer(fout, delimiter=',')
        # Write the header up front so that an empty match still yields a
        # parseable, header-only CSV instead of a zero-byte file.
        wout.writerow(headers)
        for file in files:
            # newline="" lets the csv module handle line endings itself, so
            # newlines embedded in quoted fields survive the round trip.
            with open(file, newline="") as fin:
                wout.writerows(csv.reader(fin, delimiter=','))

    end = time.time()
    csvv = len(files)
    print(f"Finished processing {csvv} csv's for {f} which took {round(end-start,2)} seconds")

In [17]:
%%time

# Build the merged CSV for each dataset code, taking the input files whose
# names start with 202.
for dataset_code in ('nz', 'au', 'uk', 'us'):
    merge_csv(dataset_code, 202)

Finished processing 41943 for nz which took 84.49 seconds
Finished processing 32521 for au which took 160.44 seconds
Finished processing 21458 for uk which took 168.15 seconds
Finished processing 9924 for us which took 145.0 seconds
CPU times: user 6min 47s, sys: 22.8 s, total: 7min 10s
Wall time: 9min 18s


In [4]:
# Subset of the merged-CSV columns that read_files() loads.
col = ['date', 'source', 'source_name', 
        'themes', 'locations', 'orgs', 
        'tone', 'pos', 'neg', 'polarity', 'wc',
]

def read_files(f):
    """Load one merged CSV into a memory-trimmed DataFrame.

    Reads the columns listed in ``col`` from ``../gdelt_csv/gdelt_{f}.csv`` in
    chunks of 1,000,000 rows, concatenates them, and then:

    * parses ``date`` (``YYYYMMDDHHMMSS`` text) into datetimes,
    * adds ``month`` as an ordered categorical with categories 'Jan'..'Dec',
    * downcasts numeric columns to the smallest dtype of their own kind
      (integers stay integers, floats stay floats),
    * converts the remaining text columns to ``category``.

    Parameters
    ----------
    f : str
        Dataset code that completes the file name (the notebook calls this
        with 'nz', 'au', 'uk', 'us').

    Returns
    -------
    pandas.DataFrame
        The loaded frame; the elapsed time is printed as a side effect.
    """
    start = time.time()

    # Read `date` as text and convert it in one vectorised call below. The
    # per-row `date_parser` callback is deprecated in pandas 2.0 and slow.
    chunk = pd.read_csv(
        f"../gdelt_csv/gdelt_{f}.csv",
        usecols=col,
        dtype={'date': str},
        chunksize=1000000,
    )
    df = pd.concat(chunk)
    del chunk

    df['date'] = pd.to_datetime(df['date'], format="%Y%m%d%H%M%S")

    # Derive the label from the month number rather than strftime('%b'),
    # whose output depends on the process locale while `cats` is English.
    cats = ['Jan', 'Feb', 'Mar', 'Apr','May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month_lookup = dict(enumerate(cats, start=1))
    df['month'] = pd.Categorical(
        df['date'].dt.month.map(month_lookup), ordered=True, categories=cats
    )

    # Downcast each numeric column within its own kind; forcing
    # downcast='float' on everything turned integer columns into floats.
    for name in df.select_dtypes('number').columns:
        kind = 'integer' if pd.api.types.is_integer_dtype(df[name].dtype) else 'float'
        df[name] = pd.to_numeric(df[name], downcast=kind)

    # Assign column by column: `df.loc[:, mask] = ...` writes values in place
    # on pandas >= 2.0 and leaves the dtype as object, silently dropping the
    # category conversion. StringDtype covers pandas builds that read text
    # as a dedicated string dtype instead of object.
    text_cols = [
        name for name, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    for name in text_cols:
        df[name] = df[name].astype('category')

    end = time.time()
    print(f"Read csv with chunks for {f}: ",round(end-start,2),"sec")

    return df

In [5]:
%%time
# Load each merged CSV into its own DataFrame (one per dataset code).
df_nz = read_files(f='nz')
df_au = read_files(f='au')
df_uk = read_files(f='uk')
df_us = read_files(f='us')


Read csv with chunks for nz:  0.01 sec
Read csv with chunks for au:  0.01 sec
Read csv with chunks for uk:  0.0 sec
Read csv with chunks for us:  0.0 sec
CPU times: user 4min 3s, sys: 14.5 s, total: 4min 17s
Wall time: 4min 48s


In [5]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after each report.
for frame in (df_nz, df_au, df_uk, df_us):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901499 entries, 0 to 901498
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   gkg_id        901499 non-null  object        
 1   date          901499 non-null  datetime64[ns]
 2   source        901499 non-null  int64         
 3   source_name   901491 non-null  object        
 4   doc_id        901499 non-null  object        
 5   themes        775886 non-null  object        
 6   locations     901499 non-null  object        
 7   persons       786844 non-null  object        
 8   orgs          767003 non-null  object        
 9   tone          901498 non-null  float64       
 10  pos           901498 non-null  float64       
 11  neg           901498 non-null  float64       
 12  polarity      901498 non-null  float64       
 13  ard           901498 non-null  float64       
 14  srd           901498 non-null  float64       
 15  wc            901

In [1]:
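# Limited RAM: the frames could not all be saved in one go, so each DataFrame is written to its own Feather file in a separate cell below.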

In [None]:
# Make sure the output directory exists (a fresh checkout may not have it)
# before writing the Feather file.
os.makedirs('../gdelt_feather', exist_ok=True)
df_nz.to_feather('../gdelt_feather/df_nz.ft')


In [None]:
# Make sure the output directory exists (a fresh checkout may not have it)
# before writing the Feather file.
os.makedirs('../gdelt_feather', exist_ok=True)
df_au.to_feather('../gdelt_feather/df_au.ft')


In [None]:
# Make sure the output directory exists (a fresh checkout may not have it)
# before writing the Feather file.
os.makedirs('../gdelt_feather', exist_ok=True)
df_uk.to_feather('../gdelt_feather/df_uk.ft')


In [None]:
# Make sure the output directory exists (a fresh checkout may not have it)
# before writing the Feather file.
os.makedirs('../gdelt_feather', exist_ok=True)
df_us.to_feather('../gdelt_feather/df_us.ft')