In [None]:
# Libraries 

import datetime
import json
import os
import pickle
import subprocess

import pandas as pd
from pymongo import MongoClient
from tqdm.notebook import tqdm

# Register tqdm's progress_apply on pandas objects (used by clean_events)
tqdm.pandas()

# Starting MongoDB
# The sudo password is taken from the MONGOD_SUDO_PASSWORD environment variable
# so it does not have to be saved inside the notebook; when the variable is
# unset it falls back to '' exactly like the previous hard-coded placeholder.
password = os.environ.get('MONGOD_SUDO_PASSWORD', '') # YOUR SYSTEM PASSWORD
mongod_restart_command = "sudo -S systemctl restart mongod"
# `sudo -S` reads the password from stdin. Sending it through `input=` instead
# of interpolating it into an `echo ... |` pipeline keeps it off the shell
# command line, so passwords containing spaces, quotes, `$`, `;` etc. neither
# break the command nor get interpreted by the shell.
subprocess.run(mongod_restart_command, shell=True, input=password + '\n', text=True).returncode

In [17]:
# Functions 

def _translate_codes(codes, lookup, keep_unknown=True):
    """Translate every value of the Series ``codes`` through the dict ``lookup``.

    Values that are not keys of ``lookup`` are kept unchanged when
    ``keep_unknown`` is true and replaced by ``None`` otherwise.
    """
    if keep_unknown:
        return codes.progress_apply(lambda code: lookup.get(code, code))
    return codes.progress_apply(lambda code: lookup.get(code))


def clean_events(df):
    """Return a cleaned copy of a GDELT events frame with codes made readable.

    The function relies on the module-level lookup tables ``CAMEO_dict``,
    ``quad_class``, ``actor_type``, ``religions``, ``actor_codes``,
    ``ethnic_codes`` and ``actor_type_codes``, on ``StrToDate``, and on
    ``tqdm.pandas()`` having been called (for ``progress_apply``).

    Steps performed:
      * ``GLOBALEVENTID`` is converted to ``str``.
      * ``DATEADDED`` / ``SQLDATE`` are parsed into ``ScrapeDate`` /
        ``EventDate``; the raw date columns are dropped.
      * ``GoldsteinScale`` and ``AvgTone`` are cast to ``float``.
      * ``EventName`` / ``EventRootName`` are looked up in ``CAMEO_dict``
        (text before the first ``-``); the raw event code columns are dropped.
      * ``QuadClass`` and the three ``*Geo_Type`` columns are replaced by
        their labels.
      * Columns whose name contains ``ADM`` or ``FeatureID`` are dropped.
      * Religion, actor, ethnic and actor-type codes are translated.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw events table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    KeyError
        If an event code, quad class or geo type is missing from its lookup
        table, or an expected column is absent.
    """
    # Work on a copy: the assignments below would otherwise write the new
    # columns into the caller's frame before the first drop() makes a copy.
    df = df.copy()

    df['GLOBALEVENTID'] = df['GLOBALEVENTID'].progress_apply(str)
    df['ScrapeDate'] = df['DATEADDED'].progress_apply(StrToDate)
    df['EventDate'] = df['SQLDATE'].progress_apply(StrToDate)
    df = df.drop(['SQLDATE', 'MonthYear', 'Year', 'FractionDate', 'DATEADDED'], axis=1)

    df['GoldsteinScale'] = df['GoldsteinScale'].astype(float)
    df['AvgTone'] = df['AvgTone'].astype(float)

    # Only the part of the CAMEO description before the first "-" is kept.
    # (An "EventBaseName" column used to be computed here and dropped right
    # away; that dead work has been removed, the output columns are the same.)
    df['EventName'] = df['EventCode'].progress_apply(lambda code: CAMEO_dict[code].split("-")[0])
    df['EventRootName'] = df['EventRootCode'].progress_apply(lambda code: CAMEO_dict[code].split("-")[0])
    df['QuadClass'] = df['QuadClass'].progress_apply(lambda code: quad_class[code])
    df = df.drop(['EventBaseCode', 'EventCode', 'EventRootCode'], axis=1)
    df = df.drop([name for name in df.columns if 'ADM' in name or 'FeatureID' in name], axis=1)

    for geo_column in ('Actor1Geo_Type', 'Actor2Geo_Type', 'ActionGeo_Type'):
        df[geo_column] = df[geo_column].progress_apply(lambda code: actor_type[code])

    # Combined religion code = primary code followed by the (optional) secondary one
    df['Actor1ReligionCode'] = df['Actor1Religion1Code'] + df['Actor1Religion2Code'].fillna('')
    df['Actor2ReligionCode'] = df['Actor2Religion1Code'] + df['Actor2Religion2Code'].fillna('')
    df = df.drop(['Actor1Religion2Code', 'Actor2Religion2Code'], axis=1)

    # Each religion column is translated from ITS OWN codes (Actor2Religion1Code
    # was previously derived from the already-translated Actor1 column).
    for religion_column in ('Actor1ReligionCode', 'Actor2ReligionCode',
                            'Actor1Religion1Code', 'Actor2Religion1Code'):
        df[religion_column] = _translate_codes(df[religion_column], religions, keep_unknown=False)

    for actor_column in ('Actor1Code', 'Actor2Code'):
        df[actor_column] = _translate_codes(df[actor_column], actor_codes)

    for ethnic_column in ('Actor1EthnicCode', 'Actor2EthnicCode'):
        df[ethnic_column] = _translate_codes(df[ethnic_column], ethnic_codes)

    # Each type column is translated from its own codes. The previous version
    # read Actor1Type3Code for all six and assigned a value_counts() result,
    # which is indexed by code label and therefore aligned to NaN on assignment.
    for type_column in ('Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code',
                        'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code'):
        df[type_column] = _translate_codes(df[type_column], actor_type_codes)

    return df

def StrToDate(str_date):
    """Parse a compact numeric date stamp.

    Parameters
    ----------
    str_date : str or int
        Either ``YYYYMMDD`` (8 characters) or ``YYYYMMDDHHMMSS``
        (14 characters). The value is passed through ``str()`` first, so
        integers are accepted.

    Returns
    -------
    datetime.date
        For the 8-character form.
    datetime.datetime
        For the 14-character form.

    Raises
    ------
    ValueError
        If the stamp has any other length (this used to surface as an
        ``UnboundLocalError``), or if its fields are not valid numbers or do
        not form a valid date.
    """
    str_date = str(str_date)

    if len(str_date) == 8:
        year, month, day = int(str_date[0:4]), int(str_date[4:6]), int(str_date[6:8])
        return datetime.datetime(year, month, day).date()

    if len(str_date) == 14:
        year, month, day = int(str_date[0:4]), int(str_date[4:6]), int(str_date[6:8])
        hour, minute, second = int(str_date[8:10]), int(str_date[10:12]), int(str_date[12:14])
        return datetime.datetime(year, month, day, hour, minute, second)

    raise ValueError(
        "Unsupported date stamp %r: expected 8 (YYYYMMDD) or 14 (YYYYMMDDHHMMSS) characters"
        % str_date
    )

def insert_mongoDB(df, localhost, database, collection):
    """Insert every row of ``df`` as one document into a MongoDB collection.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to insert; converted with ``df.to_dict('records')``.
    localhost : str or int
        Despite its name, this value is placed after ``mongodb://localhost:``
        in the connection URI, i.e. it is the port of the local server.
    database : str
        Name of the target database.
    collection : str
        Name of the target collection.

    Returns
    -------
    None
    """
    records = df.to_dict('records')

    # insert_many() rejects an empty document list, so an empty frame is a no-op
    if not records:
        return

    # Making a Connection with MongoClient (str() lets callers pass the port as an int)
    client = MongoClient("mongodb://localhost:" + str(localhost) + "/")
    try:
        # Load to mongoDB
        client[database][collection].insert_many(records)
    finally:
        # Always release the connection pool, even if the insert fails
        client.close()

In [None]:
# GDELT files 

# CAMEO event code -> description
with open('data/GDELT Files/CAMEO.json', 'r') as fp:
    CAMEO_dict = json.load(fp)

# Entries added on top of the JSON file, plus placeholder codes mapped to "unknown"
CAMEO_dict["1213"] = "reject judicial cooperation"
CAMEO_dict["1725"] = "impose administrative sanctions"
CAMEO_dict["---"] = "unknown"
CAMEO_dict["--"] = "unknown"
CAMEO_dict["-"] = "unknown"

# GCAM table (read once; the workbook used to be parsed a second time
# further down with an identical call)
GCAM = pd.read_excel('data/GDELT Files/GCAM.xlsx', index_col = 0)

# LABEL -> CODE as listed in the CAMEO country file, and the reverse mapping
COUNTRY_dict = pd.read_csv("data/GDELT Files/CAMEO.country.txt", sep = '\t').set_index("LABEL").to_dict()['CODE']
COUNTRY_dict_reverse = {v: k for k, v in COUNTRY_dict.items()}

quad_class = {1:'verbal  cooperation', 2:'material  cooperation', 3:'verbal conflict', 4:'material conflict'}

actor_type = {0:None, 1:'COUNTRY', 2:'USSTATE', 3:'USCITY', 4:'WORLDCITY', 5:"WORLDSTATE"}

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# these files if they come from a trusted source.
with open('data/GDELT Files/religions.pickle', 'rb') as handle:
    religions = pickle.load(handle)

with open('data/GDELT Files/actor_codes.pickle', 'rb') as handle:
    actor_codes = pickle.load(handle)

with open('data/GDELT Files/ethnic_codes.pickle', 'rb') as handle:
    ethnic_codes = pickle.load(handle)

with open('data/GDELT Files/actor_type_codes.pickle', 'rb') as handle:
    actor_type_codes = pickle.load(handle)

In [None]:
# GDELT data 

# Read the raw events table from the Feather file in the working directory
gdelt = pd.read_feather('GDELT_DATA.fthr')

In [None]:
# Run the cleaning pass over the raw events
clean_gdelt = clean_events(gdelt)

# Keep only the columns that go into the formatted dataset
filtered_clean_gdelt = clean_gdelt.loc[:, [
    'GLOBALEVENTID',
    'Actor1CountryCode',
    'Actor2CountryCode',
    'IsRootEvent',
    'QuadClass',
    'GoldsteinScale',
    'NumMentions',
    'NumSources',
    'NumArticles',
    'AvgTone',
    'ActionGeo_CountryCode',
    'SOURCEURL',
    'ScrapeDate',
    'EventDate',
    'EventName',
    'EventRootName',
]]

In [19]:
# Write the filtered, cleaned events to a Feather file in the working directory
filtered_clean_gdelt.to_feather('FORMATTED_GDELT_DATA.fthr')