In [None]:
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
from functools import reduce
from time import strptime
import datetime as dt
import seaborn as sns
import pandas as pd
import numpy as np
import calendar
import swifter
import random

## ACLED Data

In [None]:
acled_df = pd.read_csv("data/2013-01-01-2018-12-31.csv")

In [None]:
len(acled_df)

In [None]:
acled_df.head(10)

In [None]:
# Identifier, actor, admin-region and provenance columns that none of the
# visible later cells reference. They are removed from acled_df in place, so
# running this cell a second time raises KeyError (columns already gone).
unused_columns = [
    "data_id", "iso", "event_id_cnty", "event_id_no_cnty",
    "time_precision", "actor1", "assoc_actor_1", "inter1",
    "actor2", "assoc_actor_2", "inter2", "interaction", "region",
    "admin1", "admin2", "admin3", "geo_precision", "source",
    "source_scale", "timestamp",
]
acled_df.drop(columns=unused_columns, inplace=True)

In [None]:
def get_week_day(date_string):
    """Return the weekday name for a date written as "DD Month YYYY".

    Parameters
    ----------
    date_string : str
        Date text matching the strptime format "%d %B %Y",
        e.g. "01 January 2018".

    Returns
    -------
    str
        The matching entry of ``calendar.day_name`` (index 0 is Monday).

    Raises
    ------
    ValueError
        If ``date_string`` does not match the expected format.
    """
    parsed = datetime.strptime(date_string, "%d %B %Y")
    weekday_index = parsed.weekday()  # Monday == 0 ... Sunday == 6
    return calendar.day_name[weekday_index]

In [None]:
def get_month(date_string):
    """Return the second space-separated token of ``date_string``.

    For dates written as "DD Month YYYY" this is the month name. No
    validation is done: a string with fewer than two tokens raises
    IndexError.
    """
    tokens = date_string.split(" ")
    return tokens[1]

In [None]:
acled_df["week_day"] = acled_df["event_date"].swifter.apply(get_week_day)

In [None]:
acled_df["month"] = acled_df["event_date"].swifter.apply(get_month)

In [None]:
acled_df.head(20)

In [None]:
# Persist the trimmed ACLED frame (including the added week_day / month
# columns) to the Output directory.
# NOTE(review): the second call writes the same CSV text under a ".js" file
# name; to_csv does not produce JavaScript or JSON. Confirm whether
# to_json (or a different file name) was intended.
acled_df.to_csv("Output/acled_data.csv")
acled_df.to_csv("Output/acled_data.js")

In [None]:
# Distinct country names in the ACLED extract: print how many there are,
# then display them in sorted order.
country = acled_df["country"].unique()
print(len(country))
country.sort()  # in-place sort of the array returned by unique()
country

In [None]:
acled_df["sub_event_type"].unique()

In [None]:
# Keep only the rows whose event_type is 'Protests', renumbered from 0.
# NOTE(review): the variable is named `violent` but the filter selects
# 'Protests' — confirm which of the two was intended.
violent = acled_df.loc[acled_df["event_type"]=='Protests'].reset_index(drop=True)

In [None]:
nig_violent_df = violent.loc[violent["country"]=="Nigeria"].reset_index(drop=True)
nig_violent_df.head(5)

In [None]:
acled2018_data = acled_df.loc[acled_df["year"]==2018].reset_index(drop=True)

In [None]:
def get_grouped(term):
    """Count 2018 events of one type per country.

    Reads the notebook-level frame ``acled2018_data``; the full
    (country, iso3, event_type) count table is rebuilt on every call.

    Parameters
    ----------
    term : object
        Event type to keep; it is formatted to a string before being
        compared with the ``event_type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``iso3`` and one count column named after
        ``term``, with a fresh 0..n-1 index.
    """
    keys = ["country", "iso3", "event_type"]
    label = f'{term}'

    # One row per (country, iso3, event_type) with its number of events.
    counts = acled2018_data[keys].groupby(keys).size()
    counts = counts.to_frame(name='count').reset_index()

    # Keep the requested event type and name the count column after it.
    selected = counts.loc[counts["event_type"] == label]
    selected = selected.drop("event_type", axis=1)
    selected = selected.rename(columns={"count": label})
    return selected.reset_index(drop=True)

In [None]:
event_types = list(acled2018_data["event_type"].unique())
event_types

In [None]:
# One per-country count table for every event type present in the 2018 data.
list_events = [get_grouped(f'{event}') for event in event_types]


def _join_event_counts(left, right):
    """Inner-join two per-country count tables on (country, iso3)."""
    return pd.merge(left, right, on=["country", "iso3"])


# Fold the tables into one wide frame. Because every join is an inner join,
# only countries that appear in every event-type table are kept.
event_column_names = {
    "iso3": "country_code",
    "Violence against civilians": "violence",
    "Strategic developments": "development",
    "Battles": "battles",
}
merged_df = reduce(_join_event_counts, list_events).rename(columns=event_column_names)
merged_df.head()

In [None]:
# Join the count tables of two event types, chosen by their position in
# event_types. pd.merge joins on the columns the two tables share.
# NOTE(review): positions 0 and 2 depend on the row order of the data; the
# rename keys below suggest they are expected to be
# "Violence against civilians" and "Battles" — confirm.
# NOTE(review): "Violence against civilians" is relabelled
# "Human Trafficking Events" — confirm that label is intended.
a = get_grouped(event_types[0])
third_event_counts = get_grouped(event_types[2])
display_names = {
    "iso3": "country_code",
    "Violence against civilians": "Human Trafficking Events",
    "Battles": "Battles Fought",
}
merged_df2 = pd.merge(a, third_event_counts).rename(columns=display_names)
merged_df2.head()

In [None]:
merged_df.to_csv("Output/acled2018_events_type_data.csv")

## Function to parse UNICEF Data

In [None]:
def drop_and_rename(data,value):
    """Reduce a UNICEF-style table to (country, <value>) rows without gaps.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Geographic Area" and "Observation Value".
    value : object
        New name for the observation column (formatted to a string).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``country`` and ``value``; rows where either is
        missing are dropped. The original row index is kept.
    """
    new_names = {"Geographic Area": "country", "Observation Value": f"{value}"}
    two_columns = data[["Geographic Area", "Observation Value"]]
    complete_rows = two_columns.dropna()
    return complete_rows.rename(columns=new_names)

## Asylum seekers, by country of destination in thousands of persons

In [None]:
asylm_df = pd.read_csv("data/GLOBAL_ASYLM_CNTRY_DEST._T._.csv")

In [None]:
# Asylum seekers by country of destination; observation values are in
# thousands of persons.
asylm_df = drop_and_rename(asylm_df,"Asylum Seekers")

# Counts below one thousand are stored as the censored string "<1". Each one
# is replaced with a random placeholder from random.random() rounded to two
# decimals. The mask does this in one assignment (no row loop, no blanket
# try/except); the draws happen in row order, as before.
# NOTE: `random` is not seeded in this notebook, so the placeholders differ
# from run to run.
censored_asylum = asylm_df["Asylum Seekers"] == "<1"
if censored_asylum.any():
    asylm_df.loc[censored_asylum, "Asylum Seekers"] = [
        round(random.random(), 2) for _ in range(int(censored_asylum.sum()))
    ]

In [None]:
asylm_df.head() 

## Total population by country in thousands of persons

In [None]:
pop_df = pd.read_csv("data/GLOBAL_POP_TOT._2018.csv")
pop_df = drop_and_rename(pop_df,"Population")

In [None]:
pop_df["Population"]=round(pop_df["Population"],2)

In [None]:
pop_df.head()

## Estimated number of annual AIDS-related deaths (adolescents and young people aged 15-24 years) in persons

In [None]:
hiv_df = pd.read_csv("data/GLOBAL_HIV.csv")
hiv_df = hiv_df.loc[hiv_df["Time Period"] == 2018]
hiv_df = drop_and_rename(hiv_df,"Aids Related Death").reset_index(drop=True)

In [None]:
# Small counts are stored as censored strings ("<100", "<200", "<500").
# Each is replaced by a random value from the upper fifth of its range:
# (random() + 4) * scale gives roughly 80-100, 160-200 and 400-500, rounded
# to two decimals. Draws happen in row order.
# NOTE: `random` is not seeded in this notebook, so the placeholders differ
# from run to run.
censored_scale = {"<100": 20, "<200": 40, "<500": 100}
death_col = "Aids Related Death"

is_censored = hiv_df[death_col].isin(list(censored_scale))
if is_censored.any():
    hiv_df.loc[is_censored, death_col] = [
        round((random.random() + 4) * censored_scale[label], 2)
        for label in hiv_df.loc[is_censored, death_col]
    ]

hiv_df.head()

## Share of deaths attributed to smoking, by country (%) — source: https://ourworldindata.org/smoking

In [None]:
# Share of deaths attributed to smoking: keep the 2017 rows and reduce the
# table to (country, "Death From Smoking").
smoking_raw = pd.read_csv("data/share-deaths-smoking.csv")
in_2017 = smoking_raw["Year"] == 2017
smoking_2017 = (
    smoking_raw.loc[in_2017]
    .reset_index(drop=True)
    .rename(columns={"Entity": "country",
                     "Share of deaths from smoking (%)": "Death From Smoking"})
)
smoking_death_df = smoking_2017[["country", "Death From Smoking"]]
smoking_death_df.head()

## Global Life Expectancy in Years

In [None]:
life_exp = pd.read_csv("data/GLOBAL_LIFE_EXP._T._.csv")
life_exp_df = drop_and_rename(life_exp,"Life Expectancy").reset_index(drop=True)

In [None]:
life_exp_df["Life Expectancy"] = round(life_exp_df["Life Expectancy"],2)

In [None]:
life_exp_df.head()

## Merge everything

In [None]:
# Join the event counts with every indicator table on the country name.
# Each step is an inner join, so a country survives only if it is present
# (with an identically spelled name) in all six tables.
new_list_events = [merged_df2,pop_df,asylm_df,hiv_df,life_exp_df,smoking_death_df]
combined = new_list_events[0]
for indicator_df in new_list_events[1:]:
    combined = pd.merge(combined, indicator_df, on="country")
new_merged_df = combined
new_merged_df

In [None]:
new_merged_df.to_csv("Output/merge_dataset.csv", index = False)

In [126]:
merge_dict = new_merged_df.to_dict('list')

In [129]:
import pymongo
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
# Define database and collection
db = client.merged_data
merged_data = db.merged_data

In [130]:
# Replace the collection's contents with a single document that holds the
# merged table in column -> list-of-values form.
# Collection.remove() and Collection.insert() were deprecated in PyMongo 3
# and removed in PyMongo 4; delete_many({}) and insert_one() are the
# supported equivalents. insert_one() has no check_keys option; the column
# names built in this notebook contain no "." or leading "$".
merged_data.delete_many({})
# insert_one() adds an "_id" key to merge_dict in place. Displaying
# inserted_id keeps the cell output (the new ObjectId) as it was.
merged_data.insert_one(merge_dict).inserted_id

  """Entry point for launching an IPython kernel.
  


ObjectId('5e87799911d91805f3725475')

In [133]:
# Read one document back from the collection and strip MongoDB's "_id"
# field so that only the data columns remain.
# NOTE(review): find_one() returns None when the collection is empty, in
# which case the membership test below raises TypeError.
merge_result = merged_data.find_one()
if '_id' in merge_result:
    del merge_result['_id']

In [134]:
merge_result

{'country': ['Afghanistan',
  'Algeria',
  'Angola',
  'Armenia',
  'Bangladesh',
  'Benin',
  'Bosnia and Herzegovina',
  'Burkina Faso',
  'Burundi',
  'Cambodia',
  'Cameroon',
  'Central African Republic',
  'Chad',
  'Egypt',
  'Equatorial Guinea',
  'Ethiopia',
  'Gabon',
  'Ghana',
  'Guinea',
  'Indonesia',
  'Israel',
  'Jordan',
  'Kazakhstan',
  'Kenya',
  'Kyrgyzstan',
  'Lebanon',
  'Liberia',
  'Libya',
  'Madagascar',
  'Malaysia',
  'Mali',
  'Mauritania',
  'Morocco',
  'Mozambique',
  'Myanmar',
  'Nepal',
  'Niger',
  'Nigeria',
  'Pakistan',
  'Philippines',
  'Rwanda',
  'Senegal',
  'Sierra Leone',
  'Somalia',
  'South Africa',
  'South Sudan',
  'Sri Lanka',
  'Sudan',
  'Thailand',
  'Togo',
  'Tunisia',
  'Uganda',
  'Ukraine',
  'Yemen',
  'Zambia',
  'Zimbabwe'],
 'country_code': ['AFG',
  'DZA',
  'AGO',
  'ARM',
  'BGD',
  'BEN',
  'BIH',
  'BFA',
  'BDI',
  'KHM',
  'CMR',
  'CAF',
  'TCD',
  'EGY',
  'GNQ',
  'ETH',
  'GAB',
  'GHA',
  'GIN',
  'IDN',
  

In [135]:
# `json` is imported here because the notebook's only other `import json`
# sits in a later cell; on a fresh kernel (Restart & Run All) this cell would
# otherwise fail with NameError.
import json

from bson import json_util

# Serialise the stored document to a JSON string. json_util.default is the
# fallback encoder for BSON-specific values that plain json cannot handle.
json.dumps(merge_result,default=json_util.default)

'{"country": ["Afghanistan", "Algeria", "Angola", "Armenia", "Bangladesh", "Benin", "Bosnia and Herzegovina", "Burkina Faso", "Burundi", "Cambodia", "Cameroon", "Central African Republic", "Chad", "Egypt", "Equatorial Guinea", "Ethiopia", "Gabon", "Ghana", "Guinea", "Indonesia", "Israel", "Jordan", "Kazakhstan", "Kenya", "Kyrgyzstan", "Lebanon", "Liberia", "Libya", "Madagascar", "Malaysia", "Mali", "Mauritania", "Morocco", "Mozambique", "Myanmar", "Nepal", "Niger", "Nigeria", "Pakistan", "Philippines", "Rwanda", "Senegal", "Sierra Leone", "Somalia", "South Africa", "South Sudan", "Sri Lanka", "Sudan", "Thailand", "Togo", "Tunisia", "Uganda", "Ukraine", "Yemen", "Zambia", "Zimbabwe"], "country_code": ["AFG", "DZA", "AGO", "ARM", "BGD", "BEN", "BIH", "BFA", "BDI", "KHM", "CMR", "CAF", "TCD", "EGY", "GNQ", "ETH", "GAB", "GHA", "GIN", "IDN", "ISR", "JOR", "KAZ", "KEN", "KGZ", "LBN", "LBR", "LBY", "MDG", "MYS", "MLI", "MRT", "MAR", "MOZ", "MMR", "NPL", "NER", "NGA", "PAK", "PHL", "RWA", "SE

## Exploratory Data Analysis

In [None]:
# Convert the two columns that may still hold strings to numeric dtype.
# pd.to_numeric(errors='ignore') is deprecated; the explicit try/except keeps
# its semantics (a column that cannot be fully parsed is left unchanged) and
# reports it instead of passing silently.
for numeric_col in ("Asylum Seekers", "Aids Related Death"):
    try:
        new_merged_df[numeric_col] = pd.to_numeric(new_merged_df[numeric_col])
    except (ValueError, TypeError) as exc:
        print(f"{numeric_col!r} left unconverted: {exc}")

In [None]:
# Pairwise correlations between the indicator columns (identifier columns
# are excluded).
correlations = new_merged_df.drop(["country","country_code"],axis=1).corr()

# plot correlation matrix
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Take labels and tick positions from the correlation matrix itself, so the
# number of ticks always equals the number of labels. The previous
# hard-coded np.arange(0, 9, 1) and labels taken from the unfiltered frame
# could disagree with the matrix that is actually plotted.
names = list(correlations.columns)
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix
scatter_matrix(new_merged_df.drop(["country","country_code"],axis=1),figsize=(15,10))
plt.show()

In [None]:
# Load the poverty table, drop rows containing any missing value, rename the
# unnamed columns, replace cells equal to ".." with the string "NA", and keep
# two columns.
# NOTE(review): the rename map produces 'Pop. Below $1.90 per day', but the
# final selection asks for "Population below $1.90 a day". That only works if
# the CSV already has a column with exactly that header — confirm against
# the file.
# NOTE(review): dropna() runs before ".." is replaced, so rows holding ".."
# are kept and end up with the string "NA" rather than being dropped.
pov_df = pd.read_csv("data/Poverty.csv").dropna().rename(columns= {'Unnamed: 0':'country', 
                                    'Unnamed: 2':'Pop. Below $1.90 per day',
                                    'Unnamed: 3':'Pop. Below $3.20 per day',
                                    'Unnamed: 4':'Pop. Below $5.50 per day'
                                   }).replace("..","NA")[["country","Population below $1.90 a day"]]
pov_df.head()

In [None]:
import json
with open("Output/normal_cases.json", 'r') as fp:
    normal_data = json.load(fp)
with open("Output/special_cases.json", 'r') as fp:
    special_data = json.load(fp)   

In [None]:
def _tip_sections(entry):
    """Return the list of section dicts stored under an entry's first key."""
    return list(entry.values())[0]


# Each element of normal_data is read as {COUNTRY: [section_dict, ...]}, with
# the sections addressed by position and then by an upper-case key.
country = [list(entry.keys())[0].title() for entry in normal_data]
intro = [_tip_sections(entry)[0]['INTRO'] for entry in normal_data]
tier = [_tip_sections(entry)[1]['TIER'] for entry in normal_data]

# (position, key) of the five narrative sections, in the order of the target
# lists below.
narrative_sections = [
    (2, 'RECOMMENDATIONS'),
    (3, 'PROSECUTIONS'),
    (4, 'PROTECTION'),
    (5, 'PREVENTION'),
    (6, 'TRAFFICKING PROFILE'),
]
recommendation = []
prosecution = []
protection = []
prevention = []
traffick_profile = []
narrative_targets = (recommendation, prosecution, protection, prevention, traffick_profile)

for entry in normal_data:
    # Extract all five sections first, then append. Appending one by one
    # inside the try block would add a real value and then "NA" to the same
    # list whenever a later section is missing, leaving the lists with
    # different lengths.
    try:
        sections = _tip_sections(entry)
        extracted = [sections[position][key] for position, key in narrative_sections]
    except (IndexError, KeyError, TypeError):
        # An entry without the full set of sections is recorded as all "NA".
        extracted = ["NA"] * len(narrative_sections)
    for target, text in zip(narrative_targets, extracted):
        target.append(text)

# Entries of special_data use a shorter layout: intro, tier, government
# efforts, trafficking profile.
special_countries = [list(entry.keys())[0].title() for entry in special_data]
special_intro = [_tip_sections(entry)[0]['INTRO'] for entry in special_data]
special_tier = [_tip_sections(entry)[1]['TIER'] for entry in special_data]
special_government_effort = [_tip_sections(entry)[2]['GOVERNMENT EFFORTS'] for entry in special_data]
special_traffick_profile = [_tip_sections(entry)[3]['TRAFFICKING PROFILE'] for entry in special_data]

In [None]:
# Append the special-case countries to the per-country lists.
# NOTE: the lists are extended in place, so running this cell twice appends
# the special cases twice.
special_count = len(special_countries)

country.extend(special_countries)
intro.extend(special_intro)
tier.extend(special_tier)
# Special cases have no recommendation / prosecution / protection sections.
# Pad with one "NA" per special country (not a fixed four) so every list
# keeps the same length whatever the size of special_data.
recommendation.extend(["NA"] * special_count)
prosecution.extend(["NA"] * special_count)
protection.extend(["NA"] * special_count)
# The "GOVERNMENT EFFORTS" text is stored in the prevention list.
prevention.extend(special_government_effort)
traffick_profile.extend(special_traffick_profile)

In [None]:
# Assemble the per-country report lists into one frame (one row per country)
# and order the rows alphabetically by country. The original row labels are
# kept after sorting.
tip_columns = {
    "country": country,
    "introduction": intro,
    "tier": tier,
    "recommendation": recommendation,
    "prosecution": prosecution,
    "protection": protection,
    "prevention": prevention,
    "traffick_profile": traffick_profile,
}
tip_report = pd.DataFrame(tip_columns).sort_values(by=['country'])
tip_report.head()

In [None]:
acled2018vio_data = acled2018_data.loc[acled2018_data["event_type"]=="Violence against civilians"]
acled2018vio_data.head()

In [None]:
# Number of 2018 "Violence against civilians" events per country and weekday,
# pivoted to one row per country and one column per weekday.
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
weekday_counts = acled2018vio_data.groupby(["country","week_day"]).size()\
                .to_frame(name = 'count').reset_index()
week_group = weekday_counts.pivot(index='country', columns='week_day', values='count').fillna(0)
# reindex (rather than week_group[[...]]) orders the columns and also adds a
# zero column for any weekday with no events, instead of raising KeyError.
week_group = week_group.reindex(columns=weekday_order, fill_value=0)

In [None]:
week_group.head() 

In [None]:
# Number of 2018 "Violence against civilians" events per country and month,
# pivoted to one row per country and one column per month.
month_order = ["January","February","March","April","May","June",
               "July","August","September","October","November","December"]
month_counts = acled2018vio_data.groupby(["country","month"]).size()\
                .to_frame(name = 'count').reset_index()
month_group = month_counts.pivot(index='country', columns='month', values='count').fillna(0)
# reindex (rather than month_group[[...]]) orders the columns and also adds a
# zero column for any month with no events, instead of raising KeyError.
month_group = month_group.reindex(columns=month_order, fill_value=0)
month_group.head()


In [None]:
acled2018vio_data.head()

In [None]:
listEvents = [get_grouped(event_types[0]),pop_df,asylm_df,life_exp_df,pov_df,tip_report,week_group,month_group]
complete_df = reduce(lambda x,y: pd.merge(x,y, on = "country"), listEvents)
complete_df.head()

In [None]:
def get_notes(country):
    """Collect the event details of one country as five joined strings.

    Reads the notebook-level frame ``acled2018vio_data``.

    Parameters
    ----------
    country : object
        Country name; it is formatted to a string before being compared
        with the ``country`` column.

    Returns
    -------
    list of str
        ``[notes, location, latitude, longitude, fatalities]``. Notes are
        joined with a single space; the other four fields are joined with
        two spaces, with the numeric fields converted via ``str`` first.
    """
    rows = acled2018vio_data.loc[acled2018vio_data["country"] == f"{country}"]

    joined = [" ".join(rows["notes"].tolist())]
    joined.append("  ".join(rows["location"].tolist()))
    for column in ("latitude", "longitude", "fatalities"):
        as_text = [str(item) for item in rows[column].tolist()]
        joined.append("  ".join(as_text))
    return joined

In [None]:
# Attach the joined event details of each country to complete_df.
# Every detail column starts as a single space; that placeholder stays for
# any country whose details cannot be built.
detail_fields = ["notes", "location", "latitude", "longitude", "fatalities"]
for field in detail_fields:
    complete_df[field] = " "

for index in complete_df.index:
    country_name = complete_df.loc[index, "country"]
    try:
        # One call per country: get_notes filters the whole events frame, so
        # calling it once per field repeated the same work five times.
        details = get_notes(country_name)
    except Exception as exc:
        # Best effort, as before, but the failure is reported instead of
        # being swallowed silently.
        print(f"No event details for {country_name!r}: {exc}")
        continue
    for field, text in zip(detail_fields, details):
        complete_df.loc[index, field] = text

In [None]:
complete_df = complete_df.set_index("country")
complete_df.head(3)

In [None]:
# Build one dict per country ({country: {field: value}}) and turn the
# two-space-joined detail strings back into lists. "notes" stays a single
# string.
complete_dict = complete_df.T.to_dict()
# b holds the same dict objects as complete_dict, so the in-place edits
# below are visible through complete_dict as well.
b = list(complete_dict.values())
for record in b:
    for field in ("location", "latitude", "longitude", "fatalities"):
        record[field] = record[field].split("  ")

In [None]:
new_complete_df = pd.DataFrame(complete_dict).T
new_complete_df

In [None]:
new_complete_df.to_csv("Output/Complete_dataset.csv")
new_complete_df.T.to_json("Output/Complete_dataset.json")