# 1. Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [7]:
# Directory that holds the raw CSV. Setting the UN_MISSIONS_DATA_DIR environment
# variable overrides the default below, so the notebook can be run from another
# machine or checkout without editing this cell.
path = os.environ.get('UN_MISSIONS_DATA_DIR', '/Users/timovic/Documents/GitHub/UN-Peace-Missions/Data')
df = pd.read_csv(os.path.join(path, 'un_missions.csv'))

# 2. Clean data

In [8]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,mission_id,mission_abbrev,mission_fullname,mission_type,mission_location,gw_country,gwno_1,gwno_2,gwno_3,gwno_4,...,version,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76
0,1,UNTSO,United Nations Truce Supervision Organization,PKO,Israel/Palestine,ISR,666,,,,...,UNPMM_1.5,,,,,,,,,
1,1,UNTSO,United Nations Truce Supervision Organization,PKO,Israel/Palestine,ISR,666,,,,...,UNPMM_1.5,,,,,,,,,
2,1,UNTSO,United Nations Truce Supervision Organization,PKO,Israel/Palestine,ISR,666,,,,...,UNPMM_1.5,,,,,,,,,
3,1,UNTSO,United Nations Truce Supervision Organization,PKO,Israel/Palestine,ISR,666,,,,...,UNPMM_1.5,,,,,,,,,
4,1,UNTSO,United Nations Truce Supervision Organization,PKO,Israel/Palestine,ISR,666,,,,...,UNPMM_1.5,,,,,,,,,


In [9]:
# Keep every column up to and including 'version'; everything to its right is discarded
version_pos = df.columns.get_loc('version')
df = df.iloc[:, :version_pos + 1]

In [11]:
# Explore the data: print the row count plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 68 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   mission_id           1013 non-null   int64  
 1   mission_abbrev       1013 non-null   object 
 2   mission_fullname     1013 non-null   object 
 3   mission_type         1013 non-null   object 
 4   mission_location     1013 non-null   object 
 5   gw_country           1013 non-null   object 
 6   gwno_1               1013 non-null   int64  
 7   gwno_2               391 non-null    float64
 8   gwno_3               24 non-null     float64
 9   gwno_4               16 non-null     float64
 10  gwno_5               16 non-null     float64
 11  mission_regionclass  1013 non-null   object 
 12  region_code          1013 non-null   int64  
 13  ucdp_link            1013 non-null   int64  
 14  ucdpconflict_id1     899 non-null    float64
 15  ucdpconflict_id2     101 non-null    f

In [12]:
# Count the missing entries in each column
df.isna().sum()

mission_id            0
mission_abbrev        0
mission_fullname      0
mission_type          0
mission_location      0
                   ... 
max18               162
ocat01              162
mission_calc        222
mission_class       222
version               0
Length: 68, dtype: int64

In [13]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

0

In [15]:
# Pull the duplicated rows into their own frame and show its (rows, columns) shape
is_repeat = df.duplicated()
df_dups = df.loc[is_repeat]
df_dups.shape

(0, 68)

In [16]:
# Check for mixed-type columns: print every column whose values are not all of one
# Python type (for example strings mixed with float NaN).
# Series.map is used instead of DataFrame.applymap, which is deprecated as of
# pandas 2.1; counting distinct types also avoids indexing row 0, so an empty
# frame no longer raises.
for col in df.columns:
    if df[col].map(type).nunique() > 1:
        print(col)

mission_startdate
mission_enddate
mission_class


In [17]:
# Preview the first 5 rows of the three columns reported as mixed-type
flagged_cols = ['mission_startdate', 'mission_enddate', 'mission_class']
df[flagged_cols].head()

Unnamed: 0,mission_startdate,mission_enddate,mission_class
0,29.05.1948,31.12.2020,
1,29.05.1948,31.12.2020,
2,29.05.1948,31.12.2020,
3,29.05.1948,31.12.2020,
4,29.05.1948,31.12.2020,


In [19]:
# Look up the dtype pandas assigned to each of the three flagged columns
df.dtypes.loc[['mission_startdate', 'mission_enddate', 'mission_class']]

mission_startdate    object
mission_enddate      object
mission_class        object
dtype: object

In [21]:
# Convert the 'mission_startdate' and 'mission_enddate' columns to datetime.
# The raw strings are day-first (e.g. '29.05.1948'), so dayfirst=True is passed to
# stop ambiguous dates (day <= 12) from being read month-first.
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
for date_col in ['mission_startdate', 'mission_enddate']:
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)

# 3. Explore data

In [27]:
# Pairwise correlations between the numeric columns.
# Since pandas 2.0, DataFrame.corr no longer drops non-numeric columns on its own and
# raises on text/datetime data, so the numeric (and boolean) columns are selected
# explicitly; this form also works on older pandas releases.
df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,mission_id,gwno_1,gwno_2,gwno_3,gwno_4,gwno_5,region_code,ucdp_link,ucdpconflict_id1,ucdpconflict_id2,...,max11,max12,max13,max14,max15,max16,max17,max18,ocat01,mission_calc
mission_id,1.0,0.023791,0.0428,0.628655,1.0,1.0,0.359009,-0.062512,0.254583,0.504767,...,0.054757,0.094646,0.14518,0.106946,0.092352,0.077551,0.065729,0.167696,0.053798,0.286683
gwno_1,0.023791,1.0,0.868495,0.997974,1.0,1.0,-0.235911,0.278386,0.087643,0.346834,...,0.010464,-0.132267,-0.031904,0.017413,-0.117717,-0.062487,0.027326,-0.029146,-0.023807,-0.242209
gwno_2,0.0428,0.868495,1.0,1.0,1.0,1.0,-0.218725,0.599733,-0.021136,0.254651,...,-0.03189,-0.104849,,0.109189,0.105556,0.064166,0.065831,0.092145,0.048307,0.183417
gwno_3,0.628655,0.997974,1.0,1.0,1.0,1.0,-0.962126,-0.772761,1.0,,...,,,,,,,,,,1.0
gwno_4,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,,,...,,,,,,,,,,
gwno_5,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,,,...,,,,,,,,,,
region_code,0.359009,-0.235911,-0.218725,-0.962126,-1.0,-1.0,1.0,-0.116046,0.150128,0.590745,...,0.036345,0.134789,0.100309,0.089431,0.164783,0.096467,0.018037,0.120615,0.089776,0.371068
ucdp_link,-0.062512,0.278386,0.599733,-0.772761,-1.0,-1.0,-0.116046,1.0,0.011257,,...,0.043148,0.051238,0.043148,0.04459,0.037544,0.055927,0.040121,0.035336,0.052475,0.06113
ucdpconflict_id1,0.254583,0.087643,-0.021136,1.0,,,0.150128,0.011257,1.0,0.35516,...,0.067152,0.007957,0.020361,0.064541,0.013181,0.01298,0.025441,0.034939,0.046174,0.056844
ucdpconflict_id2,0.504767,0.346834,0.254651,,,,0.590745,,0.35516,1.0,...,-0.075191,0.01187,0.129262,0.01187,0.017938,-0.06689,0.038842,0.016879,-0.037753,-0.101841
