In [259]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Predicting onset of the conflict in the comming 12 months at admin2 level in Ethiopia
# Data preparation for timeseries analysis 
#### Y= cc_onset
#### X= 'fatalities', 'actor_state', 'actor_rebel_groups', 'actor_political_militias', 'actor_identity_militias', 'actor_civilians', 'actor_others', 'cc_onset', 'inflation_all', 'rainfall', 'rainfall_lag', 'temperature mean', 'btotl', 'maize_ETB_KG', 'teff_ETB_KG', 'wheat_ETB_KG', 'mean ndvi', 'fs_Crisis', 'fs_Emergency', 'fs_Famine', 'fs_Minimal', 'fs_Stressed'

#### Group variables: admin1, admin2, year, month

#### data stracture objective Yt=X(t-1)+...+X(t-12)

#### where t is a month lag 

In [260]:
url = 'https://data.kimetrica.com/dataset/4dbc3cc7-9474-49f2-bfd4-231e78401caa/resource/7423b71d-ce8c-437c-9fe6-2d9ba58d6155/download/dataset_communal_cnflict_model.csv'
df = pd.read_csv(url, index_col=0)
list(df.columns)

['admin1',
 'admin2',
 'year',
 'month',
 'fatalities',
 'cc_event_count',
 'actor_state',
 'actor_rebel_groups',
 'actor_political_militias',
 'actor_identity_militias',
 'actor_civilians',
 'actor_others',
 'cc_onset',
 'admin0',
 'inflation_all',
 'inflation_food',
 'inflation_non_food',
 'rainfall',
 'rainfall_lag',
 'temperature mean',
 'btotl',
 'maize_ETB_KG',
 'teff_ETB_KG',
 'wheat_ETB_KG',
 'mean ndvi',
 'ndvi_lag',
 'fs_Crisis',
 'fs_Emergency',
 'fs_Famine',
 'fs_Minimal',
 'fs_Stressed',
 'cluster']

In [261]:
df['date'] = pd.to_datetime(df.assign(Day=1).loc[:, ['year','month','Day']])
df.rename(columns={'cluster':'group'}, inplace=True)
df.shape

(15895, 33)

In [262]:
group = ["admin1", "admin2"] # to assign the groups for the multiple group case
df_combo = df[['date','group', 'cc_onset','fatalities',
 'actor_state',
 'actor_rebel_groups',
 'actor_political_militias',
 'actor_identity_militias',
 'actor_civilians',
 'actor_others',
 'inflation_all',
 'rainfall',
 'temperature mean',
 'btotl',
 'maize_ETB_KG',
 'teff_ETB_KG',
 'wheat_ETB_KG',
 'mean ndvi',
 'fs_Crisis',
 'fs_Emergency',
 'fs_Famine',
 'fs_Minimal',
 'fs_Stressed',]] # many vars, many groups
grouped_df = df_combo.groupby(["group"])

def lag_by_group(key, value_df):
    df = value_df.assign(group = key) # this pandas method returns a copy of the df, with group columns assigned the key value
    return (df.sort_values(by=["date"], ascending=True)
        .set_index(["date"])
        .shift(1)
               ) # the parenthesis allow you to chain methods and avoid intermediate variable assignment


In [266]:
dflist = [lag_by_group(g, grouped_df.get_group(g)) for g in grouped_df.groups.keys()]
df1=pd.concat(dflist, axis=0).reset_index()
df1


Unnamed: 0,date,group,cc_onset,fatalities,actor_state,actor_rebel_groups,actor_political_militias,actor_identity_militias,actor_civilians,actor_others,...,btotl,maize_ETB_KG,teff_ETB_KG,wheat_ETB_KG,mean ndvi,fs_Crisis,fs_Emergency,fs_Famine,fs_Minimal,fs_Stressed
0,1997-01-01,,,,,,,,,,...,,,,,,,,,,
1,1997-02-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2,1997-03-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
3,1997-04-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
4,1997-05-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
5,1997-06-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
6,1997-07-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
7,1997-08-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
8,1997-09-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
9,1997-10-01,Addis Ababa_Addis Ababa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
