In [1]:
import numpy as np
import pandas as pd
from numpy import nan
import pickle
pd.set_option('display.max_rows', 200)

## Data Cleaning

### 1. Load data

In [3]:
# Load the comprehensive sample from its Stata (.dta) file into a DataFrame.
comprehensive = pd.read_stata('../Data/data/Comprehensive-Sample.dta')

In [4]:
# Summarise every column of the comprehensive sample; verbose=True forces the
# full per-column listing instead of an abbreviated summary.
comprehensive.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17920 entries, 0 to 17919
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   year                        17920 non-null  float32
 1   numcode                     17920 non-null  int16  
 2   oilreserves_full            14307 non-null  float32
 3   oilreserves                 12519 non-null  float32
 4   oilreserves_public          9950 non-null   float32
 5   newdiscovery_aspo           12354 non-null  float32
 6   aspo                        4736 non-null   float32
 7   wildcat                     12354 non-null  float32
 8   endowment                   14160 non-null  float32
 9   pop_maddison                10441 non-null  float64
 10  ecgrowth                    9341 non-null   float32
 11  efrac                       390 non-null    float32
 12  lfrac                       390 non-null    float32
 13  rfrac                       390

In [5]:
# Load the ASPO sample from its Stata (.dta) file into a DataFrame.
aspo = pd.read_stata('../Data/data/ASPO-Sample.dta')

In [6]:
# Summarise every column of the ASPO sample; verbose=True forces the full
# per-column listing instead of an abbreviated summary.
aspo.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5120 entries, 0 to 5119
Data columns (total 108 columns):
 #    Column                      Dtype  
---   ------                      -----  
 0    year                        float32
 1    numcode                     int16  
 2    oilreserves_full            float32
 3    oilreserves                 float32
 4    oilreserves_public          float32
 5    newdiscovery_aspo           float32
 6    imputedzero                 float32
 7    aspo                        float32
 8    wildcat                     float32
 9    endowment                   float32
 10   ecgrowth                    float32
 11   country                     object 
 12   incidence2COW               float64
 13   onset2COWCS                 float32
 14   incidenceU                  float32
 15   onsetUCS                    float32
 16   Fearon_war                  float64
 17   onset_FearonCS              float32
 18   Sambanis_war                float64
 19   onse

### 2. Helper function for the transformations

In [7]:
# Create a small sample dataframe for demonstrating transform_df.
# A locally seeded generator makes the demo reproducible under
# "Restart Kernel -> Run All" without touching NumPy's global random state.
rng = np.random.default_rng(0)
df = pd.DataFrame({"country": rng.choice(["A", "B", "C"], size=20),
                   "year": np.arange(20),
                   # Generator.integers excludes the upper bound, like randint.
                   "num1": rng.integers(22, 60, size=20),
                   "num2": rng.integers(20000, 90000, size=20)})

In [8]:
# Order rows by country, then year (ascending is the default), so the
# row-order-dependent lag / percent-change steps in transform_df see each
# country's years in sequence.
df = df.sort_values(['country', 'year'])

In [9]:
def transform_df(df, groupby_cols, log_cols, normal_cols):
    """Build per-group period-over-period change columns for selected columns.

    The columns of ``df`` are visited in their existing order:

    * a column named in ``log_cols`` yields ``<col>_diff`` = the current value
      divided by the previous row's value within the same group
      (``groupby(...).shift(1)``);
    * otherwise, a column named in ``normal_cols`` yields ``<col>_diff`` = the
      within-group ``pct_change()``.

    A column named in both lists is handled as a ``log_cols`` column. Names in
    either list that are not columns of ``df`` are silently ignored.

    NOTE(review): despite the ``_diff`` suffix, the ``log_cols`` result is a
    ratio, not a subtraction. If a difference of logs was intended this
    would need ``-`` rather than ``/`` -- confirm with the analysis design.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified (all work happens on a copy). Rows are
        not sorted here, so the caller must order them within each group
        beforehand for "previous row" to mean "previous period".
    groupby_cols : label or list of labels
        Passed to ``DataFrame.groupby`` to define the groups.
    log_cols : container of column names
        Columns transformed as value / lagged value.
    normal_cols : container of column names
        Columns transformed with ``pct_change()``.

    Returns
    -------
    pandas.DataFrame
        Only the generated ``*_diff`` columns, on the same index as ``df``.
        The intermediate ``*_lagged`` columns are not returned.
    """
    work = df.copy()
    diff_names = []

    # Snapshot the original columns: the loop adds columns to `work`.
    for source in list(work.columns):
        use_ratio = source in log_cols
        if not use_ratio and source not in normal_cols:
            continue  # column was not requested for any transformation

        per_group = work.groupby(groupby_cols)[source]
        target = source + '_diff'

        if use_ratio:
            lagged = source + '_lagged'
            work[lagged] = per_group.shift(1)
            work[target] = work[source] / work[lagged]
        else:
            work[target] = per_group.pct_change()

        diff_names.append(target)

    return work[diff_names]

In [10]:
# Show the sample frame that is fed to transform_df below.
df

Unnamed: 0,country,year,num1,num2
3,A,3,26,80204
5,A,5,50,64853
13,A,13,31,36506
14,A,14,46,82655
15,A,15,45,88479
0,B,0,55,68985
1,B,1,26,44382
2,B,2,53,85038
7,B,7,40,65781
9,B,9,30,76961


In [11]:
# Demo on the sample frame: ratio-to-previous for num1, percent change for
# num2, both computed within each country.
transform_df(df, groupby_cols=['country'], log_cols=['num1'], normal_cols=['num2'])

Unnamed: 0,num1_diff,num2_diff
3,,
5,1.923077,-0.191399
13,0.62,-0.437096
14,1.483871,1.264148
15,0.978261,0.070462
0,,
1,0.472727,-0.356643
2,2.038462,0.916047
7,0.754717,-0.226452
9,0.75,0.169958


--

In [31]:
#np.NaN - 5

In [30]:
#comprehensive['code3'].unique()

In [28]:
#comprehensive.query('code3 == "USA"')['logoutreg']
#comprehensive[['year','coup']].groupby('coup').count()

In [29]:
#comprehensive[['year','mountain']].groupby('mountain').count()

### 3. Comprehensive sample: cleaning

In [12]:
# Column names of interest in the comprehensive sample.
# NOTE(review): v_lst is not referenced by any other cell visible in this
# part of the notebook -- confirm it is used downstream or remove it.
# The blank lines split the list into three groups; the group meanings noted
# below are guesses from the names -- TODO confirm.
v_lst = ['onset2COWCS',
         'onsetUCS',
         'coup',
         'periregular',
         'logmilexgdpSIPRI',
         'numcode',
         'year',
         
         # presumably control variables -- TODO confirm
         'logGDP_M',
         'ecgrowth',
         'logpop_M',
         'logpopdens',
         'democracy',
         'logmountain',
         'ethnic_fractionalization',
         'religion_fractionalization',
         'language_fractionalization',
         'leg_british',
         'no_transition',
         'wildcat',
         
         # presumably oil-reserve measures plus logoutreg -- TODO confirm
         'logvaloilres',
         'logvaloilres_public',
         'logoilres',
         'logvaloilres_impute',
         'logoilres_impute',
         'logoutreg']

In [13]:
# variables that need transformation
# These are the same 12 names as log_lst + normal_lst (defined in the next cell).
# NOTE(review): transform_lst itself is not referenced by any other cell
# visible here -- only log_lst and normal_lst are passed to transform_df.
transform_lst = ['logmilexgdpSIPRI', # defense burden
                'logGDP_M', # there is another variable `ecgrowth`
                'logpop_M',
                'logpopdens',
                'democracy', # democracy index
                'wildcat', # presumably wildcat drilling -- TODO confirm
                'logoutreg', # presumably an out-of-region disaster measure -- TODO confirm
                'logvaloilres',
                'logvaloilres_public',
                'logoilres',
                'logvaloilres_impute',
                'logoilres_impute'
                ]

In [14]:
# Columns passed to transform_df as `log_cols`: each gets a `<col>_diff`
# column equal to the value divided by the previous row's value within the
# same group.
log_lst = ['logmilexgdpSIPRI', 
           'logGDP_M', 
           'logpop_M', 
           'logpopdens', 
           'logoutreg', 
           'logvaloilres', 
           'logvaloilres_public', 
           'logoilres', 
           'logvaloilres_impute', 
           'logoilres_impute']

# Columns passed to transform_df as `normal_cols`: each gets a `<col>_diff`
# column equal to the within-group pct_change().
normal_lst = ['democracy', 'wildcat']

In [15]:
# Order rows by numcode, then year (ascending is the default). transform_df
# does not sort, and its shift / pct_change steps depend on row order within
# each numcode group.
comprehensive = comprehensive.sort_values(['numcode', 'year'])

In [16]:
# Derive the `*_diff` columns within each numcode group.
new_df = transform_df(
    comprehensive,
    groupby_cols=['numcode'],
    log_cols=log_lst,
    normal_cols=normal_lst,
)

In [17]:
# Append the derived `*_diff` columns side by side with the original columns
# (rows are matched on the shared index).
comprehensive_new = pd.concat([comprehensive, new_df], axis='columns')

In [11]:
# Persist the frame that includes the derived `*_diff` columns. Saving
# `comprehensive` here would write the untransformed data under the
# "comprehensive_new" file name and the transformation would never be saved.
comprehensive_new.to_pickle("../Data/data/comprehensive_new.pkl")

In [12]:
# Inspect the combined frame (original columns followed by the `*_diff` columns).
comprehensive_new

Unnamed: 0,year,numcode,oilreserves_full,oilreserves,oilreserves_public,newdiscovery_aspo,aspo,wildcat,endowment,pop_maddison,...,logvaloilres_diff,logvaloilres_public_diff,logoilres_impute_diff,logvaloilres_impute_diff,logGDP_M_diff,logpop_M_diff,logpopdens_diff,democracy_diff,logmilexgdpSIPRI_diff,logoutreg_diff
0,1929.0,4,,,,,,,,,...,,,,,,,,,,
1,1930.0,4,,,,,,,,,...,,,,,,,,,,0.253965
2,1931.0,4,,,,,,,,,...,,,,,,,,0.000000,,1.000000
3,1932.0,4,,,,,,,,,...,,,,,,,,0.000000,,1.000000
4,1933.0,4,,,,,,,,,...,,,,,,,,0.000000,,3.934184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17915,2004.0,894,0.0,,0.0,,,,0.0,10962.026367,...,,1.0,,,1.005738,1.001604,1.005559,0.000000,,1.035035
17916,2005.0,894,0.0,,0.0,,,,0.0,11115.380859,...,,1.0,,,1.005738,1.001494,1.005156,0.000000,,1.055380
17917,2006.0,894,0.0,,0.0,,,,0.0,11288.252930,...,,1.0,,,1.006773,1.001657,1.005698,0.000000,,0.889851
17918,2007.0,894,0.0,,0.0,,,,0.0,11477.447266,...,,1.0,,,1.006682,1.001781,1.006102,0.000000,,1.035507
