# Double ML - data wrangling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Data Wrangling

In [2]:

# Load the merged panel; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../../data/merged_052423.csv')

In [3]:
# List every column of the raw file; the next cell keeps only a subset of them.
df.columns

Index(['country', 'sect', 't', 'source', 'outp', 'outpd', 'outptt', 'gvco',
       'gvcobp', 'gvcofp', 'gvcomix', 'gvcobp%', 'gvcofp%', 'gvcomix%',
       'gvcobp_diff', 'gvcofp_diff', 'gvcomix_diff', 'Unnamed: 0',
       'onset2COWCS', 'd2incidenceU', 'd3_6incidenceU', 'onsetUCS', 'coup',
       'periregular', 'milexp_pergdpSIPRI', 'decade', 'ecgrowth', 'logpop_M',
       'logpopdens', 'logoutreg', 'democracy', 'logmountain',
       'ethnic_fractionalization', 'religion_fractionalization',
       'language_fractionalization', 'leg_british', 'opec',
       'milexp_pergdpSIPRI_diff', 'logpop_M_diff', 'logpopdens_diff',
       'logoutreg_diff', 'ecgrowth_demeaned'],
      dtype='object')

In [4]:
# Narrow the frame to the columns used from here on. `df` is rebound, so the
# other columns of the raw file are no longer reachable through this name.
df = df[[
    # panel keys: country, sector code, period
    'country', 'sect', 't',
    # the three *_diff GVC measures (bp / fp / mix); one is kept per sub-frame below
    'gvcobp_diff', 'gvcofp_diff', 'gvcomix_diff',
    # presumably the conflict-onset outcome -- TODO confirm
    'onset2COWCS',
    # remaining covariates
    'decade',
    'democracy',
    'logmountain',
    'ethnic_fractionalization',
    'religion_fractionalization',
    'language_fractionalization',
    'leg_british',
    'opec',
    'logpop_M_diff',
    'logpopdens_diff',
    'logoutreg_diff',
    'ecgrowth_demeaned',
]]

In [5]:
# One frame per GVC measure: each keeps exactly one of the three *_diff
# columns and drops the other two.
df_mix = df.drop(columns=['gvcobp_diff', 'gvcofp_diff'])
df_bp = df.drop(columns=['gvcofp_diff', 'gvcomix_diff'])
df_fp = df.drop(columns=['gvcobp_diff', 'gvcomix_diff'])

### 1.1 df_mix

In [6]:
# Long -> wide: one row per (country, t) and one column per `sect` value,
# holding that sector's 'gvcomix_diff'. aggfunc='first' takes the first
# non-null entry if a (country, t, sect) combination occurs more than once.
pivoted_df = (
    df_mix
    .pivot_table(index=['country', 't'], columns='sect',
                 values='gvcomix_diff', aggfunc='first')
    .reset_index()
)

# All remaining columns (sector id and sector value removed) are de-duplicated
# and joined back onto the wide sector columns by (country, t).
merged_df = (
    df_mix
    .drop(columns=['sect', 'gvcomix_diff'])
    .drop_duplicates()
    .merge(pivoted_df, on=['country', 't'])
)

df_mix_new = merged_df

In [7]:
# Give five of the pivoted sector-code columns readable names; any other
# sector columns keep their numeric label.
df_mix_new = df_mix_new.rename(
    columns={
        1: 'treat_agri',
        2: 'treat_mine',
        6: 'treat_fuel',
        10: 'treat_metal',
        19: 'iv_transport',
    }
)

In [8]:
# Per-period (`t`) mean of each treatment column over all rows of that period,
# stored under iv_* names. Note the mean includes the row's own value.
mean_data = (
    df_mix_new
    .groupby('t')[['treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal']]
    .mean()
    .reset_index()
    .rename(columns={
        'treat_agri': 'iv_agri',
        'treat_mine': 'iv_mine',
        'treat_fuel': 'iv_fuel',
        'treat_metal': 'iv_metal',
    })
)

# Attach the per-period means to every row sharing the same `t`.
df_mix_new = df_mix_new.merge(mean_data, on='t')

In [9]:
# Helper used by the tercile-binarisation loops in this notebook.
def categorize_value(value, q1_3, q2_3):
    """Turn a number into a binary indicator using two cut-offs.

    Parameters
    ----------
    value
        The number to classify.
    q1_3
        Lower cut-off (the loops in this notebook pass the 1/3 quantile).
    q2_3
        Upper cut-off (the loops in this notebook pass the 2/3 quantile).

    Returns
    -------
    1 when ``value`` is strictly greater than ``q2_3``, 0 when it is strictly
    less than ``q1_3``, and ``np.nan`` in every other case. That covers values
    between or equal to the cut-offs, and a NaN ``value``, for which both
    comparisons are false.
    """
    # The upper cut-off is tested first, exactly as the if/elif order implies.
    if value > q2_3:
        return 1
    if value < q1_3:
        return 0
    return np.nan

# Columns that get binarised in place.
columns = [
    'treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal',
    'iv_transport',
    'iv_agri', 'iv_mine', 'iv_fuel', 'iv_metal',
]

# For each column, take the 1/3 and 2/3 quantiles of the current values, then
# overwrite the column: above the upper cut-off -> 1, below the lower -> 0,
# everything else (middle band, ties with a cut-off, missing) -> NaN.
for col in columns:
    q1_3, q2_3 = (df_mix_new[col].quantile(q) for q in (1/3, 2/3))
    df_mix_new[col] = df_mix_new[col].apply(categorize_value, args=(q1_3, q2_3))

In [10]:
# Write the 'mix' frame to the working directory, without the row index.
df_mix_new.to_csv(path_or_buf='df_mix.csv', index=False)

### 1.2 df_bp

In [11]:
# Long -> wide: one row per (country, t) and one column per `sect` value,
# holding that sector's 'gvcobp_diff'. aggfunc='first' takes the first
# non-null entry if a (country, t, sect) combination occurs more than once.
pivoted_df = (
    df_bp
    .pivot_table(index=['country', 't'], columns='sect',
                 values='gvcobp_diff', aggfunc='first')
    .reset_index()
)

# All remaining columns (sector id and sector value removed) are de-duplicated
# and joined back onto the wide sector columns by (country, t).
merged_df = (
    df_bp
    .drop(columns=['sect', 'gvcobp_diff'])
    .drop_duplicates()
    .merge(pivoted_df, on=['country', 't'])
)

df_bp_new = merged_df

In [12]:
# Give five of the pivoted sector-code columns readable names; any other
# sector columns keep their numeric label.
df_bp_new = df_bp_new.rename(
    columns={
        1: 'treat_agri',
        2: 'treat_mine',
        6: 'treat_fuel',
        10: 'treat_metal',
        19: 'iv_transport',
    }
)

In [13]:
# Per-period (`t`) mean of each treatment column over all rows of that period,
# stored under iv_* names. Note the mean includes the row's own value.
mean_data = (
    df_bp_new
    .groupby('t')[['treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal']]
    .mean()
    .reset_index()
    .rename(columns={
        'treat_agri': 'iv_agri',
        'treat_mine': 'iv_mine',
        'treat_fuel': 'iv_fuel',
        'treat_metal': 'iv_metal',
    })
)

# Attach the per-period means to every row sharing the same `t`.
df_bp_new = df_bp_new.merge(mean_data, on='t')

In [14]:
# Columns that get binarised in place.
columns = [
    'treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal',
    'iv_transport',
    'iv_agri', 'iv_mine', 'iv_fuel', 'iv_metal',
]

# For each column, take the 1/3 and 2/3 quantiles of the current values, then
# overwrite the column via `categorize_value` (defined in an earlier cell):
# above the upper cut-off -> 1, below the lower -> 0, everything else -> NaN.
for col in columns:
    q1_3, q2_3 = (df_bp_new[col].quantile(q) for q in (1/3, 2/3))
    df_bp_new[col] = df_bp_new[col].apply(categorize_value, args=(q1_3, q2_3))

In [15]:
# Write the 'bp' frame to the working directory, without the row index.
df_bp_new.to_csv(path_or_buf='df_bp.csv', index=False)

### 1.3 df_fp

In [16]:
# Long -> wide: one row per (country, t) and one column per `sect` value,
# holding that sector's 'gvcofp_diff'. aggfunc='first' takes the first
# non-null entry if a (country, t, sect) combination occurs more than once.
pivoted_df = (
    df_fp
    .pivot_table(index=['country', 't'], columns='sect',
                 values='gvcofp_diff', aggfunc='first')
    .reset_index()
)

# All remaining columns (sector id and sector value removed) are de-duplicated
# and joined back onto the wide sector columns by (country, t).
merged_df = (
    df_fp
    .drop(columns=['sect', 'gvcofp_diff'])
    .drop_duplicates()
    .merge(pivoted_df, on=['country', 't'])
)

df_fp_new = merged_df

In [17]:
# Give five of the pivoted sector-code columns readable names; any other
# sector columns keep their numeric label.
df_fp_new = df_fp_new.rename(
    columns={
        1: 'treat_agri',
        2: 'treat_mine',
        6: 'treat_fuel',
        10: 'treat_metal',
        19: 'iv_transport',
    }
)

In [18]:
# Per-period (`t`) mean of each treatment column over all rows of that period,
# stored under iv_* names. Note the mean includes the row's own value.
mean_data = (
    df_fp_new
    .groupby('t')[['treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal']]
    .mean()
    .reset_index()
    .rename(columns={
        'treat_agri': 'iv_agri',
        'treat_mine': 'iv_mine',
        'treat_fuel': 'iv_fuel',
        'treat_metal': 'iv_metal',
    })
)

# Attach the per-period means to every row sharing the same `t`.
df_fp_new = df_fp_new.merge(mean_data, on='t')

In [19]:
# Columns that get binarised in place.
columns = [
    'treat_agri', 'treat_mine', 'treat_fuel', 'treat_metal',
    'iv_transport',
    'iv_agri', 'iv_mine', 'iv_fuel', 'iv_metal',
]

# For each column, take the 1/3 and 2/3 quantiles of the current values, then
# overwrite the column via `categorize_value` (defined in an earlier cell):
# above the upper cut-off -> 1, below the lower -> 0, everything else -> NaN.
for col in columns:
    q1_3, q2_3 = (df_fp_new[col].quantile(q) for q in (1/3, 2/3))
    df_fp_new[col] = df_fp_new[col].apply(categorize_value, args=(q1_3, q2_3))

In [20]:
# Write the 'fp' frame to the working directory, without the row index.
df_fp_new.to_csv(path_or_buf='df_fp.csv', index=False)