In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
# Compare the column headers of the two survey files to see which file has
# columns that the other one lacks.
agroconcept_cols = set(pd.read_csv("assets/Atts_agroconcept_survey.csv", sep=';').columns)
nonpart_cols = set(pd.read_csv("assets/Atts_nonpart_survey.csv", sep=';').columns)

# first: columns only in the agroconcept file; second: columns only in the nonpart file
print(agroconcept_cols - nonpart_cols)
print(nonpart_cols - agroconcept_cols)

# the two sets were only needed for this check
del agroconcept_cols, nonpart_cols

{'mainprodcution', 'co22011', 'reducpercent', 'co22018', 'co22015'}
set()


In [3]:
# Read both semicolon-separated survey files and stack them row-wise into a
# single frame (agroconcept rows first, then nonpart rows).
# NOTE(review): no ignore_index is passed, so each file keeps its own row
# labels and index values repeat in the combined frame - confirm this is intended.
survey_paths = ["assets/Atts_agroconcept_survey.csv", "assets/Atts_nonpart_survey.csv"]
df = pd.concat([pd.read_csv(path, sep=';') for path in survey_paths])

In [4]:
# look up the positions of the first and last column of the old network block
network_first = df.columns.get_loc('net_name1_neigh')
network_last = df.columns.get_loc('net_name10_imp')
print(network_first, network_last)

113 212


In [5]:
# this is old network data which can be dropped
# The block is selected by its first and last column label (a label slice
# includes both ends) instead of by hard-coded positions. Positions only match
# while the column layout is exactly as it was when they were looked up, so a
# changed input file or a second run of this cell would drop the wrong columns.
old_network_cols = df.loc[:, 'net_name1_neigh':'net_name10_imp'].columns
df = df.drop(columns=old_network_cols)

In [6]:
# look up the positions of the first and last column of the block to remove next
conference_first = df.columns.get_loc('info_gew')
conference_last = df.columns.get_loc('mainprodcution')
print(conference_first, conference_last)

124 131


In [7]:
# these columns are only relevant for conference attendees so cannot be regressed on
# The block is selected by its first and last column label (a label slice
# includes both ends) instead of by hard-coded positions, so the selection does
# not depend on how many columns currently precede it.
conference_cols = df.loc[:, 'info_gew':'mainprodcution'].columns
df = df.drop(columns=conference_cols)

In [8]:
# A new network metric is computed from the new network data, so this degree
# metric is not needed.
df.drop(columns=['network'], inplace=True)

In [9]:
# look up the positions of the first and last column of the _imag block
imag_first = df.columns.get_loc('legum_imag')
imag_last = df.columns.get_loc('ecodr_imag')
print(imag_first, imag_last)

71 83


In [10]:
# these _imag columns contain little data
# The block from 'legum_imag' through 'ecodr_imag' is selected by label (a label
# slice includes both ends) instead of by hard-coded positions, so the selection
# does not depend on how many columns currently precede it.
imag_cols = df.loc[:, 'legum_imag':'ecodr_imag'].columns
df = df.drop(columns=imag_cols)

In [11]:
# There is very little data to waste, so the NaN values in the 4 incomplete
# rows are imputed instead of dropping those rows. Each group of columns below
# is filled with a different strategy.

# columns filled with the column mean
mean_lst = ['ley', 'perm_grass', 'oth_land', 'animal_unit', 'cattle_animal_unit', 'age']

# columns filled with the column median
median_lst = [
    'total_agr_land', 'workforce', 'trees', 'arable_land', 'add_agr_land', 'spec_crops',
    'dairy_cows', 'suckler_cows', 'hor_shee_goa', 'pigs_poultry', 'pigs', 'poultry',
    'eco_proof', 'educ', 'cons_general',
    'percep_hail', 'percep_drought', 'percep_frost', 'percep_heavyrain', 'percep_longrain',
    'percep_heat',
    'activ_dairy', 'activ_cattle', 'activ_pig', 'activ_poultry', 'activ_arab', 'activ_special',
    'activ_nonag',
    'attain_ghg', 'attain_yield', 'attain_biodiv', 'attain_soil', 'attain_incom', 'attain_acknow',
    'innov_pion', 'innov_early', 'innov_thorou', 'innov_others', 'innov_trad',
    'satisf_ldw', 'thresh_ldw', 'satisf_total', 'thresh_total',
    'share', 'others_opinions',
    'soc_impr', 'soc_inc', 'soc_env', 'soc_compinc', 'soc_compenv',
    'lott_1', 'lott_2', 'lott_3',
    'GHG_goal', 'env_goal', 'biodiv_goal', 'acknow_goal', 'yield_goal', 'income_goal',
]

# columns filled with the most frequent value
most_frequent_lst = ['farmtype', 'region']

# columns where a missing value is replaced by the constant 0
constant_zero_lst = [
    'perm_crops', 'prot_cult', 'qual_landscape', 'grapes', 'vegetables', 'fruits',
    'oth_spec_crops', 'organic', 'id_survey',
]

imputer_dict = {'mean': mean_lst, 'median': median_lst, 'most_frequent': most_frequent_lst}

# One entry per imputation pass: (strategy, fill_value, columns). Only the
# constant strategy needs a fill value; None is SimpleImputer's default.
imputation_plan = [(strategy, None, columns) for strategy, columns in imputer_dict.items()]
imputation_plan.append(('constant', 0, constant_zero_lst))

for strategy, fill_value, columns in imputation_plan:
    imputer = SimpleImputer(strategy=strategy, missing_values=np.nan, fill_value=fill_value)
    # fit on the selected columns and write the filled values back in one step
    df[columns] = imputer.fit_transform(df[columns])

del imputer

In [12]:
# persist the cleaned frame as a pickle file in the working directory
cleaned_path = 'cleaned_df.pkl'
df.to_pickle(cleaned_path)

finito