In [18]:
import re
import string
import string
from pathlib import Path
import math
import pickle

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer


# data = pd.read_csv('../RHoMIS_ADS_Project_2021/Data/RHoMIS_Indicators.csv',encoding='latin1')
data  = pd.read_pickle('./preprocessed_data.pkl')

# Data inspection ---------------------------------------------------------------------------
def inspect_data(data):
	print(data.head())
	print(data.count())
	print(data.shape)
	print(data.info())
	for column in data:
	    print(data[column].describe())

#inspect_data(data)

# Data Wrangling --------------------------------------------------------------------------------


# replace negative values for features that are bounded to be positive only  as distance metrics like Land cultivated measured in hectares or Income and PPP earned 

def replace_negative(data,columns):
    for col in columns:
        data.loc[data[col] < 0] = 0
    


def drop_columns(data):
	negative_col = ['LandCultivated', 'LandOwned', 'currency_conversion_factor','total_income_USD_PPP_pHH_Yr','offfarm_income_USD_PPP_pHH_Yr','value_livestock_prod_consumed_USD_PPP_pHH_Yr','NrofMonthsWildFoodCons']

	categorical_col = ['Country','HouseholdType','Head_EducationLevel', 'WorstFoodSecMonth' ,'BestFoodSecMonth','HFIAS_status']
	# Head_EducationLevel specification about  possible values was not given so we omit this for now 


	replace_negative(data, negative_col)

	data_model = data.copy()
# 	data_model = data.drop(['ID_PROJ','ID_COUNTRY','SURVEY_ID','Region'],axis=1)
	data_model.drop(['Head_EducationLevel'],axis=1)
	data_model.set_index('RHoMIS_ID')
	return data_model



"""
# Print objects
for col in data:
    if data[col].dtype == object:
        print(col)
"""


def replace_missing_with_nan(data_model):
	# replace  HFIAS status with 0 with missing value 
	data_model.loc[data_model['HFIAS_status'] == 0] = np.NaN


	#replace WorstFoodSecMonth and BestFoodSecMonth with No_answer or none with  missing value 
	data_model.loc[data_model['WorstFoodSecMonth'] == 'No_answer'] = np.NaN
	data_model.loc[data_model['WorstFoodSecMonth'] == 'None'] = np.NaN
	data_model.loc[data_model['WorstFoodSecMonth'] == 'no_answer'] = np.NaN
	data_model.loc[data_model['BestFoodSecMonth'] == 'No_answer'] = np.NaN
	data_model.loc[data_model['BestFoodSecMonth'] == 'no_answer'] = np.NaN
	data_model.loc[data_model['BestFoodSecMonth'] == 'None'] = np.NaN

	#replace HouseHold type with no answer to missing value 
# 	data_model.loc[data_model['HouseholdType'] == 'no_answer'] = np.NaN
# 	data_model.loc[data_model['HouseholdType'] == '0'] = np.NaN

	return data_model



# Dictionary for months in different languange to english


def process_months(var, translations):
    if var in translations:
            return  translations.get(var)
    else: return var    
                


def translate_words(data_model):
	months_to_eng = {'ukuboza': 'dec', 'gashyantare' : 'feb', 'kamena' : 'jun', 'mutarama': 'jan', 'nyakanga' : 'jul' , 'nzeri' : 'sep', 'ukwakira' : 'oct',
                 'gicurasi' : 'may' , 'werurwe' : 'mar', 'kanama' : 'aug', 'ugushyingo' : 'nov' , 'mata' : 'apr'}

	translate = lambda x : process_months(x, months_to_eng)
	data_model['BestFoodSecMonth'] = data_model.BestFoodSecMonth.apply(translate)
	data_model['WorstFoodSecMonth'] = data_model.WorstFoodSecMonth.apply(translate)
	return data_model




def hfias_status_vis(data_model):
	HFIAS_status_count = data_model['HFIAS_status'].value_counts()
	sns.set(style="darkgrid")
	sns.barplot(x = HFIAS_status_count.index, y = HFIAS_status_count.values, alpha=0.9)
	plt.title('Frequency Distribution of HFIAS_status')
	plt.ylabel('Number of Occurrences', fontsize=12)
	plt.xlabel('HFIAS_status', fontsize=12)
	plt.show()


"""
# encode ordinal data 
HFIAS_status = {'SeverelyFI':0,'ModeratelyFI':1,'MildlyFI':2,'FoodSecure':3 }           
data_model['HFIAS_status'] = data_model.HFIAS_status.apply(process_status)

data_model['HFIAS_status'].value_counts()
"""

#print(data_model['Country'].value_counts())

data_model = drop_columns(data)
data_model = replace_missing_with_nan(data_model)
data_model = translate_words(data_model)

# using one got encoding to encode categorical  data , country and  household type 
# enc_data = pd.get_dummies(data_model, prefix=['Nat','Type'], columns=['Country','HouseholdType'])

# pickle the data to be used 
enc_data.to_pickle('preprocessed_data.pkl')
#print(enc_data)


# Visualizations
#hfias_status_vis(data_model)

# Imputation ---------------------------------------------------------------------------------

# Graph displaying amount of missing data for each featurei

def missing_data_vis():
	missing_data = pd.DataFrame(data[data.columns[data.isnull().any()]].isnull().sum()/len(data)*100)

	names = []
	for i in range(len(missing_data)):
		names.append(missing_data.iloc[i].name)
	values = []
	for i in range(len(missing_data)):
		values.append(missing_data.iloc[i][0])

	data_1 = {'Features': names,'Missing Data Percentage': values}

	# Dictionary loaded into a DataFrame       
	df = pd.DataFrame(data=data_1)
	df.plot.bar(x="Features", y="Missing Data Percentage", title="Features with Missing Data",figsize=(10,6))
	plt.show(block=True)


#missing_data_vis()

In [19]:
def mice_imputer(df):
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df)
    imp.transform(df)
#    

print(enc_data)
print(mice_imputer(enc_data))

         YEAR  ITERATION              ID_HH          RHoMIS_ID  GPS_LAT  \
0      2019.0        1.0    BF_2019_ADN_1_1    BF_2019_ADN_1_1    11.19   
1      2019.0        1.0    BF_2019_ADN_2_1    BF_2019_ADN_2_1    11.19   
2      2019.0        1.0    BF_2019_ADN_3_1    BF_2019_ADN_3_1    11.19   
3      2019.0        1.0    BF_2019_ADN_4_1    BF_2019_ADN_4_1    11.19   
4      2019.0        1.0    BF_2019_ADN_5_1    BF_2019_ADN_5_1    11.22   
...       ...        ...                ...                ...      ...   
35723  2017.0        1.0  ZM_2017_SCN_606_1  ZM_2017_SCN_606_1   -13.82   
35724  2017.0        1.0  ZM_2017_SCN_607_1  ZM_2017_SCN_607_1   -13.82   
35725  2017.0        1.0  ZM_2017_SCN_608_1  ZM_2017_SCN_608_1   -13.82   
35726  2017.0        1.0  ZM_2017_SCN_609_1  ZM_2017_SCN_609_1   -13.82   
35727  2017.0        1.0  ZM_2017_SCN_610_1  ZM_2017_SCN_610_1   -13.82   

       GPS_LON  GPS_ALT     Altitude  HHsizemembers  HHsizeMAE  ...  \
0        -0.95   254.00   25

ValueError: could not convert string to float: 'BF_2019_ADN_1_1'