In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import pickle
import string
import re
import string
from sklearn.model_selection import cross_validate as cross_validation, ShuffleSplit, cross_val_score, train_test_split, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, auc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pickle

# Data Cleaning 

In [3]:
data  = pd.read_csv('/../RHoMIS_Data/Data/RHoMIS_Indicators.csv',encoding='latin1')

FileNotFoundError: [Errno 2] No such file or directory: '/../RHoMIS_Data/Data/RHoMIS_Indicators.csv'

In [None]:
data.head()

In [None]:
data.count()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
for column in data:
    print(data[column].describe())


In [None]:
negative_col = ['LandCultivated', 'LandOwned', 'currency_conversion_factor','total_income_USD_PPP_pHH_Yr','offfarm_income_USD_PPP_pHH_Yr','value_livestock_prod_consumed_USD_PPP_pHH_Yr','NrofMonthsWildFoodCons']

In [None]:
categorical_col = ['Country','HouseholdType','Head_EducationLevel', 'WorstFoodSecMonth' ,'BestFoodSecMonth','HFIAS_status']
# Head_EducationLevel specification about  possible values was not given so we omit this for now 

In [None]:
# replace negative values for features that are bounded to be positive only  as distance metrics like Land cultivated measured in hectares or Income and PPP earned 

def replace_negative(data,columns):
    for col in columns:
        data.loc[data[col] < 0] = 0
    

# Dictionary for months in different languange to english
months_to_eng = { 'ukuboza': 'dec','gashyantare' : 'feb', 'kamena' : 'jun', 'mutarama': 'jan', 'nyakanga' : 'jul' ,'nzeri' : 'sep','ukwakira' : 'oct',
                 'gicurasi' : 'may' , 'werurwe' : 'mar', 'kanama' : 'aug','ugushyingo' : 'nov' ,'mata' : 'apr'  }

def process_months(var):
    if var in months_to_eng:
            return  months_to_eng.get(var)
    else: return var    
                
translate = lambda x : process_months(x)

def process_status(var):
    if var in HFIAS_status:
        return HFIAS_status.get(var)

encode = lambda x : process_status(x)

# encode ordinal data 
HFIAS_status = {'SeverelyFI':4,'ModeratelyFI':3,'MildlyFI':2,'FoodSecure':1 }
status = ['SeverelyFI','ModeratelyFI','MildlyFI','FoodSecure']

# assigning food security level to raw scores 
def discrete_assignment(score):
    if score >= 7: return status[0]
    elif (score == 4) or (score == 5) or (score == 6): return status[1]
    elif (score == 2) or (score == 3): return status[2]
    elif (score == 0) or (score == 1): return status[3]
    else: return np.NaN 

fies_assignment = lambda x : discrete_assignment(x)


def process_scales(hfias,fies):
    if ( pd.isnull(hfias)) and (pd.isnull(fies)): return np.NaN
    elif (pd.isnull(hfias)) and (pd.notnull(fies)): return fies
    elif (pd.notnull(hfias)) and (pd.notnull(fies)): return hfias
#     hfias not missing , fies missing
    else: return hfias 

new_scale = lambda x : process_scales(x['HFIAS_status'],x['FIES_Score'])


map_educationlevel = {'primary':'primary',
                      'No_school':'no_school',
                      'secondary':'secondary',
                      'no_school':'no_school',
                      'postsecondary':'postsecondary',
                      'adulteducation':'adulteducation',
                      'illiterate':'illiterate',
                      'literate':'literate',
                      'secondary_1':'secondary',
                      'primary_2':'primary',
                       'no school': 'no_school',
                     'lower_secondary': 'secondary',
                     'secondary2':'secondary',
                      'primary_1': 'primary'}


def process_educationlevel(var):
    if var in map_educationlevel:
            return  map_educationlevel.get(var)
    else: return 'Other'    
                
    
education = lambda x : process_educationlevel(x)

In [None]:
def data_preprocessing(data):
    data_model = data.copy()
    data_model = data.drop(['ID_PROJ','ID_COUNTRY','SURVEY_ID','Region'],axis=1)
#     replace negative values with zero 
    replace_negative(data_model,negative_col)
    # replace  HFIAS status with 0 with missing value 
    data_model['HFIAS_status'] = data_model['HFIAS_status'].replace([0,'0'],np.NaN)
    # replace year 0's with missing value
    data_model['YEAR'] = data_model['YEAR'].replace([0,'0'],np.NaN)
    #replace HHmembers and HHsizemae 0's with missing value
    data_model['HHsizemembers'] = data_model['HHsizemembers'].replace(['0',0],np.NaN)
    data_model['HHsizeMAE'] = data_model['HHsizeMAE'].replace([0,'0'],np.NaN)
    # set negative values of livestock holdings to zero
    data_model.loc[data_model['LivestockHoldings'] < 0,  'LivestockHoldings'] = 0
    #replace WorstFoodSecMonth and BestFoodSecMonth with No_answer or none with  missing value 

    data_model['WorstFoodSecMonth'] = data_model['WorstFoodSecMonth'].replace(['No_answer','no_answer','None',0,'0'],np.NaN)
    data_model['BestFoodSecMonth'] = data_model['BestFoodSecMonth'].replace(['No_answer','no_answer','None',0,'0'],np.NaN)
#     replace HouseHold type with no answer to missing value 
    data_model['HouseholdType'] = data_model['HouseholdType'].replace(['no_answer',0,'0'],np.NaN)
#     replace some  head_Educationlevel responsees to missing value
    data_model['Head_EducationLevel'] = data_model['Head_EducationLevel'].replace(['No_answer','no_answer','None',0,'0'],np.NaN)
#     translate months to english
    data_model['BestFoodSecMonth'] = data_model.BestFoodSecMonth.apply(translate)
    data_model['WorstFoodSecMonth'] = data_model.WorstFoodSecMonth.apply(translate)
#     encode categorical  HFIAS status
    data_model['HFIAS_status'] = data_model.HFIAS_status.apply(process_status)
#     perform discrete assignments on the FIES scores 
    data_model['FIES_Score'] = data_model.FIES_Score.apply(fies_assignment)
#     encode categorical FIES status
    data_model['FIES_Score'] =  data_model.FIES_Score.apply(process_status)
#    create new column representing uniform food insecurity score across dataset
    data_model['Food_InsecurityLevel'] = data_model.apply(new_scale,axis=1)
    data_model = data_model.drop(['HFIAS_status','FIES_Score'],axis=1)
#     map Head_educationLevel to specific values 
    data_model['Head_EducationLevel'] = data_model['Head_EducationLevel'].apply(education)
    return data_model
    
    

    
    

In [None]:
clean_data = data_preprocessing(data)
clean_data_cp = clean_data.copy()
clean_data_cp = clean_data.drop(clean_data_cp[clean_data_cp['Country'] == 0].index)

In [None]:
#data pefore imputation of categorical features

# clean_data.to_pickle('basic_preprocessed_data.pkl')

### Impute  missing categorical data 

In [None]:
# using simple imputer 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

clean_data_cp.BestFoodSecMonth = imputer.fit_transform(clean_data_cp['BestFoodSecMonth'].values.reshape(-1,1))[:,0]

clean_data_cp.WorstFoodSecMonth = imputer.fit_transform(clean_data_cp['WorstFoodSecMonth'].values.reshape(-1,1))[:,0]

clean_data_cp.Head_EducationLevel = imputer.fit_transform(clean_data_cp['Head_EducationLevel'].values.reshape(-1,1))[:,0]

clean_data_cp.HouseholdType = imputer.fit_transform(clean_data_cp['HouseholdType'].values.reshape(-1,1))[:,0]

#  Altitude and GPS__ALT are similar so drop one 
clean_data_cp = clean_data_cp.drop(['Altitude'],axis=1)


In [None]:
# #save file before encoding 
# clean_data_cp.to_pickle('preprocessed_data.pkl')

### Encode categorical data

In [None]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

class OneHotEncoder(SklearnOneHotEncoder):
    def __init__(self, **kwargs):
        super(OneHotEncoder, self).__init__(**kwargs)
        self.fit_flag = False

    def fit(self, X, **kwargs):
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        sparse_matrix = super(OneHotEncoder, self).transform(X)
        new_columns = self.get_new_columns(X=X)
        d_out = pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)
        return d_out

    def fit_transform(self, X, **kwargs):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f'{column}_<{self.categories_[i][j]}>')
                j += 1
        return new_columns

### One-Hot Encoding All Categorical data

In [None]:
encoder = OneHotEncoder()

categories = ['BestFoodSecMonth','WorstFoodSecMonth','Head_EducationLevel','HouseholdType','Country']
for i in categories:
    data_enc = encoder.fit_transform(clean_data_cp[[i]])
    clean_data_cp = clean_data_cp.join(data_enc)

clean_data_cp = clean_data_cp.drop(['BestFoodSecMonth','WorstFoodSecMonth','Head_EducationLevel','HouseholdType','Country'],axis =1)


In [None]:
# clean_data_cp.to_pickle('onehot_encoded_clean_data.pkl')

### Cyclic Encoding for Month Data and One-hot Encoding for the rest of the data

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
clean_data_cp['BestFoodSecMonth']= le.fit_transform(clean_data_cp['BestFoodSecMonth'])
clean_data_cp['WorstFoodSecMonth']= le.fit_transform(clean_data_cp['WorstFoodSecMonth'])


def cyclical_encoder(data, column):    
    months_in_year = 12
    sin_name = 'sin' + column
    cos_name = 'cos' + column
    data[sin_name] = np.sin(2*np.pi*data[column]/months_in_year)
    data[cos_name] = np.cos(2*np.pi*data[column]/months_in_year)
    data.drop(column, axis=1, inplace=True)
    data.head()

    
cyclical_encoder(clean_data_cp,'BestFoodSecMonth')
cyclical_encoder(clean_data_cp,'WorstFoodSecMonth')

categories = ['Head_EducationLevel','HouseholdType','Country']
for i in categories:
    data_enc =  pd.DataFrame(encoder.fit_transform(clean_data_cp[[i]]))
    clean_data_cp = clean_data_cp.join(data_enc)

clean_data_cp = clean_data_cp.drop(['Head_EducationLevel','HouseholdType','Country'],axis =1 )


In [None]:
plt.scatter(clean_data_cp['sinBestFoodSecMonth'], clean_data_cp['cosBestFoodSecMonth'])
plt.show()

In [None]:
clean_data_cp.to_pickle('Cyclical_encoded_data.pkl')

# Imputation

#### Columns with missing Data and count 

In [None]:
missing_data = pd.DataFrame(clean_data[clean_data.columns[clean_data.isnull().any()]].isnull().sum()/len(clean_data)*100)

In [None]:
names = []
for i in range(len(missing_data)):
        names.append(missing_data.iloc[i].name)
values = []
for i in range(len(missing_data)):
        values.append(missing_data.iloc[i][0])

data_1 = {'Features': names,'Missing Data Percentage': values}

In [4]:
#plot bar chart  of missing data 

# Dictionary loaded into a DataFrame       

df = pd.DataFrame(data=data_1)

 

# Draw a vertical bar chart

df.plot.bar(x="Features", y="Missing Data Percentage", title="Features with Missing Data",figsize=(10,6))
plt.show(block=True)

NameError: name 'data_1' is not defined

In [5]:
data_pre = pd.read_pickle('preprocessed_data.pkl')

In [1]:
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit_transform(data_pre)

NameError: name 'IterativeImputer' is not defined

### ListWise/Case Deletion 

In [209]:
list_wise_data = data_pre.copy()
list_wise_data.dropna(inplace = True)
percetange_observation_left = (len(list_wise_data)/ len(data_pre))*100

In [210]:
percentage_observation_left

0.24910434393193015

In [212]:
list_wise_data.head()

Unnamed: 0,YEAR,ITERATION,ID_HH,RHoMIS_ID,GPS_LAT,GPS_LON,GPS_ALT,Altitude,HHsizemembers,HHsizeMAE,...,Type_couple_polygamous,Type_couple_woman_works_away,Type_man_single,Type_nonparenthead,Type_other,Type_polygamous,Type_single,Type_together,Type_woman_single,Type_workaway
32480,2019.0,1,UG_2019_NT1_2_1,UG_2019_NT1_2_1,1.39,31.34,1173.67,1173.671341,2.0,1.33,...,0,0,0,0,0,0,0,0,1,0
32507,2019.0,1,UG_2019_NT1_29_1,UG_2019_NT1_29_1,1.38,31.34,1161.7,1161.702998,10.0,6.65,...,0,0,0,0,0,0,0,0,1,0
32512,2019.0,1,UG_2019_NT1_34_1,UG_2019_NT1_34_1,1.38,31.32,1140.2,1140.19834,10.0,6.78,...,0,0,0,0,0,0,0,0,0,0
32513,2019.0,1,UG_2019_NT1_35_1,UG_2019_NT1_35_1,1.39,31.32,1131.14,1131.135762,9.0,5.79,...,0,0,0,0,0,0,0,0,1,0
32514,2019.0,1,UG_2019_NT1_36_1,UG_2019_NT1_36_1,1.38,31.32,1121.58,1121.581565,7.0,4.85,...,0,0,0,0,0,0,0,0,0,0
