## Training and Testing data for analysis
Here we generate the training and testing data for the analysis: 
the standardized training data, the resampled training data, and the standardized testing data. 


_basic feature engineering_

here we create new variables: 

*FCSStaus_lag*: regional average food-insecure proportion from UNHS 2016. 1146 observations are NA because of district-level mismatches; these are imputed with the mode. 

*FoodInsecureMonthly_lag*: monthly average food insecure proportion for UNHS 2016. 

Other discrete variables are coded as frequency or dummy variables. We also add log-transformed versions of variables with skewed distributions. 

_imputation_

We use mode imputation to keep the number of observations large and to avoid errors during feature engineering. The advantage of this approach is that we keep as much information as possible in the analysis; the disadvantage is that it may introduce some noise. 


In [None]:
import pandas as pd
import numpy as np
import pickle 

# Opt in to pandas' future behavior of not silently downcasting object columns.
pd.set_option('future.no_silent_downcasting', True)

# Household survey extracts; data2016 is only used below to build lagged features.
data2019 = pd.read_csv("./bld/datasets/data2019.csv")
data2016 = pd.read_csv("./bld/datasets/data2016.csv")

# Columns whose missing values are replaced by the column mode.
# ShareToilet and Salt are dummies.
individual = [
    'HouseType', 'RoofType', 'WaterSource', 'DistDrinkingWater', 'ShareToilet',
    'Income', 'Kind.Income.Ratio', 'Salt', 'SubjectivePoverty', 'RelLivStandard',
    'IncomeStab', 'LivStandChange', 'MaleRatio', 'AvgAge', 'SelfArg', 'SelfHerd',
    'OwnNow_ArgLand', 'valueNow_ArgLand', 'Own1yrAgo_ArgLand', 'ValueAgo_ArgLand',
    'OwnNow_TV', 'valueNow_TV', 'Own1yrAgo_TV', 'ValueAgo_TV',
    'OwnNow_FixPhone', 'valueNow_FixPhone', 'Own1yrAgo_FixPhone', 'ValueAgo_FixPhone',
    'OwnNow_MobilePhone', 'valueNow_MobilePhone', 'Own1yrAgo_MobilePhone', 'ValueAgo_MobilePhone',
    'OwnNow_Refrigerator', 'valueNow_Refrigerator', 'Own1yrAgo_Refrigerator', 'ValueAgo_Refrigerator',
    'OwnNow_Furniture', 'valueNow_Furniture', 'Own1yrAgo_Furniture', 'ValueAgo_Furniture',
    'OwnNow_Cooker', 'valueNow_Cooker', 'Own1yrAgo_Cooker', 'ValueAgo_Cooker',
    'OwnNow_Livestock', 'valueNow_Livestock', 'Own1yrAgo_Livestock', 'ValueAgo_Livestock',
    'valueNowTotal', 'ValueAgoTotal', 'FamilySize', 'SelfStapleTypes',
]

# Mode imputation, one column at a time. mode() can return several values when
# there are ties; the first one is used.
for column in individual:
    most_frequent = data2019[column].mode()[0]
    data2019.fillna({column: most_frequent}, inplace=True)


# Lagged monthly mean FCS: take the distinct (month, value) pairs from the 2016
# data and left-join them onto the 2019 rows by month.
data2016.rename(columns={'mean_FCS': 'FoodInsecureMonthly_lag'}, inplace=True)
unique_month_mean_FCS = data2016.loc[:, ['month', 'FoodInsecureMonthly_lag']].drop_duplicates()
data2019 = data2019.merge(unique_month_mean_FCS, how='left', on='month', suffixes=('', '_unique'))

# Lagged regional mean FCS: average the 2016 'FCSStaus' per upper-cased 's1aq2a'
# value and look it up for each 2019 row through its own 's1aq2a'.
data2016['dismerge'] = data2016['s1aq2a'].str.upper()
mg2016 = data2016.groupby('dismerge')[['FCSStaus']].mean().reset_index()
mg2016.rename(columns={'dismerge': 's1aq2a', 'FCSStaus': 'FCSStaus_lag'}, inplace=True)

# The column assignment aligns on the index; this relies on data2019 carrying the
# fresh 0..n-1 index produced by the month merge above.
regional_lookup = data2019.merge(mg2016, how='left', on='s1aq2a')
data2019['FCSStaus_lag'] = regional_lookup['FCSStaus_lag']

# Rows without a 2016 match are imputed with the mode.
lag_mode = data2019['FCSStaus_lag'].mode()[0]
data2019.fillna({'FCSStaus_lag': lag_mode}, inplace=True)

# One-hot encode the categorical answers in two passes (the intermediate frame
# wkd2019 is kept as its own name).
wkd2019 = pd.get_dummies(data2019, columns=["ShareToilet", "Salt"])
wkd2019_2 = pd.get_dummies(wkd2019, columns=["IncomeStab", 'SubjectivePoverty', 'RelLivStandard', 'LivStandChange'])

# frequency encoding
freq = ['HouseType', 'RoofType', 'WaterSource']

# Binary indicator for the drinking-water distance band: 0 for '0-3', 1 for every
# farther band. infer_objects() turns the replaced object column into a numeric one.
wkd2019_2['DistDrinkingWaterBig3'] = wkd2019_2['DistDrinkingWater'].replace({'0-3': 0, "3-5": 1, "5-8": 1, '8 or more KMs': 1})
wkd2019_2['DistDrinkingWaterBig3'] = wkd2019_2['DistDrinkingWaterBig3'].infer_objects(copy=False)

for j in freq:
    # Share of all rows falling in each category of column j.
    freqf = wkd2019_2.groupby(j).size() / len(wkd2019_2)
    # Vectorized lookup of each row's category share; replaces a per-row Python
    # lambda and yields the same values for every category present in freqf.
    wkd2019_2[f"{j}_feq"] = wkd2019_2[j].map(freqf)
    
# Aggregate each climate/conflict indicator over the current value and its four lags.

# NOTE(review): despite its name, 'fatalitiesMean' is the row-wise SUM of the five
# fatalities columns, whereas the other aggregates below are means. Confirm that
# this is intended before changing it.
fatalities_window = ['fatalities', 'fatalities.lag1', 'fatalities.lag2', 'fatalities.lag3',
                     'fatalities.lag4']
wkd2019_2['fatalitiesMean'] = wkd2019_2[fatalities_window].sum(axis=1)

# Target column -> the five source columns averaged row-wise.
window_mean_sources = {
    'temperatureMean': ['temperature', 'temperature.lag1',
                        'temperature.lag2', 'temperature.lag3', 'temperature.lag4'],
    'precipitationMean': ['precipitation', 'precipitation.lag1', 'precipitation.lag2',
                          'precipitation.lag3', 'precipitation.lag4'],
    'NDVIMean': ['NDVI', 'NDVI.lag1', 'NDVI.lag2', 'NDVI.lag3', 'NDVI.lag4'],
    'NDVI.Anomaly.Mean': ['NDVI.Anomaly',
                          'NDVI.Anomaly.lag1', 'NDVI.Anomaly.lag2', 'NDVI.Anomaly.lag3',
                          'NDVI.Anomaly.lag4'],
}
for target_column, source_columns in window_mean_sources.items():
    wkd2019_2[target_column] = wkd2019_2[source_columns].mean(axis=1)

# Log-transformed copies of skewed variables. Each new column is 10 * log(x + 1),
# stored next to the original under a suffixed name.

# Household value/income columns -> '<name>_new'.
value_list = [
    'valueNow_ArgLand', "Income",
    'valueNow_TV', 'valueNow_FixPhone', 'valueNow_MobilePhone',
    'valueNow_Refrigerator', 'valueNow_Furniture',
    'valueNow_Cooker', 'valueNow_Livestock', 'valueNowTotal', 'ValueAgoTotal',
]
value_list_new = [f'{column}_new' for column in value_list]
wkd2019_2[value_list_new] = np.log(wkd2019_2[value_list].add(1)).mul(10)

# NOTE(review): FamilySize is logged WITHOUT the +1 shift used everywhere else,
# so a value of 0 would give -inf. Presumably FamilySize is always >= 1; confirm.
wkd2019_2['FamilySize_new'] = np.log(wkd2019_2['FamilySize']).mul(10)
wkd2019_2['SelfStapleTypes_new'] = np.log(wkd2019_2['SelfStapleTypes'].add(1)).mul(10)

# Regional / environmental columns -> '<name>_log'.
macrologList = [
    'NL_District', 'NL_County', 'fatalitiesMean', 'temperatureMean',
    'precipitationMean', 'NDVIMean', 'NDVI.Anomaly.Mean', 'Kind.Income.Ratio',
    'Average..mm.', 'X1.Month.Anomaly....', 'X3.Months.Anomaly....',
    'MaleRatio', 'AvgAge',
]
macrologList_new = [f'{column}_log' for column in macrologList]
wkd2019_2[macrologList_new] = np.log(wkd2019_2[macrologList].add(1)).mul(10)

# Split the engineered data into one frame per calendar month of 2019 and 2020.
# Keys are "<year>_<month>" (month not zero-padded); months without any rows get
# no entry at all.
time_splitted_data_1920 = {}
for y in [2019, 2020]:
    for m in range(1, 13):
        # Filter once and reuse the result for both the emptiness check and the
        # stored frame (the same query previously ran twice per month).
        month_rows = wkd2019_2.query("year == @y and month == @m")
        if month_rows.shape[0] != 0:
            time_splitted_data_1920[f"{y}_{m}"] = month_rows.reset_index(drop=True)
        
# Model predictors (includes the lagged FCS variables FCSStaus_lag and
# FoodInsecureMonthly_lag).
predictorList = ['FCSStaus_lag', 'urban','NL_District_log', 'FoodInsecureMonthly_lag', 
       'precipitationMean', 'NDVI.Anomaly.Mean',
       'Average..mm.',  'X1.Month.Anomaly....', 'X3.Months.Anomaly....', 
       'fatalitiesMean_log','temperatureMean_log','NDVIMean_log',
       'Kind.Income.Ratio','MaleRatio', 'AvgAge', 'SelfArg', 'SelfHerd', 'ShareToilet_Yes', 'Salt_Yes',
       'HouseType_feq', 'RoofType_feq', 'WaterSource_feq', 'IncomeStab_Somewhat stable',
       'IncomeStab_Very unstable',
       'SubjectivePoverty_Neither poor nor rich', 'SubjectivePoverty_Poor',
       'SubjectivePoverty_Very poor', 'RelLivStandard_Better off',
       'RelLivStandard_Same', 'RelLivStandard_Worse off',
       'LivStandChange_Decreased', 'LivStandChange_Increased',
       'LivStandChange_Stayed at the same', 'DistDrinkingWaterBig3', 'FamilySize', 
       'SelfStapleTypes', 'valueNow_MobilePhone', 'valueNowTotal', 'valueNow_Furniture', 
        'valueNow_MobilePhone_new', 'valueNow_ArgLand_new', 'valueNow_ArgLand', 'valueNow_Livestock', 
       'valueNow_Furniture_new', 'valueNow_Livestock_new', 'valueNowTotal_new', 'Income_new', 'Income', 
       'ValueAgoTotal_new', 'valueNow_FixPhone',  
       'valueNow_Refrigerator']

# Predictors that are 0/1 indicators.
binaryList = ['urban','Salt_Yes','ShareToilet_Yes','SelfArg', 'SelfHerd', 'IncomeStab_Somewhat stable',
       'IncomeStab_Very unstable','SubjectivePoverty_Neither poor nor rich', 'SubjectivePoverty_Poor',
       'SubjectivePoverty_Very poor', 'RelLivStandard_Better off',
       'RelLivStandard_Same', 'RelLivStandard_Worse off',
       'LivStandChange_Decreased', 'LivStandChange_Increased', 
       'LivStandChange_Stayed at the same','DistDrinkingWaterBig3']

# Frequency-encoded predictors, and predictors explicitly exempt from
# standardization (currently none).
freqList = [f"{i}_feq" for i in freq]
nonstdList = []

# Predictors to standardize: everything in predictorList that is not binary,
# frequency-encoded or exempt. The list is built by filtering predictorList rather
# than by set subtraction, because converting a set of strings to a list gives an
# order that can change between interpreter sessions (string hash randomization).
# dict.fromkeys drops duplicates while keeping first-seen order, so the contents
# match the former set-based result.
_not_standardized = set(binaryList) | set(freqList) | set(nonstdList)
standardizationList = [col for col in dict.fromkeys(predictorList) if col not in _not_standardized]

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
import shap
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import  make_scorer
import matplotlib.pyplot as plt
from sklearn import metrics
from python_functions.modues import StandardizerTrainTest, resampling, standardiza_resample_data_reindex
from python_functions.modues import generate_traintest_data_by_time, standardiza_resample_data # here put train test splitting function 
from sklearn.utils import resample

# Build the base train/test dictionaries from the month-by-month frames.
# The second argument names the region-level COVID indicator column used for the
# split; None (the "_non" variant) means the data is not split by region, i.e. it
# is used like the original data.
# For the regional variants, per the notes in this notebook: training keeps the
# households whose indicator is not 2 and whose covid flag is 0, and testing keeps
# the households whose indicator is 2.
# NOTE(review): that rule is implemented inside generate_traintest_data_by_time,
# which is not visible here -- confirm against python_functions.modues.
trainData_non, testData_non = generate_traintest_data_by_time(time_splitted_data_1920, None)
trainData_district, testData_district = generate_traintest_data_by_time(time_splitted_data_1920, 'District_covid')
trainData_county, testData_county = generate_traintest_data_by_time(time_splitted_data_1920, 'county_covid')
trainData_subcounty, testData_subcounty = generate_traintest_data_by_time(time_splitted_data_1920, 'subcounty_covid')
# Filter non-zero rows for testData and keep specific trainData
def filter_non_zero_rows(data_dict, train_keys=('train_2', 'train_3', 'train_4', 'train_5', 'train_10')):
    """Select the usable entries of a train or test dictionary.

    Entries whose key contains ``'test'`` are kept only when their frame has at
    least one row. Entries whose key is listed in ``train_keys`` are always kept,
    even when empty. Every other entry is dropped.

    Parameters
    ----------
    data_dict : dict
        Mapping from a string key (for example ``'test_3'`` or ``'train_10'``) to
        an object exposing ``shape``, such as a ``pandas.DataFrame``.
    train_keys : iterable of str, optional
        Non-test keys to keep. Defaults to the keys that were previously
        hard-coded, so existing single-argument calls behave as before. A key
        that contains ``'test'`` is always handled by the test rule, even if it
        is also listed here.

    Returns
    -------
    dict
        A new dictionary (the input is not modified) holding the selected
        entries in their original order; the values are the same objects.
    """
    kept_train_keys = frozenset(train_keys)
    filtered_data = {}
    for key, df in data_dict.items():
        if 'test' in key:
            # Test frames are only useful when they contain at least one row.
            if df.shape[0] != 0:
                filtered_data[key] = df
        elif key in kept_train_keys:
            filtered_data[key] = df
    return filtered_data

# Run the same filter over the test dictionaries of every region level ...
(testData_district_filtered,
 testData_county_filtered,
 testData_subcounty_filtered) = map(
    filter_non_zero_rows,
    (testData_district, testData_county, testData_subcounty),
)

# ... and over the matching train dictionaries.
(trainData_district_filtered,
 trainData_county_filtered,
 trainData_subcounty_filtered) = map(
    filter_non_zero_rows,
    (trainData_district, trainData_county, trainData_subcounty),
)



_Train test split_

Here *StdTrain_district* contains all pre-Covid data, split and added up by month; it only contains households with District_covid != 2 and covid == 0, to avoid data leakage. 

*StdTest_district* contains all during-Covid monthly and total testing data; it only contains households with District_covid == 2 and covid == 1. 

The other datasets follow the same rules, and the _non datasets are the same as the original datasets. 

The only difference between *StdTrain_non* and *StdTrain_region* is that *StdTrain_non* contains the updated training data from the Covid period. 

The only difference between *StdTest_non* and *StdTest_region* is that *StdTest_non* contains the regions that overlap before and during Covid. 

In [None]:
# Generate time series data for different region levels: during COVID data.
# Each call returns, in order: standardized train, standardized test, and the
# SMOTE, ADASYN, SMOTETomek and SMOTEENN train variants (as the target names show).
# The feature-list arguments are identical for every call, so bundle them once.
feature_list_args = (predictorList, standardizationList, binaryList, freqList, nonstdList)

(StdTrain_non, StdTest_non, SMOTE_Train_non,
 ADASYN_Train_non, SMOTETOM_Train_non, SMOTEENN_Train_non) = standardiza_resample_data(
    trainData_non, testData_non, *feature_list_args)

(StdTrain_district, StdTest_district, SMOTE_Train_district,
 ADASYN_Train_district, SMOTETOM_Train_district, SMOTEENN_Train_district) = standardiza_resample_data(
    trainData_district_filtered, testData_district_filtered, *feature_list_args)

(StdTrain_county, StdTest_county, SMOTE_Train_county,
 ADASYN_Train_county, SMOTETOM_Train_county, SMOTEENN_Train_county) = standardiza_resample_data(
    trainData_county_filtered, testData_county_filtered, *feature_list_args)

(StdTrain_subcounty, StdTest_subcounty, SMOTE_Train_subcounty,
 ADASYN_Train_subcounty, SMOTETOM_Train_subcounty, SMOTEENN_Train_subcounty) = standardiza_resample_data(
    trainData_subcounty_filtered, testData_subcounty_filtered, *feature_list_args)

In [None]:
import os

# Bundle the generated datasets of each region level under their variable names.
data_during_district = {
    'StdTrain_district': StdTrain_district,
    'StdTest_district': StdTest_district,
    'SMOTE_Train_district': SMOTE_Train_district,
    'ADASYN_Train_district': ADASYN_Train_district,
    'SMOTETOM_Train_district': SMOTETOM_Train_district,
    'SMOTEENN_Train_district': SMOTEENN_Train_district,
}
data_during_county = {
    'StdTrain_county': StdTrain_county,
    'StdTest_county': StdTest_county,
    'SMOTE_Train_county': SMOTE_Train_county,
    'ADASYN_Train_county': ADASYN_Train_county,
    'SMOTETOM_Train_county': SMOTETOM_Train_county,
    'SMOTEENN_Train_county': SMOTEENN_Train_county,
}
data_during_subcounty = {
    'StdTrain_subcounty': StdTrain_subcounty,
    'StdTest_subcounty': StdTest_subcounty,
    'SMOTE_Train_subcounty': SMOTE_Train_subcounty,
    'ADASYN_Train_subcounty': ADASYN_Train_subcounty,
    'SMOTETOM_Train_subcounty': SMOTETOM_Train_subcounty,
    'SMOTEENN_Train_subcounty': SMOTEENN_Train_subcounty,
}

# Define the output directory and create it if needed, so that a fresh checkout
# does not fail with FileNotFoundError on the first open().
output_dir = 'bld/datasets/generated/'
os.makedirs(output_dir, exist_ok=True)

# Store each dictionary as '<output_dir><name>.pkl'.
for file_stem, payload in (
    ('data_during_district', data_during_district),
    ('data_during_county', data_during_county),
    ('data_during_subcounty', data_during_subcounty),
):
    with open(f'{output_dir}{file_stem}.pkl', 'wb') as f:
        pickle.dump(payload, f)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the counties found in the 'train_10' county-level training frame,
# so that train and test never share a county.
county_frame = trainData_county_filtered['train_10']
unique_counties = county_frame['County'].unique()

# Split the county identifiers (not the rows) into train and test groups.
train_counties, test_counties = train_test_split(unique_counties, test_size=0.1, random_state=42)

# Assign each row to the side its county landed on.
rows_in_train = county_frame['County'].isin(train_counties)
rows_in_test = county_frame['County'].isin(test_counties)
train_data_county = county_frame[rows_in_train]
test_data_county = county_frame[rows_in_test]

# Keep the predictors plus the outcome and the COVID indicator columns.
kept_columns = predictorList + ['FCSStaus', 'covid', 'District_covid', 'county_covid', 'subcounty_covid']
trainData_county_before = {'train_10': train_data_county[kept_columns]}
testData_county_before = {'test_10': test_data_county[kept_columns]}

# Sanity check: the two groups of identifiers must be disjoint.
overlap = set(train_counties).intersection(test_counties)
if not overlap:
    print("No overlapping elements found.")
else:
    print("Overlapping elements found:", overlap)

No overlapping elements found.


In [None]:
# Hold out 10% of the districts found in the 'train_10' district-level training
# frame, so that train and test never share a district.
# (The *_counties names are reused from the county cell; here they hold districts.)
district_frame = trainData_district_filtered['train_10']
unique_counties = district_frame['District'].unique()

# Split the district identifiers (not the rows) into train and test groups.
train_counties, test_counties = train_test_split(unique_counties, test_size=0.1, random_state=42)

# Assign each row to the side its district landed on.
rows_in_train = district_frame['District'].isin(train_counties)
rows_in_test = district_frame['District'].isin(test_counties)
train_data_district = district_frame[rows_in_train]
test_data_district = district_frame[rows_in_test]

# Keep the predictors plus the outcome and the COVID indicator columns.
kept_columns = predictorList + ['FCSStaus', 'covid', 'District_covid', 'county_covid', 'subcounty_covid']
trainData_district_before = {'train_10': train_data_district[kept_columns]}
testData_district_before = {'test_10': test_data_district[kept_columns]}

# Sanity check: the two groups of identifiers must be disjoint.
overlap = set(train_counties).intersection(test_counties)
if not overlap:
    print("No overlapping elements found.")
else:
    print("Overlapping elements found:", overlap)

No overlapping elements found.


In [None]:
# Hold out 10% of the distinct 's1aq5a' values found in the 'train_10'
# subcounty-level training frame, so that train and test never share one.
# ('s1aq5a' is presumably the sub-county identifier -- confirm against the codebook.
# The *_counties names are reused from the county cell.)
subcounty_frame = trainData_subcounty_filtered['train_10']
unique_counties = subcounty_frame['s1aq5a'].unique()

# Split the identifiers (not the rows) into train and test groups.
train_counties, test_counties = train_test_split(unique_counties, test_size=0.1, random_state=42)

# Assign each row to the side its identifier landed on.
rows_in_train = subcounty_frame['s1aq5a'].isin(train_counties)
rows_in_test = subcounty_frame['s1aq5a'].isin(test_counties)
train_data_subcounty = subcounty_frame[rows_in_train]
test_data_subcounty = subcounty_frame[rows_in_test]

# Keep the predictors plus the outcome and the COVID indicator columns.
kept_columns = predictorList + ['FCSStaus', 'covid', 'District_covid', 'county_covid', 'subcounty_covid']
trainData_subcounty_before = {'train_10': train_data_subcounty[kept_columns]}
testData_subcounty_before = {'test_10': test_data_subcounty[kept_columns]}

# Sanity check: the two groups of identifiers must be disjoint.
overlap = set(train_counties).intersection(test_counties)
if not overlap:
    print("No overlapping elements found.")
else:
    print("Overlapping elements found:", overlap)

No overlapping elements found.


In [None]:
# Generate time series data for different region levels: before COVID data.
# NOTE(review): the inputs are the *_before hold-out splits and the results are
# later stored under data_before_*, yet the result names end in "_during".
# The names are kept unchanged here; confirm which label is intended.
# The feature-list arguments are identical for every call, so bundle them once.
feature_list_args = (predictorList, standardizationList, binaryList, freqList, nonstdList)

(StdTrain_district_during, StdTest_district_during, SMOTE_Train_district_during,
 ADASYN_Train_district_during, SMOTETOM_Train_district_during,
 SMOTEENN_Train_district_during) = standardiza_resample_data_reindex(
    trainData_district_before, testData_district_before, *feature_list_args)

(StdTrain_county_during, StdTest_county_during, SMOTE_Train_county_during,
 ADASYN_Train_county_during, SMOTETOM_Train_county_during,
 SMOTEENN_Train_county_during) = standardiza_resample_data_reindex(
    trainData_county_before, testData_county_before, *feature_list_args)

(StdTrain_subcounty_during, StdTest_subcounty_during, SMOTE_Train_subcounty_during,
 ADASYN_Train_subcounty_during, SMOTETOM_Train_subcounty_during,
 SMOTEENN_Train_subcounty_during) = standardiza_resample_data_reindex(
    trainData_subcounty_before, testData_subcounty_before, *feature_list_args)

In [None]:
import os

# Bundle the generated datasets of each region level under their variable names.
# The "_during" suffix in the keys is kept exactly as is, since readers of the
# pickle files may look entries up by these keys.
data_before_district = {
    'StdTrain_district_during': StdTrain_district_during,
    'StdTest_district_during': StdTest_district_during,
    'SMOTE_Train_district_during': SMOTE_Train_district_during,
    'ADASYN_Train_district_during': ADASYN_Train_district_during,
    'SMOTETOM_Train_district_during': SMOTETOM_Train_district_during,
    'SMOTEENN_Train_district_during': SMOTEENN_Train_district_during,
}
data_before_county = {
    'StdTrain_county_during': StdTrain_county_during,
    'StdTest_county_during': StdTest_county_during,
    'SMOTE_Train_county_during': SMOTE_Train_county_during,
    'ADASYN_Train_county_during': ADASYN_Train_county_during,
    'SMOTETOM_Train_county_during': SMOTETOM_Train_county_during,
    'SMOTEENN_Train_county_during': SMOTEENN_Train_county_during,
}
data_before_subcounty = {
    'StdTrain_subcounty_during': StdTrain_subcounty_during,
    'StdTest_subcounty_during': StdTest_subcounty_during,
    'SMOTE_Train_subcounty_during': SMOTE_Train_subcounty_during,
    'ADASYN_Train_subcounty_during': ADASYN_Train_subcounty_during,
    'SMOTETOM_Train_subcounty_during': SMOTETOM_Train_subcounty_during,
    'SMOTEENN_Train_subcounty_during': SMOTEENN_Train_subcounty_during,
}

# Define the output directory and create it if needed, so that a fresh checkout
# does not fail with FileNotFoundError on the first open().
output_dir = 'bld/datasets/generated/'
os.makedirs(output_dir, exist_ok=True)

# Store each dictionary as '<output_dir><name>.pkl'.
for file_stem, payload in (
    ('data_before_district', data_before_district),
    ('data_before_county', data_before_county),
    ('data_before_subcounty', data_before_subcounty),
):
    with open(f'{output_dir}{file_stem}.pkl', 'wb') as f:
        pickle.dump(payload, f)