# Prepare data (just for materialHardship)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Seed NumPy's global random number generator.
np.random.seed(1234)

# Directory that holds the original challenge CSV files. It is defined once so
# the notebook can be pointed at a different location by editing a single line.
# See documentation for more descriptions of the files:
# http://www.fragilefamilieschallenge.org/apply/
DATA_DIR = '/Users/wnowak/gitt/ffc/materialHardship/Data/original_csvs'

print('reading in csv files')
# File read in.
# low_memory=False turns off pandas' chunked parsing, which can otherwise
# produce mixed-type column inference (see the pandas read_csv documentation).
background = pd.read_csv(DATA_DIR + '/background.csv', low_memory=False)
print('read background.csv')
train = pd.read_csv(DATA_DIR + '/train.csv', low_memory=False)
print('read train.csv')
prediction = pd.read_csv(DATA_DIR + '/prediction.csv', low_memory=False)
print('read prediction.csv')
print("Files read")

reading in csv files
read background.csv
read train.csv
read prediction.csv
Files read


Background is super WIDE. We have only 4242 IDs, but almost 13k features!

In [3]:
# Display the dimensions of background as (number of rows, number of columns).
background.shape

(4242, 12943)

OK...so they want predictions for...everything. So we are going to make a **training** set using the rows of background that have non-NA values for materialHardship.

Then we need to make predictions for every challengeID that is either a) not in train, or b) in train but has NA for materialHardship.

In [4]:
# Pull just the ID column and the materialHardship outcome out of train.
all_y_train_materialHardship = train.loc[:, ['challengeID', 'materialHardship']]
# Labelled subset: discard every row that has an NA in either of the two columns.
non_na_y_train_materialHardship = all_y_train_materialHardship.dropna(axis=0, how='any')
non_na_y_train_materialHardship.head(n=5)

Unnamed: 0,challengeID,materialHardship
2,6,0.090909
3,7,0.0
5,9,0.181818
6,10,0.090909
7,13,0.181818


In [5]:
# y_train_ids that are na
# Rows of the train labels whose challengeID does not appear in the
# non-NA (labelled) subset built above.
_labelled_ids = non_na_y_train_materialHardship['challengeID']
_has_label = all_y_train_materialHardship['challengeID'].isin(_labelled_ids)
na_y_train_materialHardship = all_y_train_materialHardship.loc[~_has_label]
na_y_train_materialHardship.head(n=5)

Unnamed: 0,challengeID,materialHardship
0,1,
1,3,
4,8,
14,28,
17,36,


Our training set consists of every row of background whose challengeID has a non-NA materialHardship value.

In [6]:
# Training features: the background rows whose challengeID has a non-NA
# materialHardship label.
_train_ids = non_na_y_train_materialHardship['challengeID']
_is_train_row = background['challengeID'].isin(_train_ids)
df_train = background.loc[_is_train_row]
df_train.head(n=5)

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
5,6,-3,,0,25,,1,,,,...,8.5157,10.558813,-3.0,-3.0,7.022328,-3.0,10.564085,-3,-3.0,10.255825
6,7,-3,,0,35,,1,,,,...,-3.0,-3.0,9.660643,9.861125,-3.0,10.991854,-3.0,-3,10.972726,10.8598
8,9,-3,,0,30,,1,,,,...,-3.0,-3.0,11.689877,9.373199,-3.0,8.194868,-3.0,-3,9.84238,9.566678
9,10,-3,,0,33,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,10.564085,-3,-3.0,10.10587
12,13,-3,,1,-3,,1,,,,...,10.78074,-3.0,-3.0,-3.0,8.944119,-3.0,9.781718,-3,-3.0,9.566678


The prediction set contains all rows in 'background.csv' that are not in df_train.

In [7]:
# Prediction features: every background row whose challengeID is not in df_train.
_is_already_train = background['challengeID'].isin(df_train['challengeID'])
df_prediction = background.loc[~_is_already_train]
# Preview the ten lowest challengeIDs; the sort is for display only and is not stored.
df_prediction.sort_values(by='challengeID').head(n=10)

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
0,1,-3,,-3,40,,0,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,-3.0
1,2,-3,,0,40,,1,,,,...,-3.0,8.473318,-3.0,-3.0,-3.0,-3.0,9.845074,-3,-3.0,9.723551
2,3,-3,,0,35,,1,,,,...,-3.0,-3.0,9.097495,10.071504,-3.0,-3.0,-3.0,-3,-3.0,-3.0
3,4,-3,,0,30,,1,,,,...,-3.0,-3.0,9.512706,10.286578,-3.0,10.677285,-3.0,-3,8.522331,10.608137
4,5,-3,,0,25,,1,,,,...,-3.0,-3.0,11.076016,9.615958,-3.0,9.731979,-3.0,-3,10.115313,9.646466
7,8,-3,,1,10,,1,,,,...,-3.0,10.558813,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,-3.0
10,11,-3,,0,40,,1,,,,...,-3.0,10.558813,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,10.724639
11,12,-3,,0,23,,1,,,,...,11.548771,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,9.566678
14,15,-3,,0,25,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,10.085272,-3,-3.0,10.255825
16,17,-3,,-3,32,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,10.991854,-3.0,-3,10.972726,10.415491


SO, we have the data that we will train on:
    
    - Training features: 'df_train' 
    - Training labels: 'non_na_y_train_materialHardship'

We'll use this model to predict output (materialHardship) values for:

    - Prediction features: 'df_prediction'

### df_train

In [8]:
# Preview the first five rows of the training features.
df_train.head(n=5)

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
5,6,-3,,0,25,,1,,,,...,8.5157,10.558813,-3.0,-3.0,7.022328,-3.0,10.564085,-3,-3.0,10.255825
6,7,-3,,0,35,,1,,,,...,-3.0,-3.0,9.660643,9.861125,-3.0,10.991854,-3.0,-3,10.972726,10.8598
8,9,-3,,0,30,,1,,,,...,-3.0,-3.0,11.689877,9.373199,-3.0,8.194868,-3.0,-3,9.84238,9.566678
9,10,-3,,0,33,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,10.564085,-3,-3.0,10.10587
12,13,-3,,1,-3,,1,,,,...,10.78074,-3.0,-3.0,-3.0,8.944119,-3.0,9.781718,-3,-3.0,9.566678


### non_na_y_train_materialHardship

In [9]:
# Preview the first five rows of the training labels.
non_na_y_train_materialHardship.head(n=5)

Unnamed: 0,challengeID,materialHardship
2,6,0.090909
3,7,0.0
5,9,0.181818
6,10,0.090909
7,13,0.181818


### df_prediction

In [10]:
# Preview the first five rows of the prediction features.
df_prediction.head(n=5)

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
0,1,-3,,-3,40,,0,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,-3.0
1,2,-3,,0,40,,1,,,,...,-3.0,8.473318,-3.0,-3.0,-3.0,-3.0,9.845074,-3,-3.0,9.723551
2,3,-3,,0,35,,1,,,,...,-3.0,-3.0,9.097495,10.071504,-3.0,-3.0,-3.0,-3,-3.0,-3.0
3,4,-3,,0,30,,1,,,,...,-3.0,-3.0,9.512706,10.286578,-3.0,10.677285,-3.0,-3,8.522331,10.608137
4,5,-3,,0,25,,1,,,,...,-3.0,-3.0,11.076016,9.615958,-3.0,9.731979,-3.0,-3,10.115313,9.646466


The number of rows in df_train plus the number of rows in df_prediction should equal the number of rows in background.csv. Let's verify that.

In [11]:
# Sanity check: df_train and df_prediction were selected from background with
# complementary challengeID masks, so together they must account for every row.
n_train_rows = len(df_train)
n_prediction_rows = len(df_prediction)
n_background_rows = len(background)
# Fail loudly on a mismatch instead of only displaying False.
assert n_train_rows + n_prediction_rows == n_background_rows, (
    'df_train (%d rows) + df_prediction (%d rows) != background (%d rows)'
    % (n_train_rows, n_prediction_rows, n_background_rows)
)
# Keep the comparison as the last expression so the cell still displays the result.
n_train_rows + n_prediction_rows == n_background_rows

True

OK. We are off to the races and can reference these pandas dataframes in future data preparation and training notebooks!

In [12]:
# Persist the three prepared frames as pickles, each written to a file named
# after its variable, so later notebooks can load them.
_frames_to_save = (
    ('df_train', df_train),
    ('df_prediction', df_prediction),
    ('non_na_y_train_materialHardship', non_na_y_train_materialHardship),
)
for _pickle_name, _frame in _frames_to_save:
    _frame.to_pickle(_pickle_name)