# Kaggle Submission

We process all of the data in `test.csv` from the raw data folder. The following steps are performed to prepare the submission:

1. Data cleaning
2. Age imputation
3. Survival Prediction
4. Data merging
5. Saving data to file

# Library Imports

In [9]:
import pickle
import os
import pandas as pd

# Data loading and cleaning

In [10]:
# The project root is the parent of the notebook's current working directory;
# both data folders live under <project_root>/data.
project_root = os.path.join(os.getcwd(), os.pardir)
raw_data_dir, processed_data_dir = (
    os.path.join(project_root, 'data', stage) for stage in ('raw', 'processed')
)

# Read the unmodified test set from the raw folder.
raw_test_path = os.path.join(raw_data_dir, 'test.csv')
raw_df = pd.read_csv(raw_test_path)

# Read the already-processed version of the same file from the processed folder.
processed_test_path = os.path.join(processed_data_dir, 'test.csv')
proc_df = pd.read_csv(processed_test_path)

In [11]:
# Preview the first two rows of the raw test set.
raw_df.head(n=2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [12]:
# Preview the first two rows of the processed test set.
proc_df.head(n=2)

Unnamed: 0,Age,SibSp,Parch,Fare,male,Q,S,2,3,Master,Miss,Mr,Mrs,Other,Rev
0,34.5,0,0,7.8292,1,1,0,0,1,0,0,1,0,0,0
1,47.0,1,0,7.0,0,0,1,0,1,0,0,0,1,0,0


In [22]:
# The processed test set contains a null Fare. Impute it with the mean Fare of
# the rows whose '3' indicator column is 1 (presumably the 3rd-class dummy —
# TODO confirm the passenger with the missing fare belongs to that class).
# Series.mean() skips NaN by default, so the null itself does not affect the mean.
avg_fare = proc_df.loc[proc_df['3'] == 1, 'Fare'].mean()

# Write to the 'Fare' column only. Assigning through a bare boolean row mask
# (proc_df[mask] = value) would overwrite EVERY column of the matching rows
# with the fare value, corrupting the other features fed to the model.
missing_fare = proc_df['Fare'].isnull()
proc_df.loc[missing_fare, 'Fare'] = avg_fare

# Load Models and Predict

In [13]:
# Directory that holds the pickled model, under the project root.
model_root = os.path.join(project_root, 'models')
voting_model_path = os.path.join(model_root, 'voting.pickle')

# Deserialize the stored estimator. pickle.load can execute arbitrary code,
# so only load model files that come from a trusted source.
with open(voting_model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Show which estimator class was restored.
print(type(model))

<class 'sklearn.ensemble.voting_classifier.VotingClassifier'>


In [24]:
# Run the loaded model over every row of the processed test set.
# NOTE(review): assumes proc_df's columns (names and order) match the features
# the model was fitted on — confirm against the training notebook.
predictions = model.predict(proc_df)

# Data merging

In [29]:
# Build the submission frame: passenger ids come from the raw file, labels from
# the model. The pairing is positional, so it relies on raw_df and proc_df
# having their rows in the same order.
output_df = pd.DataFrame(
    dict(PassengerId=raw_df['PassengerId'], Survived=predictions)
)

In [30]:
# Preview the first five rows of the submission frame.
output_df.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


# Save data

In [31]:
# Destination folder for the submission file, under the project's data directory.
submit_root = os.path.join(project_root, 'data', 'submit')

# to_csv does not create missing parent directories; make sure the folder
# exists so the notebook also runs on a fresh checkout.
os.makedirs(submit_root, exist_ok=True)

# index=False keeps the file to exactly the PassengerId and Survived columns.
output_df.to_csv(os.path.join(submit_root, 'predicted.csv'), index=False)