In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Apply seaborn's default plot theme on top of matplotlib (this does not revert to matplotlib's defaults).
sns.set()

import warnings
# Suppress every warning for the rest of the session; remove this line while debugging.
warnings.filterwarnings("ignore")

## Testing the model on sample data

In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that you produced yourself or otherwise trust.
# The context managers close each file handle as soon as its object is loaded.
with open("Titanic_Model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("Titanic_Encoder.pkl", "rb") as encoder_file:
    encoder = pickle.load(encoder_file)

In [4]:
# One hand-written passenger record; values are positional and are paired with `features` below.
data = [
    2,
    'female',
    30,
    0,
    0,
    6.08,
    'C',
]

In [5]:
# Column names for the sample record, in the same order as the values in `data`.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

In [6]:
# Build the one-row frame straight from the list so every value keeps its own type.
# Routing the mixed int/str/float list through np.array() would coerce all of it to strings.
test_df = pd.DataFrame([data], columns=features)
test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,female,30,0,0,6.08,C


In [7]:
def test_preprocessing(test_df, encoder):
    cat_features = ['Pclass','Sex','Embarked']
    for feature in test_df.columns.values:
        if feature not in cat_features:
            test_df[feature] = test_df[feature].astype('float') #Converting numerical features to float data type
        if feature=="Pclass":
            test_df[feature] = test_df[feature].astype('int64') #Converting Pclass value to int data type

    #One-Hot Encoding the categorical features
    test_encode = encoder.transform(test_df[[i for i in cat_features]])
    categories=[]
    for feature in encoder.categories_:
        for category in feature:
            categories.append(category)

    encoded_df = pd.DataFrame(data=test_encode, columns=categories)

    #Merging the one-ho-encoded values with the test_df
    merged_df = pd.concat([test_df,encoded_df], axis=1)
    #Removing the categorical features from test_df
    test_df_preprocessed = merged_df.drop(cat_features,axis=1)
    return test_df_preprocessed

In [8]:
# Run the single sample row through the preprocessing step, then ask the loaded model for a prediction.
test_df_preprocessed = test_preprocessing(test_df,encoder)
model.predict(test_df_preprocessed)

array([1], dtype=int64)

## Testing the model on Kaggle Test Data

In [9]:
# Load the Kaggle test split; this rebinds test_df, which previously held the one-row sample.
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


#### Preprocessing the test data

In [11]:
# Drop the columns that are not among the model's input features
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']
test_df_1 = test_df.drop(columns=unused_columns)
test_df_1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [12]:
# Count the missing entries in each remaining column
test_df_1.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [30]:
# Prepare for null filling: work on a copy so test_df_1 keeps its original missing values
test_df_nonull = test_df_1.copy()

# Mean Age per passenger class, keyed by the Pclass value
pclass_dict = {}
for pclass in test_df_1['Pclass'].unique():
    in_class = test_df_1['Pclass'] == pclass
    pclass_dict[pclass] = test_df_1.loc[in_class, 'Age'].mean()
    print(f"For Pclass {pclass}: Mean Age: {pclass_dict[pclass]}")

ImportError: cannot import name 'fill_na_age_func' from 'Titanic_test_functions' (D:\DataspellProjects\Data Science\Projects\Titanic-Survival-Prediction\Titanic_test_functions.py)

In [31]:
def fillna_age_test(cols, pclass_dict):
    """Return a passenger's age, substituting the class mean when it is missing.

    Parameters
    ----------
    cols : sequence
        ``cols[0]`` is the age and ``cols[1]`` is the passenger class.
    pclass_dict : mapping
        Mean age for each class, keyed by 1, 2 and 3.

    Returns
    -------
    The original age when it is not null; otherwise the mean age of the
    passenger's class rounded to two decimals. Any class other than 1 or 2
    uses the class-3 mean.
    """
    age, pclass = cols[0], cols[1]

    # Known ages pass through untouched.
    if not pd.isnull(age):
        return age

    # Anything that is not first or second class falls back to the class-3 mean.
    class_key = 1 if pclass == 1 else (2 if pclass == 2 else 3)
    return round(pclass_dict[class_key], 2)

In [35]:
# Fill each missing Age with the mean age of that passenger's Pclass.
# The replacement is computed once per row and the column is assigned from the
# full list of per-row results. Assigning the scalar result to
# test_df_nonull['Age'] inside a row loop would overwrite the whole column on
# every iteration and leave every passenger with the last row's value.
test_df_nonull['Age'] = [
    fillna_age_test([age, pclass], pclass_dict)
    for age, pclass in zip(test_df_1['Age'], test_df_1['Pclass'])
]
# Fill missing Fare values with the column mean.
test_df_nonull['Fare'] = test_df_nonull['Fare'].fillna(test_df_nonull['Fare'].mean())
test_df_nonull.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [36]:
#Preprocessing to make it model-compatible
# Uses the same preprocessing function as the sample row; this rebinds test_df_preprocessed.
test_df_preprocessed = test_preprocessing(test_df_nonull,encoder)

In [37]:
# Assemble the submission frame: passenger ids from the raw test data, predictions from the model
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': model.predict(test_df_preprocessed),
})
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [38]:
#Exporting submission df
# index=False keeps the row index out of the file, so only the frame's own columns are written.
submission_df.to_csv("Titanic-Kaggle-Submission.csv",index=False)