# Used to examine and develop methods for feature imputation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from random import randint
import pickle

In [2]:
data_path = '/home/jovyan/work/data/train.csv'
df = pd.read_csv(data_path)

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

### First Build columns to impute the Age

In [6]:
gender = {
    'male': 1,
    'female':0
}

embark = {
    'S': 1,
    'C':2,
    'Q':3
}

df['Sex'] = df['Sex'].apply(lambda x: gender.get(x))
df['Embarked'] = df['Embarked'].apply(lambda x: embark.get(x))

In [7]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,2.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,1.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,1.0


<strong> As a baseline for performance for assessing the age imputation methods I am going to impute a random value between the min and max. </strong>

In [8]:
def rand_age(cell_value, age_min, age_max):
    if np.isnan(cell_value) == True:
        return randint(age_min, age_max)
    else:
        return cell_value

In [9]:
age_min = round(df['Age'].min())
age_max = df['Age'].max()
df['imp_age_rand'] = df['Age'].apply(rand_age, args=(age_min, age_max))

<strong> I am going to build two new columns to impute the median and mean age. </strong>

In [10]:
df['imp_age_mean'] = df['Age']
df['imp_age_median'] = df['Age']
df['imp_age_median'].fillna((df['imp_age_mean'].median()), inplace=True)
df['imp_age_mean'].fillna((df['imp_age_mean'].mean()), inplace=True)

##  Train model to impute age both Deterministic and Stochastic linear models

In [11]:
df2 = df.copy()
drop_list = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'imp_age_rand', 'imp_age_mean', 'imp_age_median']
df2 = df2.drop(drop_list, 1)
df2.dropna(inplace=True)
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,1.0
1,1,1,0,38.0,1,0,71.2833,2.0
2,1,3,0,26.0,0,0,7.925,1.0
3,1,1,0,35.0,1,0,53.1,1.0
4,0,3,1,35.0,0,0,8.05,1.0


In [12]:
df2.isnull().any()

Survived    False
Pclass      False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Embarked    False
dtype: bool

In [26]:
#Create the test and train data sets
x = df2.drop(['Age', 'Survived'], axis=1)
y = df2['Age']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)

<strong> Build the Deterministic Linear Model First </strong>

In [27]:
lin_model = LinearRegression()
lin_model.fit(x_train, y_train)

LinearRegression()

In [28]:
y_pred = lin_model.predict(x_test)
# The coefficients
print('Coefficients: \n', lin_model.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The root mean squared error
rms = sqrt(mean_squared_error(y_test, y_pred))
print(f'The Root Mean squared error: {rms}')
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

Coefficients: 
 [-6.57754312  4.35156865 -4.07885134 -0.59404415 -0.00907026 -0.92198198]
Mean squared error: 182.64
The Root Mean squared error: 13.514511896462773
Coefficient of determination: 0.17


In [29]:
#Save the Model
filename = '/home/jovyan/work/src/app/assets/models/data_preprocessing_models/age_lin_model_11_30_2020.sav'
pickle.dump(lin_model, open(filename, 'wb'))

<strong> Now add a new column for the Deterministic Linear Regression </strong>

In [30]:
df2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,1.0
1,1,1,0,38.0,1,0,71.2833,2.0
2,1,3,0,26.0,0,0,7.925,1.0
3,1,1,0,35.0,1,0,53.1,1.0
4,0,3,1,35.0,0,0,8.05,1.0


In [31]:
def lin_model_age(df):
    if np.isnan(df['Age']) == True:
        xnew = [[df['Pclass'], df['Sex'], df['SibSp'],df['Parch'], df['Fare'], df['Embarked']]]
        pred_age = lin_model.predict(xnew)[0]
        return pred_age
    else:
        return df['Age']

In [32]:
#Note df2 was used to train the model, we are now applying this model to df1
df['imp_age_det'] = df.apply(lin_model_age, axis=1)

<strong> Now to add some randomness to the linear age regression we will use a stochastic method. </strong>

In [33]:
def sto_lin_model_age(df, std_error):
    if np.isnan(df['Age']) == True:
        xnew = [[df['Pclass'], df['Sex'], df['SibSp'],df['Parch'], df['Fare'], df['Embarked']]]
        pred_age = lin_model.predict(xnew)[0]
        sto_age = np.random.normal(loc=pred_age, scale=std_error)
        return(sto_age)
    
    else:
        return df['Age']

In [34]:
std_error = df['Age'].sem(axis=0, )
std_error = [getattr(std_error, "tolist", lambda: std_error)()]
df['imp_age_det'] = df.apply(sto_lin_model_age, args= (std_error), axis=1)

In [35]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,imp_age_rand,imp_age_mean,imp_age_median,imp_age_det
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,1.0,22.0,22.0,22.0,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,2.0,38.0,38.0,38.0,38.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,1.0,26.0,26.0,26.0,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,1.0,35.0,35.0,35.0,35.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,1.0,35.0,35.0,35.0,35.0
5,6,0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,,3.0,77.0,29.699118,28.0,26.853194
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,1.0,54.0,54.0,54.0,54.0
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,1.0,2.0,2.0,2.0,2.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,,1.0,27.0,27.0,27.0,27.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,,2.0,14.0,14.0,14.0,14.0


In [93]:
df.isnull().any()

PassengerId       False
Survived          False
Pclass            False
Name              False
Sex               False
Age                True
SibSp             False
Parch             False
Ticket            False
Fare              False
Cabin              True
Embarked           True
imp_age_rand      False
imp_age_mean      False
imp_age_median    False
imp_age_det       False
dtype: bool

### With the imputation models complete and saved these will be used in the data_preprocessing_final notebook to train the primary prediction models