In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import pingouin as pg
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the Titanic passenger dataset through seaborn.
df = sns.load_dataset("titanic")

In [4]:
# Label encoder for the sex column, fitted on its two string categories.
sex_categories = ["male", "female"]
l_sex = preprocessing.LabelEncoder()
l_sex.fit(sex_categories)

LabelEncoder()

In [5]:
# Replace the sex strings with the integer labels from the fitted encoder.
df["sex"] = l_sex.transform(df["sex"])

In [6]:
# Target is the `alive` column; all remaining columns form the feature frame.
y = df["alive"]
X = df.drop(columns="alive")

# Missing `age` values replaced with the median (assuming MCAR missingness)

In [7]:
# Median imputation of `age` (treats the gaps as missing completely at random).
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# further down, so the median also reflects rows that end up in the test set.
# Pandas-only equivalent: X['age'] = X['age'].fillna(X['age'].median())
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# SimpleImputer works on 2-D input; flatten the (n, 1) result so the column
# receives a plain 1-D array.
age_2d = X['age'].to_numpy().reshape(-1, 1)
X['age'] = imp_median.fit_transform(age_2d).ravel()

In [8]:
# Number of age values still missing after imputation.
X["age"].isna().sum()

0

In [9]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [10]:
# Columns used as predictors for the classifier.
feature_cols = ['age', 'sex', 'fare']

# Fit a logistic-regression classifier on the training split.
regr = LogisticRegression()
regr.fit(X_train[feature_cols], y_train)

# Score the fitted model on the held-out split (shown as the cell output).
regr.score(X_test[feature_cols], y_test)

0.776536312849162

In [11]:
# Mean and median of the age column after median imputation.
age_col = X["age"]
print(age_col.mean(), age_col.median())

29.36158249158249 28.0


# MAR: imputation with a single linear regression

In [3]:
from sklearn.linear_model import LinearRegression
import seaborn as sns

In [12]:
# Start again from the freshly loaded dataset so `age` has its missing values.
df = sns.load_dataset("titanic")

# Fit the label encoder on the two sex categories and encode the column.
l_sex = preprocessing.LabelEncoder().fit(["male", "female"])
df["sex"] = l_sex.transform(df["sex"])

# Target is the `alive` column; all remaining columns form the feature frame.
y = df["alive"]
X = df.drop(columns="alive")

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=4)

In [5]:
# Number of missing age values in the reloaded data.
X["age"].isna().sum()

177

In [6]:
# Check whether the fare column has any missing values.
X["fare"].isna().sum()

0

# Fitting a linear regression that predicts age from fare

In [7]:
# Train only on the rows whose age is recorded.
x = X[X["age"].notna()]

# Single-predictor model: fare is the input (as an (n, 1) array), age the target.
fare_known = x["fare"].to_numpy().reshape(-1, 1)
lreg = LinearRegression()
lreg.fit(fare_known, x["age"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# Rows whose age is missing; these are the values to be estimated.
age_nan = X[X["age"].isna()]

In [10]:
# Predict an age for each row with a missing age, using its fare as input.
fare_missing = age_nan["fare"].to_numpy().reshape(-1, 1)
pred = lreg.predict(fare_missing)

In [11]:
# Assign the predicted ages to the rows where age is missing.
# A single .loc call writes into X itself; the previous chained form
# (X.age[mask] = ...) assigns into an intermediate Series, which under pandas
# Copy-on-Write leaves X unchanged.
X.loc[X['age'].isna(), 'age'] = pred

In [12]:
# Number of age values still missing after the regression-based fill.
X["age"].isna().sum()

0

In [13]:
# Classification using the regression-imputed age feature.
feature_cols = ['age', 'sex', 'fare']

# Hold out 25% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=4,
)

# Fit on the training split, then score on the held-out split (cell output).
regr = LogisticRegression()
regr.fit(X_train[feature_cols], y_train)
regr.score(X_test[feature_cols], y_test)

0.8026905829596412