# Apply the Random Forest regression technique to control the quality of missing value handling method by calculation of the mean square error (mse) for titanic dataset. Age is a target variable. Enumerate various techniques of missing values handling to select the best method corresponded to a maximum value of data classification accuracy.

In [103]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.linear_model import LinearRegression

In [105]:
data = pd.read_csv("https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv")

X = data.drop(columns='Age')
y = data['Age']
print(X.shape)
print(y.shape)

# Imputace chybějících hodnot s MICE
imputer_MICE = IterativeImputer(random_state=123)
X_MICE = imputer_MICE.fit_transform(X.select_dtypes(include=['float64', 'int64']))

# Imputace chybějících hodnot s KNN
imputer_KNN = KNNImputer()
X_KNN = imputer_KNN.fit_transform(X.select_dtypes(include=['float64', 'int64']))

# Imputace chybějících hodnot s lineární regresí
imputer_linear = IterativeImputer(estimator=LinearRegression())
X_linear = imputer_linear.fit_transform(X.select_dtypes(include=['float64', 'int64']))

# Rozdělení dat na tréninkovou a testovací sadu pro MICE
X_train_MICE, X_test_MICE, y_train_MICE, y_test_MICE = train_test_split(X_MICE, y, test_size=0.3, random_state=123)

# Rozdělení dat na tréninkovou a testovací sadu pro KNN
X_train_KNN, X_test_KNN, y_train_KNN, y_test_KNN = train_test_split(X_KNN, y, test_size=0.3, random_state=123)

# Rozdělení dat na tréninkovou a testovací sadu pro lineární regresi
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X_linear, y, test_size=0.3, random_state=123)

# Vytvoření a trénink Random Forest modelu pro MICE
model_MICE = RandomForestRegressor(random_state=123)
model_MICE.fit(X_train_MICE, y_train_MICE)

# Vytvoření a trénink Random Forest modelu pro KNN
model_KNN = RandomForestRegressor(random_state=123)
model_KNN.fit(X_train_KNN, y_train_KNN)

# Vytvoření a trénink Random Forest modelu pro lineární regresi
model_linear = RandomForestRegressor(random_state=123)
model_linear.fit(X_train_linear, y_train_linear)

# Predikce a výpočet chyby středního kvadratického (MSE) pro MICE
predictions_MICE = model_MICE.predict(X_test_MICE)
mse_MICE = mean_squared_error(y_test_MICE, predictions_MICE)

# Predikce a výpočet chyby středního kvadratického (MSE) pro KNN
predictions_KNN = model_KNN.predict(X_test_KNN)
mse_KNN = mean_squared_error(y_test_KNN, predictions_KNN)

# Predikce a výpočet chyby středního kvadratického (MSE) pro lineární regresi
predictions_linear = model_linear.predict(X_test_linear)
mse_linear = mean_squared_error(y_test_linear, predictions_linear)

# Výpis výsledků MSE pro všechny metody
print("MSE for KNN Imputation:", mse_KNN)
print("MSE for MICE Imputation:", mse_MICE)
print("MSE for Linear Regression Imputation:", mse_linear)

(887, 7)
(887,)
MSE for KNN Imputation: 156.9735896822593
MSE for MICE Imputation: 156.9735896822593
MSE for Linear Regression Imputation: 156.9735896822593
