### Napsat funkci, která přijímá jako parametry data, která obsahují chybějící hodnoty, a metodu zpracování chybějících hodnot. Funkce optimalizuje parametry každé metody pro minimum kritéria mse (n_neighbors v prvním případě a max_iter ve druhém), poté z obou metod vybere tu nejlepší a vrátí data bez chybějících hodnot.

In [4]:
import pandas as pd
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def mice(data, max_iter):
    mice = IterativeImputer(max_iter=max_iter, random_state=123)

    sleep_mice_imputed = pd.DataFrame(mice.fit_transform(data),
                                    columns=data.columns)
    
    y = sleep_mice_imputed['total_sleep']
    sleep_mice_imputed.loc[:,'max_life_span_2'] = sleep_mice_imputed['max_life_span']**2
    sleep_mice_imputed.loc[:,'gestation_time_2'] = sleep_mice_imputed['gestation_time']**2
    X = sleep_mice_imputed[['max_life_span','max_life_span_2','gestation_time','gestation_time_2']]

    model = LinearRegression().fit(X, y)

    y_pred = model.predict(X)

    mse = mean_squared_error(y, y_pred)

    return round(mse,2), sleep_mice_imputed


def knn_method(data, n_neighbors):
    knn = KNNImputer(n_neighbors=n_neighbors)

    sleep_knn_imputed = pd.DataFrame(knn.fit_transform(data),
                                   columns=data.columns)
    
    y = sleep_knn_imputed['total_sleep']
    sleep_knn_imputed.loc[:,'max_life_span_2'] = sleep_knn_imputed['max_life_span']**2
    sleep_knn_imputed.loc[:,'gestation_time_2'] = sleep_knn_imputed['gestation_time']**2
    X = sleep_knn_imputed[['max_life_span','max_life_span_2','gestation_time','gestation_time_2']]

    model = LinearRegression().fit(X, y)

    # Predict the values
    y_pred = model.predict(X)

    mse = mean_squared_error(y, y_pred)

    return round(mse,2), sleep_knn_imputed

def knn_vs_mice(data, max_iter, k_neighbors):
    knn_list = []
    mice_list = []
    for i in range(1, k_neighbors):
        mse, imputed_data = knn_method(data, i)
        knn_list.append((mse, i, imputed_data))

    for j in range(1, max_iter):
        mse, imputed_data = mice(data, j)
        mice_list.append((mse, j, imputed_data))

    print(f"Seznam Mice: {mice_list}")
    print(f"Seznam Mice: {knn_list}")

    min_mice_mse, mice_iter, mice_imputed_data = min(mice_list, key=lambda x: x[0])
    min_knn_mse, knn_neighbors, knn_imputed_data = min(knn_list, key=lambda x: x[0])

    if min_mice_mse < min_knn_mse:
        return mice_iter, min_mice_mse, mice_imputed_data
    elif min_mice_mse > min_knn_mse:
        return knn_neighbors, min_knn_mse, knn_imputed_data
    else:
        return mice_iter, min_mice_mse, mice_imputed_data

df_sleep = pd.read_csv('dataset_sleep.csv')
df_sleep['max_life_span'] = pd.to_numeric(df_sleep['max_life_span'], errors='coerce')
df_sleep['gestation_time'] = pd.to_numeric(df_sleep['gestation_time'], errors='coerce')
df_sleep['total_sleep'] = pd.to_numeric(df_sleep['total_sleep'], errors='coerce')

s = knn_vs_mice(df_sleep, 2, 5)
xxx = pd.DataFrame(s)
xxx.isnull()

Seznam Mice: [(10.64, 1,     body_weight  brain_weight  max_life_span  gestation_time  predation_index   
0      6654.000        5712.0      38.600000           645.0              3.0  \
1         1.000           6.6       4.500000            42.0              3.0   
2         3.385          44.5      14.000000            60.0              1.0   
3         0.920           5.7      12.254055            25.0              5.0   
4      2547.000        4603.0      69.000000           624.0              3.0   
..          ...           ...            ...             ...              ...   
57        2.000          12.3       7.500000           200.0              3.0   
58        0.104           2.5       2.300000            46.0              3.0   
59        4.190          58.0      24.000000           210.0              4.0   
60        3.500           3.9       3.000000            14.0              2.0   
61        4.050          17.0      13.000000            38.0              3.0   

  



Unnamed: 0,0
0,False
1,False
2,False
