<a href="https://colab.research.google.com/github/yeyevtushenko/AI/blob/Lesson13.03.2024/Lesson13_03_2024_AI_H_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Source CSV for the exercise; the "Unnamed: 0" column becomes the row index
# instead of being kept as a feature.
DATA_URL = "https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module2/data/ship_passengers.csv"

df = pd.read_csv(DATA_URL, index_col="Unnamed: 0")

In [3]:
# Preview the first five rows to check how the columns were parsed.
df.head(n=5)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [4]:
# Keep only the target (`fare`) and the three predictors used in the tasks below.
selected_columns = ['pclass', 'age', 'embarked', 'fare']
df = df[selected_columns]

df.head(n=5)

Unnamed: 0,pclass,age,embarked,fare
0,3,22.0,S,7.25
1,1,38.0,C,71.2833
2,3,26.0,S,7.925
3,1,35.0,S,53.1
4,3,35.0,S,8.05


# Завдання 1
Очистіть дані від викидів (лише `fare`), розділіть на тренувальну та тестову частини

In [5]:
def remove_outliers(df, column_name, factor=1.5):
  """Return the rows of `df` whose `column_name` value lies inside the IQR fences.

  The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, both
  inclusive, with the quartiles computed from the column itself.

  Args:
    df: DataFrame to filter. It is not modified.
    column_name: Name of the numeric column to test.
    factor: Non-negative width of the fences in IQR units. The default of
      1.5 reproduces the previously hard-coded rule.

  Returns:
    A filtered DataFrame that keeps the original index labels. Rows whose
    value is NaN are removed as well, because a NaN comparison is never True.

  Raises:
    ValueError: If `factor` is negative (the fences would be inverted and
      every row would be silently discarded).
  """
  if factor < 0:
    raise ValueError(f"factor must be non-negative, got {factor!r}")

  column = df[column_name]

  # One quantile call for both quartiles; NaN entries are skipped by pandas.
  q1, q3 = column.quantile([0.25, 0.75])
  iqr = q3 - q1

  lower = q1 - factor * iqr
  upper = q3 + factor * iqr

  mask = (column >= lower) & (column <= upper)

  return df[mask]


# Drop rows whose fare falls outside the IQR fences. The fences are recomputed
# from whatever `df` currently holds, so re-running this cell filters again.
df = remove_outliers(df, column_name="fare")

In [6]:
# Explicit guard against a missing target. Reassigning instead of using
# `inplace=True` avoids mutating a frame that was itself produced by filtering
# another frame (which can raise SettingWithCopyWarning) and keeps the cell
# safe to re-run. The outlier mask above already rejects NaN fares, so this is
# expected to remove nothing.
df = df.dropna(subset=["fare"])

In [7]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictors.
target_column = 'fare'
X = df.drop(columns=target_column)
y = df[target_column]

# 80% of the rows go to training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Завдання 2
Створіть Pipeline для обробки даних

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder


In [9]:
# Numeric predictors: missing values are replaced by the column median.
num_columns = X.select_dtypes(include="number").columns

median_imputer = SimpleImputer(strategy="median")
num_transformer = Pipeline([("imputer", median_imputer)])

In [10]:
# Categorical predictors: fill gaps with the most frequent value, then one-hot
# encode.
cat_columns = X.select_dtypes(include="object").columns
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # The default handle_unknown="error" makes predict() raise on a category
        # that was absent from the training rows. "ignore" encodes such a value
        # as all zeros, so the fitted (and later saved) pipeline stays usable on
        # new data. Encoding of categories seen during fit is unchanged.
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

# Завдання 3
Створіть остаточну модель та натренуйте її

In [11]:
# Send each column group through its own sub-pipeline and join the results.
column_routes = [
    ("num", num_transformer, num_columns),
    ("cat", cat_transformer, cat_columns),
]
preprocessor = ColumnTransformer(column_routes)

preprocessor


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn import tree

In [13]:
def create_model(depth, random_state=42):
  """Build an unfitted preprocessing + decision-tree regression pipeline.

  Args:
    depth: Maximum depth of the tree (passed as `max_depth`).
    random_state: Seed for the tree so repeated fits give the same model.
      The default matches the seed used by the depth sweep in Task 5.

  Returns:
    A `Pipeline` with the shared `preprocessor` as step "prep" and a
    `DecisionTreeRegressor` as step "model".

  The previous `max_features=10` was dropped. The preprocessor only emits the
  two numeric columns plus the one-hot columns of `embarked`, so a cap of 10
  features presumably never restricted the split search: the recorded metrics
  for this model equal the depth-4 column of the Task 5 sweep, which sets no
  cap. Depending on the scikit-learn release, an integer larger than the
  feature count may instead be rejected outright (TODO confirm against the
  installed version).
  """
  return Pipeline(
    [
        ("prep", preprocessor),
        ("model", tree.DecisionTreeRegressor(max_depth=depth, random_state=random_state))
    ]
  )

# Fit the depth-4 baseline (Pipeline.fit returns the pipeline itself) and keep
# its test-set predictions for the metrics cell below.
model = create_model(depth=4).fit(X_train, y_train)
y_pred = model.predict(X_test)

model

# Завдання 4
Виведіть основні метрики моделі

In [14]:
from sklearn import metrics

In [15]:
def get_metrics(y_true, y_pred, name="model"):
  """Summarise regression quality as a one-column DataFrame.

  Args:
    y_true: Observed target values.
    y_pred: Predicted target values, aligned with `y_true`.
    name: Label of the single column in the returned frame.

  Returns:
    A DataFrame indexed by "MAE", "RMSE" and "R2" (in that order) holding the
    scores rounded to two decimals.
  """
  mse = metrics.mean_squared_error(y_true, y_pred)
  scores = {
      "MAE": metrics.mean_absolute_error(y_true, y_pred),
      "RMSE": mse ** 0.5,  # root of the mean squared error
      "R2": metrics.r2_score(y_true, y_pred),
  }

  report = pd.DataFrame({name: list(scores.values())}, index=list(scores))
  return report.round(2)

# Train vs. test scores of the baseline model, side by side.
train_report = get_metrics(y_train, model.predict(X_train), name="train")
test_report = get_metrics(y_test, y_pred, name="test")

all_metrics = pd.concat([train_report, test_report], axis=1)

all_metrics

Unnamed: 0,train,test
MAE,6.27,8.57
RMSE,8.77,13.27
R2,0.53,0.3


# Завдання 5
Змініть параметри моделі та попробуйте покращити результат

Попробуйте:
* не видаляти викиди
* змінити обробку даних (imputer та кодування категоріальних даних)
* змінити параметри дерева (глибина, кількість точок у листках тощо)

In [18]:
from sklearn.tree import DecisionTreeRegressor

In [19]:
# Sweep the tree depth from 3 to 18 and record train/test metrics for each.
# The per-depth frames are collected in a list and concatenated once after the
# loop, instead of re-growing `all_metrics` with pd.concat on every iteration.
metric_frames = []

for depth in range(3, 19):
    model = Pipeline([
        ("prep", preprocessor),
        ("model", DecisionTreeRegressor(max_depth=depth, random_state=42))
    ])

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metric_frames.append(get_metrics(y_train, y_train_pred, name=f"train_depth_{depth}"))
    metric_frames.append(get_metrics(y_test, y_test_pred, name=f"test_depth_{depth}"))

# NOTE(review): after the loop `model` is the last pipeline fitted (depth 18).
# In the recorded output that is the setting with the lowest test R2 (0.13,
# versus 0.38 at depth 3), so any later cell using `model` gets the deepest
# tree, not the best-scoring one.
all_metrics = pd.concat(metric_frames, axis=1)

all_metrics.round(2)

Unnamed: 0,train_depth_3,test_depth_3,train_depth_4,test_depth_4,train_depth_5,test_depth_5,train_depth_6,test_depth_6,train_depth_7,test_depth_7,...,train_depth_14,test_depth_14,train_depth_15,test_depth_15,train_depth_16,test_depth_16,train_depth_17,test_depth_17,train_depth_18,test_depth_18
MAE,6.56,8.27,6.27,8.57,5.94,8.86,5.5,8.84,5.11,9.08,...,4.33,9.46,4.33,9.46,4.32,9.46,4.32,9.46,4.32,9.46
RMSE,9.23,12.53,8.77,13.27,8.46,13.93,8.11,13.78,7.77,14.19,...,7.15,14.84,7.15,14.84,7.15,14.84,7.15,14.84,7.15,14.84
R2,0.48,0.38,0.53,0.3,0.57,0.23,0.6,0.25,0.63,0.2,...,0.69,0.13,0.69,0.13,0.69,0.13,0.69,0.13,0.69,0.13


# Завдання 6
Збережіть модель

In [20]:
import joblib

# Persist the fitted pipeline and read it back to confirm the round trip.
# `model` is whatever the previous cell left bound, i.e. the last pipeline
# fitted in the depth sweep (max_depth=18).
MODEL_PATH = 'new_model.joblib'

joblib.dump(model, MODEL_PATH)
new_model = joblib.load(MODEL_PATH)

new_model