## Pipeline

Pipeline, veri on isleme ve modelleme sirasinda kodu duzenli tutmanin bir yoludur. (metin hocada)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
path = "/workspaces/AI01_H06/melb_data.csv"
df = pd.read_csv(path)

X = df.drop(['Price'], axis=1)  # axis=1 sutun anlamina gelir
y = df.Price

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)   # train_size default 0.7


In [None]:
# Veri on isleme yaparken target datasinda on isleme yapilmaz. ALTIN KURAL 
# Veri on isleme asamalari X_train ve X_test uzerinde yapilir

categorical_cols = [
    cname
    for cname in X_train.columns
    if X_train[cname].nunique() < 10 and X_train[cname].dtype == 'Object'   
]

numerical_cols = [
    cname
    for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

In [None]:
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

all_cols = categorical_cols+numerical_cols  # Butun sutunlari liste halinde tutmak icin

In [17]:
# On isleme asamalari (1. Adim)

from sklearn.compose import ColumnTransformer   # Kategorik ve numerik veriyi anyi anda isleme alabilmek icin bu columntransformer kullaniyoruz
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # pipeline eksik verilerle ugrasmak icin
from sklearn.preprocessing import OneHotEncoder  # cok populer, official doc. dan bak - Veri Biliminde kariyer yapacaklar bunu ezbere biliyor - youtube'da "anlasilir ekonomi" kanali, Python listesi 



In [19]:
numerical_transformers = SimpleImputer(strategy='constant')
categorical_transformers = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))   # one-hot encoding calisma mantigi, evernote, alpha numeric verileri sayisallastirir

])



In [20]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformers, numerical_cols),
        ('cat', categorical_transformers, categorical_cols)
    ]
)


In [21]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [22]:
# Tanimlama kismi
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])


In [23]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Egitim kismi

my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_test)
score = mean_absolute_error(y_test, preds)
print(f"Score: {score}")   # 169829 USD

Score: 169829.65314783645


In [None]:
# 190k'den 169k'ya dustu on isleme ile
# Hoca der ki, bu on isleme aslinda cok basit. Daha detayli bir onisleme ile daha iyi sonuclar alinabilir.