In [2]:
!pip install pandas numpy scikit-learn pytest



In [5]:
import pandas as pd
import numpy as np
import os
import pickle
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data: read the Titanic CSV hosted on GitHub into a DataFrame.
# pandas is already imported as `pd` in the import block above, so it is
# not imported a second time here.
url = "https://raw.githubusercontent.com/yuriemaeda/lecture-ai-engineering/master/day5/%E6%BC%94%E7%BF%923/data/Titanic.csv"
df = pd.read_csv(url)

# Preprocessing pipeline
def create_preprocessor():
    """Build the unfitted ColumnTransformer used to prepare the features.

    Numeric columns ("Age", "Pclass", "SibSp", "Parch", "Fare") are imputed
    with the median and then standardized. Categorical columns ("Sex",
    "Embarked") are imputed with the most frequent value and then one-hot
    encoded, with categories unseen during fitting ignored at transform time.

    Returns:
        A new, unfitted ``ColumnTransformer`` with a "num" and a "cat" branch.
    """
    num_cols = ["Age", "Pclass", "SibSp", "Parch", "Fare"]
    cat_cols = ["Sex", "Embarked"]

    # Numeric branch: fill gaps first so the scaler never sees missing values.
    num_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    # Categorical branch: fill gaps first, then expand into indicator columns.
    cat_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=num_steps), num_cols),
            ("cat", Pipeline(steps=cat_steps), cat_cols),
        ]
    )

# Training and evaluation
def _build_model():
    """Return an unfitted pipeline: a fresh preprocessor plus a random forest."""
    # A new preprocessor is created on every call so that separately fitted
    # pipelines never share (and silently refit) the same transformer object.
    return Pipeline([
        ("preprocessor", create_preprocessor()),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ])


def train_and_evaluate(df):
    """Train the model on ``df`` and print accuracy, latency and reproducibility.

    The data is split 80/20 with a fixed seed. A pipeline is fitted on the
    training part and scored on the held-out part. Prediction over the test
    set is then timed, and a second, independently built pipeline is fitted
    on the same training data to check that predictions are identical.

    Args:
        df: DataFrame containing a "Survived" target column together with the
            feature columns listed in ``create_preprocessor``.

    Returns:
        None. Results are reported through ``print``.
    """
    X = df.drop("Survived", axis=1)
    y = df["Survived"].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = _build_model()
    model.fit(X_train, y_train)

    # Accuracy on the held-out split.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    # Inference time. perf_counter() is a monotonic, high-resolution clock;
    # time.time() can be too coarse for a call that takes only milliseconds.
    start = time.perf_counter()
    model.predict(X_test)
    elapsed = time.perf_counter() - start
    print(f"Inference Time: {elapsed:.4f} seconds")

    # Reproducibility check: fit a second pipeline built entirely from fresh
    # objects, so this fit cannot touch the state of the first model.
    model2 = _build_model()
    model2.fit(X_train, y_train)
    y_pred2 = model2.predict(X_test)
    print("Reproducible:", np.array_equal(y_pred, y_pred2))

# Run: train and evaluate on the DataFrame loaded above.
train_and_evaluate(df)

Accuracy: 0.8101
Inference Time: 0.0156 seconds
Reproducible: True
