In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U scikit-learn

In [None]:
import unicodedata
from pathlib import Path
import optuna
import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from category_encoders import CountEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder


In [None]:
import sklearn
sklearn.__version__

In [None]:
INPUT_DIR = Path("/kaggle/input/signate-used-car-price/")

In [None]:

train_df = pd.read_csv(INPUT_DIR / "train.csv")
test_df = pd.read_csv(INPUT_DIR / "test.csv")

sub_df = pd.read_csv(INPUT_DIR / "submit_sample.csv", names=["id", "price"])

In [None]:
# train_df.query("price > 45000")

In [None]:
# train_filtered  = train_df[train_df['price'] <= 50000].reset_index(drop=True)


In [None]:
test_df.isnull().sum()
# train_df.head()

In [None]:
class PreProcessTransformer(TransformerMixin, BaseEstimator):
    """Row-wise cleaning of the raw used-car listing frame.

    ``fit`` records the mean ``price`` per ``year`` (for years above
    ``threshold_year``). ``transform`` imputes ``state`` from ``region``,
    normalises ``cylinders`` / ``size`` / ``manufacturer``, repairs implausible
    ``year`` and ``odometer`` values and adds ``odometer_per_year``.

    ``transform`` works on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    threshold_year : int, default=2000
        Only rows with ``year`` strictly greater than this value contribute to
        the per-year mean price computed in ``fit``.
    """

    def __init__(self, threshold_year=2000):
        self.threshold_year = threshold_year

    def fit(self, X, y=None):
        """Store the mean price per year; ``X`` must contain ``year`` and ``price``."""
        recent = X[X["year"] > self.threshold_year]
        # Kept for the (currently disabled) yearly-average-price feature.
        self.avg_price_per_year = recent.groupby("year")["price"].mean().to_dict()
        return self

    @staticmethod
    def _fill_state_from_region(df):
        """Fill missing ``state`` values of ``df`` (in place) from ``region``.

        A region is used for imputation when all of its rows with a known state
        agree on a single state. Regions that stay unresolved fall back to a
        small hand-written table; anything else remains missing.
        """
        # Manual fallback for regions that cannot be resolved from the data.
        lookup = {
            'northwest KS': 'ks',
            'southern WV': 'wv',
            'ashtabula': 'oh',
        }

        known = df.loc[df["state"].notna(), ["region", "state"]]
        if not known.empty:
            states_per_region = known.groupby("region")["state"].unique()
            # Data-derived mappings take precedence over the manual fallback.
            lookup.update(
                {
                    region: states[0]
                    for region, states in states_per_region.items()
                    if len(states) == 1
                }
            )

        missing = df["state"].isna()
        if missing.any():
            # .to_numpy() makes the assignment positional within the masked rows.
            df.loc[missing, "state"] = df.loc[missing, "region"].map(lookup).to_numpy()
        return df

    def transform(self, X):
        """Return a cleaned copy of ``X`` using the rules described on the class."""
        X = X.copy()  # never modify the caller's frame

        X = self._fill_state_from_region(X)

        # Yearly average price feature (disabled):
        # X['yearly_avg_price'] = X['year'].apply(lambda x: self.avg_price_per_year.get(x, np.nan) if x > self.threshold_year else np.nan)

        # Keep only the first run of digits found in the cylinders text.
        X["cylinders"] = (
            X["cylinders"].astype(str).str.extract(r"(\d+)", expand=False).astype("float32")
        )

        # Unify dash variants in size; astype(str) also turns missing values into "nan".
        X["size"] = X["size"].str.replace("ー", "-").astype(str)
        X["size"] = X["size"].str.replace("−", "-").astype(str)

        # Normalise manufacturer spelling (full-width forms, case); non-strings pass through.
        X["manufacturer"] = X["manufacturer"].apply(
            lambda x: unicodedata.normalize("NFKC", x).lower() if isinstance(x, str) else x
        )
        # Look-alike (non-Latin) characters that survive NFKC are mapped explicitly.
        manufacturer_mapping = {
        "toyotа": "toyota",
        "subαru": "subaru",
        "niѕsan": "nissan",
        "nisѕan": "nissan",
        "sαturn": "saturn",
        "lexuѕ": "lexus",
        "vоlkswagen": "volkswagen",
        "аcura": "acura",
        "ᴄhrysler": "chrysler",
        "land rover": "rover"  # tentatively merge "land rover" into "rover"
        }
        X["manufacturer"] = X["manufacturer"].replace(manufacturer_mapping)

        # Years of 2500 or later are treated as typos and shifted back by 1000.
        bad_year = X["year"] >= 2500
        X.loc[bad_year, "year"] = X.loc[bad_year, "year"] - 1000

        # Negative mileage is clamped to zero.
        X.loc[X["odometer"] < 0, "odometer"] = 0

        # Disabled experiments for odometer > 400000:
        #   X.loc[mask, "odometer"] = X.loc[mask, "odometer"] // 10
        #   X = X.query("odometer <= 400000")

        # Mileage per year of age, relative to 2023.
        # NOTE(review): rows with year == 2023 divide by zero here.
        X["odometer_per_year"] = X["odometer"] / (2023 - X["year"])

        return X


class RankTransformer(TransformerMixin, BaseEstimator):
    """Replace the ``key`` column by the dense rank of ``value`` within each ``key`` group."""

    def __init__(self, key: str, value: str):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        dense_rank = X.groupby(self.key)[self.value].rank(method="dense")
        ranked = X.copy()
        ranked[self.key] = dense_rank
        out_cols = self.get_feature_names_out()
        return ranked[out_cols]

    def get_feature_names_out(self, input_features=None):
        # The only output column is the (overwritten) key column.
        return [self.key]
    
    
class OriginalTransformer(TransformerMixin, BaseEstimator):
    """Pass features through: numeric columns as float32, categorical columns as ``category``.

    Parameters
    ----------
    numeric_cols : list of str
        Columns cast to ``float32``.
    categorical_cols : list of str
        Columns cast to pandas ``category`` dtype.
    """

    def __init__(self, numeric_cols, categorical_cols):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the configured columns with the casts applied."""
        X_new = X.copy()

        # Numeric features
        X_new[self.numeric_cols] = X_new[self.numeric_cols].astype("float32")

        # Categorical features
        X_new[self.categorical_cols] = X_new[self.categorical_cols].astype("category")

        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self, input_features=None):
        """Output column names: numeric columns followed by categorical columns.

        ``input_features`` is accepted (and ignored) so the signature matches the
        other transformers in this notebook and the scikit-learn convention.
        """
        return self.numeric_cols + self.categorical_cols
    
    
    
# class CountTransformer(TransformerMixin, BaseEstimator):
#     """CountEncoder"""

#     def __init__(self):
#         pass

#     def fit(self, X, y=None):
#         self.ce = CountEncoder(cols=X.columns.tolist(), handle_unknown=0)
#         self.ce.fit(X)
#         return self

#     def transform(self, X):
#         return self.ce.transform(X)

#     def get_feature_names_out(self, input_features=None):
#         return input_features
class CountTransformer(TransformerMixin, BaseEstimator):
    """Count-encode the configured categorical columns with ``CountEncoder``."""

    def __init__(self, categorical_cols):
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        # Fit the encoder on the configured columns only.
        encoder = CountEncoder(cols=self.categorical_cols, handle_unknown=0)
        encoder.fit(X[self.categorical_cols])
        self.ce = encoder
        return self

    def transform(self, X):
        # Only the configured columns are passed to (and returned by) the encoder.
        selected = X[self.categorical_cols]
        return self.ce.transform(selected)

    def get_feature_names_out(self, input_features=None):
        return self.categorical_cols

    
    
    
class AggTransformer(TransformerMixin, BaseEstimator):
    """Group-wise aggregate features.

    ``fit`` aggregates ``numeric_cols`` per ``key`` value; ``transform`` looks up
    those aggregates for every row.

    Parameters
    ----------
    key : str
        Column to group by.
    numeric_cols : list of str
        Columns to aggregate.
    agg_func
        Aggregation spec passed to ``GroupBy.agg``. It must yield
        (column, function) column pairs, e.g. a collection of function names.
    """

    def __init__(self, key, numeric_cols, agg_func: dict):
        self.key = key
        self.numeric_cols = numeric_cols
        self.agg_func = agg_func

    def fit(self, X, y=None):
        """Compute the per-key aggregate table ``agg_df`` with flat ``<col>_<func>`` names."""
        X = X.copy()
        X[self.key] = X[self.key].astype("category")
        self.agg_df = X.groupby(self.key)[self.numeric_cols].agg(self.agg_func)
        self.agg_df.columns = [f"{col}_{func}" for col, func in self.agg_df.columns.tolist()]

        return self

    def transform(self, X):
        """Return the aggregate columns for each row of ``X``, keeping ``X``'s index."""
        X_new = pd.merge(X, self.agg_df, on=self.key, how="left")
        # merge() returns a fresh RangeIndex. The left join keeps row order and
        # count (agg_df has one row per key), so the original index is restored
        # to keep the output aligned with the other ColumnTransformer branches.
        X_new.index = X.index
        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self, input_features=None):
        return self.agg_df.columns.tolist()

In [None]:
# Feature configuration and preprocessing pipeline.
# numeric_cols = ["year", "odometer", "odometer_per_year"]
numeric_cols = ["year", "odometer"]
categorical_cols = [
#      'region',
    "cylinders",
    "manufacturer",
    "condition",
    "fuel",
    "title_status",
    "transmission",
    "drive",
    "size",
    "type",
    "paint_color",
    "state",
]

# Three feature families are concatenated (output columns are prefixed "<name>__"):
#   ori     - raw numeric / categorical columns
#   tgt     - target-encoded categoricals (includes 'region', which is not in "ori")
#   agg_<k> - median odometer per value of key column <k>
ct = ColumnTransformer(
    transformers=[
        (
            "ori",
            OriginalTransformer(numeric_cols, categorical_cols),
            categorical_cols + numeric_cols,
        ),
#         (
#             "cnt",
#             CountTransformer(categorical_cols),  # added: count encoding
#             categorical_cols  # applied to the categorical columns
#         ),
        (
            "tgt",
            TargetEncoder(target_type="continuous", random_state=88),
            [
                'region',
                "cylinders",
                "manufacturer",
                "condition",
                "fuel",
                "title_status",
                "transmission",
                "drive",
                "size",
                "type",
                "paint_color",
                "state",
            ],
        ),
        *[
            (f"agg_{key}", AggTransformer(key, ["odometer"], {"median"}), [key] + ["odometer"])
            for key in [
#                 "cylinders",
                "manufacturer",
                "condition",
                "paint_color",
#                 "fuel",
#                 "title_status",
#                 "transmission",
#                 "drive",
#                 "size",
#                 "type",
#                 "state",
#                 "year"
            ]
        ],
        
    ],
    verbose=True,
)
# Return DataFrames (not arrays) so downstream code can address columns by name.
ct.set_output(transform="pandas")
pipe = Pipeline(
    steps=[
        ("preprocess", PreProcessTransformer(threshold_year=2000)),
        ("ct", ct),
    ]
)
# train_df_filtered = train_df[train_df['price'] <= 50000].reset_index(drop=True)
# train_feat_df_filtered = pipe.fit_transform(train_df_filtered, train_df_filtered["price"])

# Fit the pipeline and build the training features in ONE fit_transform call.
# scikit-learn's TargetEncoder applies its internal cross-fitting only in
# fit_transform. With fit(...) followed by transform(...) on the same rows, each
# training row is encoded with statistics that include its own price, which leaks
# the target into the features and makes the CV scores optimistic.
train_feat_df = pipe.fit_transform(train_df, train_df["price"])

# Test rows are encoded with the statistics learned from the full training set.
test_feat_df = pipe.transform(test_df)

In [None]:
test_feat_df.info()

In [None]:
train_feat_df.columns

In [None]:
# sns.histplot(data = train_feat_df["ori__odometer"])
# train_feat_df.isnull().sum()
# train_feat_df.info()
# train_feat_df_filtered.info()
train_feat_df

In [None]:
# train_feat_df.query("ori__odometer > 40000 ")
test_df

In [None]:
# test_feat_df["ori__manufacturer"].unique().value_counts()

In [None]:
train_feat_df["ori__title_status"].value_counts()

In [None]:
train_feat_df.info()

In [None]:
import pandas_profiling
train_feat_df.profile_report()

**考察**

odometer　負の値を0に、極端にデカい値は放置

regionからstateの決定は意味ありそう

ori__fuel has 1239 (4.5%) missing values	Missing

ori__title_status has 456 (1.7%) missing values	Missing

ori__type has 456 (1.7%) missing values	Missing

ori__year is highly skewed (γ1 = 20.13225365)

count encodingの検討、ダメだった

optunaでcatと統合、ダメだった

交差検証モデル予測の平均の方が精度が高い

最良モデルを再現して年平均価格の評価をすべき、1980年以前ではなく2000年以前を欠損に変更

対数変換、地理的ラベルの検討

In [None]:
import pandas_profiling
test_feat_df.profile_report()

In [None]:
def get_cv(df, n_fold=5):
    """Return a list of (train_idx, valid_idx) pairs from a shuffled, seeded K-fold split of ``df``."""
    splitter = KFold(n_splits=n_fold, shuffle=True, random_state=71)
    folds = [(train_idx, valid_idx) for train_idx, valid_idx in splitter.split(df)]
    return folds

# ref: https://www.guruguru.science/competitions/16/discussions/185c7dc6-5e3a-49c6-9c30-41bf007cc694/
def fit_lgbm(X, y, cv, categorical_cols: list = None, params: dict = None, verbose: int = 50):
    """Train one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally by the fold indices).
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    cv : iterable of (train_idx, valid_idx)
        Positional index arrays for each fold.
    categorical_cols : list, optional
        Forwarded to ``LGBMRegressor.fit(categorical_feature=...)``.
    params : dict, optional
        Keyword arguments for ``lgb.LGBMRegressor``; empty when omitted.
    verbose : int or bool, default=50
        Early-stopping messages are logged only for a positive / ``True`` value.
        ``0``, ``False``, ``None`` and negative values (e.g. ``-1``) silence them.

    Returns
    -------
    (score, oof_pred, models)
        Overall out-of-fold MAPE, the float32 out-of-fold prediction array and
        the list of fitted models (one per fold).
    """
    # Fall back to an empty parameter dict when none is given.
    if params is None:
        params = {}

    # lgb.early_stopping expects a bool; a raw -1 would be truthy and enable logging.
    log_early_stopping = bool(verbose) and verbose > 0

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records,), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_train,
            y_train,
            categorical_feature=categorical_cols,
            eval_set=[(x_valid, y_valid)],
            callbacks=[lgb.early_stopping(100, verbose=log_early_stopping)],
        )
        pred_i = model.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(model)
        score = mean_absolute_percentage_error(y_valid, pred_i)
        print(f" - fold{i + 1} - {score:.4f}")

    score = mean_absolute_percentage_error(y, oof_pred)

    print("=" * 50)
    print(f"FINISHI: Whole Score: {score:.4f}")
    return score, oof_pred, models

def convert_categorical_features(df, categorical_cols):
    """Return a copy of ``df`` with ``categorical_cols`` cast to ``str``.

    The input frame is never modified, so a frame whose columns are pandas
    ``category`` dtype keeps that dtype for other consumers.

    Parameters
    ----------
    df : pandas.DataFrame
    categorical_cols : list of str or None
        Columns to convert; ``None`` or an empty list converts nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame; only the listed columns differ from ``df``.
    """
    if not categorical_cols:
        return df.copy()
    return df.astype({col: str for col in categorical_cols})

def fit_cat(X, y, cv, categorical_cols: list = None, params: dict = None, verbose = False):
    """Train one CatBoost regressor per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``categorical_cols`` are converted to strings first via
        ``convert_categorical_features``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    cv : iterable of (train_idx, valid_idx)
        Positional index arrays for each fold.
    categorical_cols : list, optional
        Column names passed to CatBoost as ``cat_features``.
    params : dict, optional
        Keyword arguments for ``cb.CatBoostRegressor``; empty when omitted.
    verbose : bool, default=False
        Accepted for signature compatibility; not forwarded to CatBoost
        (control logging through ``params`` instead).

    Returns
    -------
    (score, oof_pred, models)
        Overall out-of-fold MAPE, the float32 out-of-fold prediction array and
        the list of fitted models (one per fold).
    """
    if params is None:
        params = {}
    X = convert_categorical_features(X, categorical_cols)

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records,), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        # The frames are handed to fit() directly, so no separate Pool objects are built.
        model = cb.CatBoostRegressor(**params)
        model.fit(
            x_train,
            y_train,
            cat_features=categorical_cols,
            eval_set=[(x_valid, y_valid)],
            use_best_model=True,
            early_stopping_rounds=100,
        )
        pred_i = model.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(model)
        score = mean_absolute_percentage_error(y_valid, pred_i)
        print(f" - fold{i + 1} - {score:.4f}")

    score = mean_absolute_percentage_error(y, oof_pred)

    print("=" * 50)
    print(f"FINISHI: Whole Score: {score:.4f}")
    return score, oof_pred, models

# Hyper-parameter tuning with Optuna (CatBoost)
def tuning_cat(X, y, cv, categorical_cols):
    """Search CatBoost ``depth`` and ``l2_leaf_reg`` with Optuna (10 trials).

    Each trial is scored by the out-of-fold MAPE returned by ``fit_cat``; the
    best trial's parameter dict is returned.
    """

    def objective(trial):
        # Only depth and L2 regularisation are searched; the rest is fixed.
        depth = trial.suggest_int('depth', 4, 10)
        l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)

        params = {
            'iterations': 10000,
            'learning_rate': 0.02,
            'depth': depth,
            'l2_leaf_reg': l2_leaf_reg,
            'loss_function': 'MAPE',
            'eval_metric': 'MAPE',
            'verbose': False,
        }

        cv_result = fit_cat(
            X, y, cv=cv, categorical_cols=categorical_cols, params=params, verbose=False
        )
        return cv_result[0]

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=10)
    best = study.best_trial
    return best.params


# Hyper-parameter tuning with Optuna (LightGBM)
def tuning_lgbm(train_feat_df, train_df, cv):
    """Search LightGBM tree-shape and sampling parameters with Optuna (100 trials).

    Parameters
    ----------
    train_feat_df : pandas.DataFrame
        Training features.
    train_df : pandas.DataFrame
        Frame providing the ``price`` target column.
    cv : iterable of (train_idx, valid_idx)
        Folds forwarded to ``fit_lgbm``.

    Returns
    -------
    dict
        Parameters of the best trial (lowest out-of-fold MAPE).
    """

    def objective(trial):
        max_depth = trial.suggest_int("max_depth", 4, 10)
        # num_leaves is capped by what a tree of max_depth can hold.
        num_leaves = trial.suggest_int("num_leaves", 2, 2**max_depth)
        # suggest_float replaces the deprecated suggest_uniform (same range).
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)

        params = {
            "objective": "mape",
            "n_estimators": 10000,
            "learning_rate": 0.05,
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "colsample_bytree": colsample_bytree,
            "subsample": subsample,
            "metric": "mape",
            "importance_type": "gain",
            "random_state": 88,
        }

        score, _, _ = fit_lgbm(
            train_feat_df, train_df["price"], cv=cv, categorical_cols=[], params=params, verbose=-1
        )
        return score

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)
    print("Number of finished trials:", len(study.trials))
    print("Best trial:", study.best_trial.params)
    return study.best_trial.params

In [None]:
def visualize_importance(models, feat_train_df):
    """Plot the per-fold feature importances of ``models`` as a boxen plot.

    Parameters
    ----------
    models : list
        Fitted models exposing ``feature_importances_``.
    feat_train_df : pandas.DataFrame
        Frame whose columns name the features, in the order the models were
        trained on.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes. At most the 50 features with the largest
        summed importance are shown.
    """
    # Build one frame per fold and concatenate once (no repeated concat in a loop).
    fold_frames = [
        pd.DataFrame(
            {
                "feature_importance": model.feature_importances_,
                "column": feat_train_df.columns,
                "fold": fold,
            }
        )
        for fold, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

    # Order features by total importance across folds.
    order = (
        feature_importance_df.groupby("column")["feature_importance"]
        .sum()
        .sort_values(ascending=False)
        .index[:50]
    )

    fig, ax = plt.subplots(figsize=(12, max(6, len(order) * 0.25)))
    sns.boxenplot(
        data=feature_importance_df,
        x="feature_importance",
        y="column",
        order=order,
        ax=ax,
        palette="viridis",
        orient="h",
    )
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax


def visualize_oof_gt(oof, gt):
    """Scatter out-of-fold predictions against ground truth with a y = x reference line.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes, like ``visualize_importance``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(oof, gt, alpha=0.5, label="samples")
    gt_max = gt.max()
    # Perfect predictions would lie on this diagonal.
    ax.plot([0, gt_max], [0, gt_max], color="red", alpha=0.5, linestyle="--", label="y = x")
    ax.set_xlabel("Out Of Fold")
    ax.set_ylabel("Ground Truth")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    return fig, ax


def visualize_oof_pred(oof, pred):
    """Overlay density histograms of test predictions and out-of-fold predictions.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes, like ``visualize_importance``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    bins = 100
    ax.hist(pred, bins=bins, density=True, alpha=0.5, label="Test")
    ax.hist(oof, bins=bins, density=True, alpha=0.5, label="OutOfFold")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    return fig, ax




In [None]:
train_feat_df.info()

In [None]:
n_fold = 5
lgbm_params = {
    "objective": "mape",
    "metrics": "mape",
    "n_estimators": 10000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 32,
    "colsample_bytree": 0.446,
    "subsample": 0.339,
    "importance_type": "gain",
    "random_state": 88,
}

feat_cat_cols = train_feat_df.select_dtypes(include="category").columns.tolist()

# Use the n_fold setting defined above instead of repeating the literal.
cv = get_cv(train_feat_df, n_fold=n_fold)

# training
score_lgbm, oof_lgbm, models_lgbm = fit_lgbm(
    train_feat_df,
    train_df["price"],
    categorical_cols=[],
    params=lgbm_params,
    cv=cv,
    verbose=False,
)

# inference: average the predictions of the fold models
pred_lgbm = np.array([model.predict(test_feat_df) for model in models_lgbm])
pred_lgbm = np.mean(pred_lgbm, axis=0)

visualize_importance(models_lgbm, train_feat_df)
visualize_oof_gt(oof_lgbm, train_df['price'])
visualize_oof_pred(oof_lgbm, pred_lgbm)


In [None]:
import pandas_profiling
train_feat_df.profile_report()

In [None]:
train_feat_df.profile_report()

In [None]:
# # submission
sub_df["price"] = pred_lgbm
sub_df.to_csv("submission28.csv", index=False, header=False)

In [None]:
visualize_importance(models_lgbm, train_feat_df)
visualize_oof_gt(oof_lgbm, train_df['price'])
visualize_oof_pred(oof_lgbm, pred_lgbm)

In [None]:
# def train_and_validate_with_filtered_data(X_filtered, y_filtered, X_full, y_full, cv, params=None, verbose=False):
#     models = []
#     oof_pred = np.zeros(len(X_full), dtype=np.float32)
    
#     for i, (idx_train, idx_valid) in enumerate(cv):
#         # 除去データで学習
#         x_train, y_train = X_filtered, y_filtered
#         x_valid, y_valid = X_full.iloc[idx_valid], y_full.iloc[idx_valid]

#         model = lgb.LGBMRegressor(**params)
#         model.fit(
#             x_train,
#             y_train,
#             eval_set=[(x_valid, y_valid)],
#             early_stopping_rounds=100,
#             verbose=verbose,
#         )
        
#         pred_i = model.predict(x_valid)
#         oof_pred[idx_valid] = pred_i
#         models.append(model)
#         score = mean_absolute_percentage_error(y_valid, pred_i)
#         print(f" - fold{i + 1} - {score:.4f}")

#     score = mean_absolute_percentage_error(y_full, oof_pred)

#     print("=" * 50)
#     print(f"FINISHI: Whole Score: {score:.4f}")
#     return score, oof_pred, models

# # # 除去データ
# # train_df_filtered = train_df[train_df['price'] <= 50000]
# # train_feat_df_filtered = pipe.transform(train_df_filtered)

# # # 除去していないデータの前処理
# # train_feat_df = pipe.transform(train_df)

# # 固定パラメータ
# params = {
#     "objective": "mape",
#     "n_estimators": 1000,
#     "learning_rate": 0.05,
#     "max_depth": 6,
#     "num_leaves": 31,
#     "random_state": 88,
# }
# lgbm_params = {
#     "objective": "mape",
#     "metrics": "mape",
#     "n_estimators": 10000,
#     "learning_rate": 0.01,
#     "max_depth": 5,
#     "num_leaves": 32,
#     "colsample_bytree": 0.446,
#     "subsample": 0.339,
#     "importance_type": "gain",
#     "random_state": 88,
# }
# # 交差検証
# cv = get_cv(train_df)
# score_lgbm_filter, oof_lgbm_filter, models_lgbm_filter = train_and_validate_with_filtered_data(train_feat_df_filtered, train_df_filtered["price"], train_feat_df, train_df["price"], cv, params=lgbm_params)

# # inference
# pred_lgbm_filter = np.array([model.predict(test_feat_df) for model in models_lgbm_filter])
# pred_lgbm_filter = np.mean(pred_lgbm_filter, axis=0)

In [None]:
# The cell that trains the "filtered" models is commented out, so these names
# only exist if that experiment was re-enabled and run in this session.
if "models_lgbm_filter" in globals():
    visualize_importance(models_lgbm_filter, train_feat_df)
    visualize_oof_gt(oof_lgbm_filter, train_df['price'])
    visualize_oof_pred(oof_lgbm_filter, pred_lgbm_filter)
else:
    print("Skipped: filtered-data models are not defined (their training cell is commented out).")

In [None]:
# submission (filtered-data experiment)
# pred_lgbm_filter is produced by a cell that is currently commented out; write
# the file only when that experiment has actually been run.
if "pred_lgbm_filter" in globals():
    sub_df["price"] = pred_lgbm_filter
    sub_df.to_csv("submission17.csv", index=False, header=False)
else:
    print("Skipped: pred_lgbm_filter is not defined (its training cell is commented out).")

In [None]:
def analyze_outliers(n_points=20):
    """Plot and return the training rows with the largest out-of-fold error.

    Reads the notebook globals ``oof_lgbm`` (out-of-fold predictions) and
    ``train_df``. ``percentage_error`` is each row's absolute percentage error
    divided by the number of rows, i.e. its contribution to the overall MAPE in
    percent.

    Parameters
    ----------
    n_points : int, default=20
        Number of worst rows to plot and return.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected ``train_df`` rows (largest error first) with an
        added ``prediction`` column.
    """
    # Positional frame (default RangeIndex), so row labels equal positions in train_df.
    error_df = pd.DataFrame(
        {
            'prediction': np.asarray(oof_lgbm),
            'true_price': train_df['price'].to_numpy(),
        }
    )
    # Divide by the actual number of rows instead of a hard-coded count.
    error_df['percentage_error'] = (
        abs(error_df['true_price'] - error_df['prediction']) / error_df['true_price'] * 100 / len(error_df)
    )

    # Rows with the largest error
    large_error_df = error_df.nlargest(n_points, 'percentage_error')

    # Scale the figure width with the number of points
    width = max(15, n_points * 0.05)

    # Visualise the rows with the largest error
    plt.figure(figsize=(width, 10))
    sns.barplot(x=large_error_df.index, y=large_error_df['percentage_error'], order=large_error_df.index)
    plt.xlabel('Index')
    plt.ylabel('Percentage Error')
    plt.title(f'Top {n_points} Data Points with Largest Prediction Error (MAPE)')
    plt.show()

    # Return the matching train_df rows plus their prediction. assign() builds a
    # new frame, so nothing is written into a slice of train_df.
    top_n_positions = large_error_df.index.to_numpy()
    top_n_train_df = train_df.iloc[top_n_positions].assign(
        prediction=large_error_df['prediction'].to_numpy()
    )

    return top_n_train_df

top_n_train_df_with_predictions = analyze_outliers(n_points=2000)


In [None]:
top_n_train_df_with_predictions.info()

In [None]:
# Categorical columns to inspect
category_columns_to_analyze = [
    'manufacturer',
    'condition',
    'paint_color',
    'state',
    'size',
    'type',
    'transmission',
]

# Subset of the high-error rows restricted to those columns
analysis_df = top_n_train_df_with_predictions[category_columns_to_analyze]

# For every column: left panel = all training rows, right panel = high-error rows
for col in category_columns_to_analyze:
    fig, (ax_all, ax_worst) = plt.subplots(1, 2, figsize=(30, 5))

    panels = [
        (ax_all, train_df, f'Distribution of {col} in Original Data'),
        (
            ax_worst,
            top_n_train_df_with_predictions,
            f'Distribution of {col} in Top {len(top_n_train_df_with_predictions)} Data Points with Largest Prediction Error',
        ),
    ]
    for panel_ax, frame, panel_title in panels:
        # Bars are ordered by descending frequency within each panel.
        sns.countplot(data=frame, x=col, order=frame[col].value_counts().index, ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.tick_params(axis='x', rotation=45)

    plt.show()



In [None]:
top_n_train_df_with_predictions.query("price / prediction > 1")
# top_n_train_df_with_predictions.query("price >  10000")

In [None]:
negative_difference_df = top_n_train_df_with_predictions[top_n_train_df_with_predictions['prediction'] - top_n_train_df_with_predictions['price'] > 0]
negative_difference_df

In [None]:
def analyze_smallest_outliers(n_points=20):
    """Plot and return the training rows with the smallest out-of-fold error.

    Reads the notebook globals ``oof_lgbm`` (out-of-fold predictions) and
    ``train_df``. ``percentage_error`` is each row's absolute percentage error
    divided by the number of rows, i.e. its contribution to the overall MAPE in
    percent.

    Parameters
    ----------
    n_points : int, default=20
        Number of best-predicted rows to plot and return.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected ``train_df`` rows (smallest error first) with an
        added ``prediction`` column.
    """
    # Uses oof_lgbm: `oof_pred` exists only as a local inside the fit_* helpers.
    # Positional frame (default RangeIndex), so row labels equal positions in train_df.
    error_df = pd.DataFrame(
        {
            'prediction': np.asarray(oof_lgbm),
            'true_price': train_df['price'].to_numpy(),
        }
    )
    # Divide by the actual number of rows instead of a hard-coded count.
    error_df['percentage_error'] = (
        abs(error_df['true_price'] - error_df['prediction']) / error_df['true_price'] * 100 / len(error_df)
    )

    # Rows with the smallest error
    small_error_df = error_df.nsmallest(n_points, 'percentage_error')

    # Scale the figure width with the number of points
    width = max(15, n_points * 0.05)

    # Visualise the rows with the smallest error
    plt.figure(figsize=(width, 5))
    sns.barplot(x=small_error_df.index, y=small_error_df['percentage_error'], order=small_error_df.index)
    plt.xlabel('Index')
    plt.ylabel('Percentage Error')
    plt.title(f'Top {n_points} Data Points with Smallest Prediction Error (MAPE)')
    plt.show()

    # Return the matching train_df rows plus their prediction. assign() builds a
    # new frame, so nothing is written into a slice of train_df.
    top_n_positions = small_error_df.index.to_numpy()
    top_n_train_df = train_df.iloc[top_n_positions].assign(
        prediction=small_error_df['prediction'].to_numpy()
    )

    return top_n_train_df

top_n_train_df_with_small_predictions = analyze_smallest_outliers(n_points=2000)


In [None]:
top_n_train_df_with_small_predictions.query("price < 10000")

In [None]:
negative_difference_df_small = top_n_train_df_with_small_predictions[top_n_train_df_with_small_predictions['prediction'] - top_n_train_df_with_small_predictions['price'] < 0]
negative_difference_df_small

In [None]:
import pandas_profiling
top_n_train_df_with_predictions.profile_report()

In [None]:

top_n_train_df_with_small_predictions.profile_report()

In [None]:
train_df.query("price < 3000")

In [None]:
def analyze_price_mape_relationship(bins=10):
    """Bar-plot the mean absolute percentage error per price quantile bin.

    Reads the notebook globals ``oof_lgbm`` and ``train_df``; ``bins`` is the
    number of equal-frequency price bins.
    """
    # Per-row absolute percentage error of the out-of-fold predictions
    error_df = pd.DataFrame({'prediction': oof_lgbm})
    error_df['true_price'] = train_df.copy()['price']
    abs_diff = (error_df['true_price'] - error_df['prediction']).abs()
    error_df['percentage_error'] = abs_diff / error_df['true_price'] * 100

    # Equal-frequency price bins, then mean / std of the error inside each bin
    error_df['price_bin'] = pd.qcut(error_df['true_price'], q=bins)
    per_bin = error_df.groupby('price_bin')['percentage_error']
    price_error_grouped = per_bin.agg(['mean', 'std']).reset_index()

    # Draw the chart
    plt.figure(figsize=(15, 5))
    sns.barplot(x='price_bin', y='mean', data=price_error_grouped, yerr=price_error_grouped['std'])
    plt.title('Relationship between Price Band and Prediction Error (MAPE)')
    plt.xlabel('Price Bin')
    plt.ylabel('Average MAPE')
    plt.xticks(rotation=45)
    plt.show()

analyze_price_mape_relationship(bins=20)


In [None]:
def analyze_price_mape_relationship_boxplot(bins=10):
    """Box-plot the absolute percentage error per price quantile bin.

    Reads the notebook globals ``oof_lgbm`` and ``train_df``; ``bins`` is the
    number of equal-frequency price bins.
    """
    # Per-row absolute percentage error of the out-of-fold predictions
    error_df = pd.DataFrame({'prediction': oof_lgbm})
    error_df['true_price'] = train_df.copy()['price']
    abs_diff = (error_df['true_price'] - error_df['prediction']).abs()
    error_df['percentage_error'] = abs_diff / error_df['true_price'] * 100

    # Equal-frequency price bins
    error_df['price_bin'] = pd.qcut(error_df['true_price'], q=bins)

    # Draw the box plot
    plt.figure(figsize=(15, 5))
    sns.boxplot(x='price_bin', y='percentage_error', data=error_df)
    plt.title('Relationship between Price Band and Prediction Error (MAPE)')
    plt.xlabel('Price Bin')
    plt.ylabel('MAPE')
    plt.xticks(rotation=45)
    plt.show()

analyze_price_mape_relationship_boxplot(bins=20)


In [None]:
# train_feat_df.info()
def plot_boxplot_for_categories(df, category_columns, threshold=20):
    """Draw, for each column, a box plot of the per-row percentage error by category.

    Reads the notebook globals ``oof_lgbm`` and ``train_df`` to compute the error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns named in ``category_columns``; each column is
        copied into the error frame by index alignment.
    category_columns : list of str
        Columns to plot, one figure each. ``'ori__year'`` is special-cased: its
        categories are ordered by numeric value instead of by frequency.
    threshold : int, default=20
        Columns with more distinct values than this are drawn horizontally on a
        taller figure.
    """
    # Per-row absolute percentage error of the out-of-fold predictions
    error_df = pd.DataFrame()
    error_df['prediction'] = oof_lgbm
    error_df['true_price'] = train_df['price']
    error_df['percentage_error'] = abs(error_df['true_price'] - error_df['prediction']) / error_df['true_price'] * 100

    # Attach the requested categorical columns
    for col in category_columns:
        error_df[col] = df[col]

    # One box plot per categorical column
    for col in category_columns:
        # Number of rows per category
        category_counts = error_df[col].value_counts()

        # Label = "<category> (n=<count>)"
        error_df[col + '_label'] = error_df[col].apply(lambda x: str(x) + f' (n={category_counts[x]})')

        if col != 'ori__year':
            # Order the categories by descending row count
            category_order = category_counts.index.tolist()
            error_df[col + '_label'] = pd.Categorical(error_df[col + '_label'], categories=[str(cat) + f' (n={category_counts[cat]})' for cat in category_order], ordered=True)
            
        # For the year column, order the labels by their numeric value instead
        if col == 'ori__year':
            # Unique labels sorted by the number before the first space
            unique_labels = sorted(set(error_df[col + '_label']), key=lambda x: int(float(x.split(' ')[0])))
            
            # Use that sorted list as the ordered categories
            error_df[col + '_label'] = pd.Categorical(
                error_df[col + '_label'],
                categories=unique_labels,
                ordered=True
            )
        if len(category_counts) > threshold:
            # Many categories: horizontal layout on a tall figure
            plt.figure(figsize=(10, 15))
            sns.boxplot(x='percentage_error', y=col + '_label', data=error_df, orient='h')
            # Overlay the per-category point estimate in red
            sns.pointplot(x='percentage_error', y=col + '_label', data=error_df, color='red', markers='o', linestyles='--', join=False, orient='h') # orient='h' draws horizontally
            plt.ylabel(col)
            plt.xlabel('MAPE')
            plt.xlim(0, 80)
        else:
            plt.figure(figsize=(15, 5))
            sns.boxplot(x=col + '_label', y='percentage_error', data=error_df)
            # Overlay the per-category point estimate in red
            sns.pointplot(x=col + '_label', y='percentage_error', data=error_df, color='red', markers='o', linestyles='--', join=False)
            plt.xlabel(col)
            plt.ylabel('MAPE')
            plt.ylim(0, 80)
        
        plt.title(f'Relationship between {col} and Prediction Error (MAPE)')
        plt.xticks(rotation=45 if len(category_counts) <= threshold else 0)
        plt.show()
# Split 'ori__odometer' into `bins` (= 20) equal-frequency bins so it can be
# plotted like a categorical column. Work on a copy so train_feat_df is unchanged.
bins = 20
train_feat_df_23 = train_feat_df.copy()
train_feat_df_23['ori__odometer_binned'] = pd.qcut(train_feat_df_23['ori__odometer'], q=bins)

category_columns = [
    'ori__cylinders',
    'ori__manufacturer',
    'ori__condition',
    'ori__fuel',
    'ori__title_status',
    'ori__transmission',
    'ori__drive',
    'ori__size',
    'ori__type',
    'ori__paint_color',
    'ori__state',
    'ori__year',
    'ori__odometer_binned'
]
plot_boxplot_for_categories(train_feat_df_23, category_columns)


In [None]:
# Train a single model on the full training data with the current feature set.
# No eval_set / early stopping is passed here, unlike fit_lgbm.
final_model = lgb.LGBMRegressor(**lgbm_params)
final_model.fit(
    train_feat_df,
    train_df["price"],
    categorical_feature=[],
)
# Do not use this model for inference.



In [None]:
# Score on the full training data. These are the rows the model was fitted on,
# so this is an in-sample figure, not a validation score.
pred_lgbm_train = final_model.predict(train_feat_df)
whole_score = mean_absolute_percentage_error(train_df["price"], pred_lgbm_train)
print(f"Whole Training Data MAPE: {whole_score:.4f}")

In [None]:
def optimize_features_by_importance_and_cv(X, y, cv, params=None, verbose=False):
    """Backward feature elimination guided by model importance and CV score.

    Starting from all columns of ``X``, the current feature set is scored with
    ``fit_lgbm``; the feature with the lowest fold-averaged
    ``feature_importances_`` is then dropped. This repeats while more than one
    feature remains, and the feature set with the lowest score seen is returned.

    Args:
        X: Feature frame; must support ``X.columns`` and ``X[list_of_columns]``.
        y: Target passed through to ``fit_lgbm``.
        cv: Cross-validation argument passed through to ``fit_lgbm``.
        params: Model parameters passed through to ``fit_lgbm``; ``None`` means ``{}``.
        verbose: When true, print each removed feature and the final summary
            (also forwarded to ``fit_lgbm``).

    Returns:
        list: Column names of the best-scoring feature set (lower score is better).

    Notes:
        A feature set of size one is never scored, because the loop stops as
        soon as a single feature is left.
    """
    if params is None:
        params = {}

    # Initial state: every column is a candidate and nothing has been scored yet.
    features = list(X.columns)
    best_score = float('inf')
    best_features = features.copy()

    while len(features) > 1:
        # Train/evaluate on the current feature subset.
        score, _, models = fit_lgbm(X[features], y, cv, params=params, verbose=verbose)

        # Keep the subset if it is the best seen so far.
        if score < best_score:
            best_score = score
            best_features = features.copy()

        # Average the importances over the fold models.
        feature_importance = np.mean([model.feature_importances_ for model in models], axis=0)

        # Drop the least important feature. pop() returns the removed name, so
        # the log below reports the feature that was actually removed (indexing
        # the list after deletion would name a different feature, or fail when
        # the removed feature was the last element).
        least_important_idx = int(np.argmin(feature_importance))
        removed_feature = features.pop(least_important_idx)

        if verbose:
            print(f"Removed feature: {removed_feature}. Number of features left: {len(features)}")

    if verbose:
        print(f"Best score: {best_score}. Number of best features: {len(best_features)}")

    return best_features

# Run the importance/CV-driven feature selection on the full training frame.
best_features = optimize_features_by_importance_and_cv(train_feat_df, train_df["price"], cv, params=lgbm_params, verbose=True)

# Retrain with the selected feature subset; keeps the score, the out-of-fold
# predictions and the fold models.
score_lgbm, oof_lgbm, models_lgbm = fit_lgbm(
    train_feat_df[best_features],
    train_df["price"],
    categorical_cols=[],
    params=lgbm_params,
    cv=cv,
    verbose=False,
)

# Inference: predict the test set with every fold model and average the predictions.
pred_lgbm = np.array([model.predict(test_feat_df[best_features]) for model in models_lgbm])
pred_lgbm = np.mean(pred_lgbm, axis=0)


In [None]:
# # n_fold = 5
# # lgbm_params = {
# #     "objective": "mape",
# #     "metrics": "mape",
# #     "n_estimators": 10000,
# #     "learning_rate": 0.01,
# #     "max_depth": 5,
# #     "num_leaves": 32,
# #     "colsample_bytree": 0.446,
# #     "subsample": 0.339,
# #     "importance_type": "gain",
# #     "random_state": 88,
# # }

# feat_cat_cols = train_feat_df.select_dtypes(include="category").columns.tolist()

# cv = get_cv(train_feat_df, n_fold=5)

# #optuna
# best_params_lgbm = tuning_lgbm(
#     train_feat_df.copy(),
#     train_df,
#     cv=cv,
# #     categorical_cols = feat_cat_cols
# )
# # training
# score_lgbm, oof_lgbm, models_lgbm = fit_lgbm(
#     train_feat_df,
#     y=train_df["price"],
#     categorical_cols=[],
#     params=best_params_lgbm,
#     cv=cv,
#     verbose=False,
# )

# # inference
# pred_lgbm = np.array([model.predict(test_feat_df) for model in models_lgbm])
# pred_lgbm = np.mean(pred_lgbm, axis=0)

In [None]:

# Hyperparameter tuning for CatBoost, followed by training, test inference and a
# simple blend with the LightGBM predictions.
# NOTE(review): feat_cat_cols is only assigned in a commented-out cell within this part
# of the notebook - confirm it is defined earlier, otherwise this cell fails on a fresh kernel.
best_params_cat = tuning_cat(
    train_feat_df.copy(),
    y=train_df["price"],
    cv=cv,
     categorical_cols = feat_cat_cols
)

# NOTE(review): X and y are not defined in this part of the notebook; presumably they are
# the (categorical-converted) training features and train_df["price"] - verify.
score_cat, oof_cat, models_cat = fit_cat(
    X, y, cv=cv, categorical_cols=feat_cat_cols, params=best_params_cat, verbose=False)

# Prepare the test data for prediction.
# NOTE(review): this rebinds test_feat_df, so later cells that read test_feat_df
# (e.g. the LightGBM ensemble) see the converted frame - confirm that is intended.
test_feat_df = convert_categorical_features(test_feat_df, feat_cat_cols)

# inference: average the predictions of the fold models
pred_cat = np.array([model.predict(test_feat_df) for model in models_cat])
pred_cat = np.mean(pred_cat, axis=0)

# Equal-weight blend of the LightGBM and CatBoost test predictions.
pred = (pred_lgbm + pred_cat)/2

In [None]:
# Inspect the categorical column names that were passed to the CatBoost helpers.
feat_cat_cols

In [None]:
# Summary of the training feature frame (columns, dtypes, non-null counts).
train_feat_df.info()

In [None]:
# One-hot encode the 'state' column and append the dummy columns to train_df.
# NOTE(review): train_df is rebound to the widened frame, so running this cell again
# appends the same state_* columns a second time - not idempotent.
state_dummies = pd.get_dummies(train_df['state'], prefix='state')
train_df = pd.concat([train_df, state_dummies], axis=1)

In [None]:


# Run t-SNE to embed the training features into 2 dimensions.
# NOTE(review): TSNE is not imported in the visible import cell and state_labels is not
# defined in this part of the notebook - confirm both exist before this cell runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(train_feat_df)

# Visualization: scatter of the two embedding components, coloured by state_labels.
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=state_labels, cmap='viridis')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(label='State')
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

def preprocess_and_visualize(df):
    """Project the feature frame onto two principal components, coloured by manufacturer.

    Steps: label-encode ``ori__manufacturer``, one-hot encode the remaining
    category-dtype columns, fill missing values with column medians,
    standardise, reduce to two PCA components and draw a scatter plot.

    Args:
        df: Feature frame containing an ``ori__manufacturer`` column. The column
            is overwritten with its integer label codes, so pass a copy if the
            original frame must stay untouched.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # 0. Label-encode the manufacturer column (written back onto df).
    df['ori__manufacturer'] = LabelEncoder().fit_transform(df['ori__manufacturer'])

    # 1. One-hot encode every column that is still of category dtype.
    category_cols = df.select_dtypes(include='category').columns.tolist()
    encoded = pd.get_dummies(df, columns=category_cols)

    # 2. Fill missing values with the per-column median.
    column_medians = encoded.median(numeric_only=True)
    encoded = encoded.fillna(column_medians)

    # 3. Standardise, then 4. reduce to two dimensions.
    standardized = StandardScaler().fit_transform(encoded)
    components = PCA(n_components=2).fit_transform(standardized)

    # Visualization
    plt.figure(figsize=(10, 8))
    plt.scatter(
        components[:, 0],
        components[:, 1],
        c=df['ori__manufacturer'],
        cmap='viridis',
    )
    plt.title('PCA Visualization by manufacturer')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(label='manufacturer')
    plt.show()

# Call the function to draw the plot; a copy is passed because the function
# overwrites the 'ori__manufacturer' column of the frame it receives.
preprocess_and_visualize(train_feat_df.copy())


In [None]:
# train_feat_df.drop(["price"], axis=1)
# Summary of the training feature frame (columns, dtypes, non-null counts).
train_feat_df.info()

In [None]:
from scipy.optimize import minimize

# Function that optimises the blending weights
def ensemble_weights_optimization(oof_preds, y):
    """Find convex blending weights that minimise the MAPE of the blended predictions.

    The weights are constrained to lie in ``[0, 1]`` and to sum to one, and are
    searched with SLSQP starting from equal weights.

    Args:
        oof_preds: Sequence of out-of-fold prediction arrays, one per model,
            each with as many entries as ``y``.
        y: True target values.

    Returns:
        numpy.ndarray: One weight per model, non-negative and summing to one.
        Equal weights are returned when the optimiser output is unusable
        (non-finite or all zero) or scores worse than equal weighting.

    Raises:
        ValueError: If ``oof_preds`` is empty or a prediction length does not
            match ``y``.
    """
    import warnings

    n_models = len(oof_preds)
    if n_models == 0:
        raise ValueError("oof_preds must contain at least one prediction array")

    # One row per model, so a blend is a single vector-matrix product.
    stacked = np.vstack([np.asarray(pred, dtype=float).ravel() for pred in oof_preds])
    target = np.asarray(y, dtype=float).ravel()
    if stacked.shape[1] != target.shape[0]:
        raise ValueError(
            f"prediction length {stacked.shape[1]} does not match target length {target.shape[0]}"
        )

    equal_weights = np.full(n_models, 1.0 / n_models)
    if n_models == 1:
        # Only one feasible point under the sum-to-one constraint.
        return equal_weights

    def ensemble_error(weights):
        blended = np.asarray(weights, dtype=float) @ stacked
        return mean_absolute_percentage_error(target, blended)

    cons = {'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)}
    bounds = [(0.0, 1.0)] * n_models
    result = minimize(ensemble_error, equal_weights, method='SLSQP', bounds=bounds, constraints=cons)

    # MAPE is piecewise linear, so SLSQP can stop without reporting success even
    # though the point it returns is usable; surface that instead of ignoring it.
    if not result.success:
        warnings.warn(
            f"SLSQP did not report success ({result.message}); validating the returned weights",
            RuntimeWarning,
        )

    weights = np.asarray(result.x, dtype=float)
    if not np.all(np.isfinite(weights)):
        return equal_weights

    # Remove tiny bound/constraint violations left by the solver.
    weights = np.clip(weights, 0.0, 1.0)
    total = weights.sum()
    if total <= 0.0:
        return equal_weights
    weights = weights / total

    # Never return something worse than the starting point (also catches NaN errors).
    if not ensemble_error(weights) <= ensemble_error(equal_weights):
        return equal_weights
    return weights

# Main ensemble function
def ensemble_lgbm(train_feat_df, test_feat_df, y, cv, seed_range, max_depth_range, num_leaves_range, colsample_bytree_range, subsample_range):
    """Blend LightGBM models over a hyperparameter grid, then average over seeds.

    For every seed, each combination of ``max_depth``, ``num_leaves``,
    ``colsample_bytree`` and ``subsample`` is trained with ``fit_lgbm``. The
    candidates of one seed are combined with the weights returned by
    ``ensemble_weights_optimization`` (computed on the out-of-fold predictions),
    and the per-seed blends are finally averaged with equal weight.

    Args:
        train_feat_df: Training features passed to ``fit_lgbm``.
        test_feat_df: Test features passed to each fold model's ``predict``.
        y: Training target.
        cv: Cross-validation argument passed to ``fit_lgbm``.
        seed_range: Iterable of ``random_state`` values.
        max_depth_range: Iterable of ``max_depth`` values.
        num_leaves_range: Iterable of ``num_leaves`` values.
        colsample_bytree_range: Iterable of ``colsample_bytree`` values.
        subsample_range: Iterable of ``subsample`` values.

    Returns:
        tuple: ``(final_oof_pred, final_test_pred)``, the seed-averaged
        out-of-fold and test predictions.
    """

    def _weighted_sum(weights, preds):
        # Accumulate weight * prediction into a zero array shaped like the first prediction.
        blended = np.zeros_like(preds[0])
        for w, p in zip(weights, preds):
            blended += w * p
        return blended

    per_seed_oof = []
    per_seed_test = []

    for seed in seed_range:
        # Same visiting order as four nested loops: depth -> leaves -> colsample -> subsample.
        grid = [
            (depth, leaves, colsample, subsample_ratio)
            for depth in max_depth_range
            for leaves in num_leaves_range
            for colsample in colsample_bytree_range
            for subsample_ratio in subsample_range
        ]

        candidate_oof = []
        candidate_test = []
        for depth, leaves, colsample, subsample_ratio in grid:
            params = {
                "objective": "mape",
                "metrics": "mape",
                "n_estimators": 10000,
                "learning_rate": 0.01,
                "max_depth": depth,
                "num_leaves": leaves,
                "colsample_bytree": colsample,
                "subsample": subsample_ratio,
                "importance_type": "gain",
                "random_state": seed,
            }
            _, oof, fold_models = fit_lgbm(
                train_feat_df,
                y=y,
                categorical_cols=[],
                params=params,
                cv=cv,
                verbose=False,
            )
            # Test prediction of one candidate = mean over its fold models.
            fold_test_preds = np.array([m.predict(test_feat_df) for m in fold_models])
            candidate_oof.append(oof)
            candidate_test.append(np.mean(fold_test_preds, axis=0))

        # Blend this seed's candidates with the weights optimised on the OOF predictions.
        weights = ensemble_weights_optimization(candidate_oof, y)
        per_seed_oof.append(_weighted_sum(weights, candidate_oof))
        per_seed_test.append(_weighted_sum(weights, candidate_test))

    # Average over seeds.
    return np.mean(per_seed_oof, axis=0), np.mean(per_seed_test, axis=0)

# Hyperparameter ranges for the grid.
# With the values below: 3 * 1 * 2 * 2 = 12 configurations per seed, 2 seeds,
# i.e. 24 calls to fit_lgbm inside ensemble_lgbm.
seed_range = [88, 42]#, 123
max_depth_range = [5, 6, 7]
num_leaves_range = [32]#, 64
colsample_bytree_range = [0.4, 0.5]
subsample_range = [0.3, 0.4]

# Call the ensemble function.
final_oof_pred, final_test_pred = ensemble_lgbm(
    train_feat_df,
    test_feat_df,
    y=train_df["price"],
    cv=cv,
    seed_range=seed_range,
    max_depth_range=max_depth_range,
    num_leaves_range=num_leaves_range,
    colsample_bytree_range=colsample_bytree_range,
    subsample_range=subsample_range
)

# Check the result: MAPE of the blended out-of-fold predictions against the training target.
# The blending weights were optimised on these same OOF predictions.
score = mean_absolute_percentage_error(train_df["price"], final_oof_pred)
print(f"Final Ensemble Score: {score:.4f}")

In [None]:
# from scipy.optimize import minimize

# def ensemble_error(weights):
#     ensemble_pred = weights[0] * oof_cat + weights[1] * oof_lgbm
#     return mean_absolute_percentage_error(train_df["price"], ensemble_pred)

# cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
# bounds = [(0, 1)] * 2

# # 初期値の候補を用意
# initial_weights_candidates = [
#     [0.1, 0.9], [0.3, 0.7], [0.5, 0.5], [0.7, 0.3], [0.9, 0.1],
# #     [0.02, 0.98], [0.04, 0.96], 
#     [0.06, 0.94], [0.08, 0.92], [0.12, 0.88]]

# # 最良の結果を保存する変数
# best_result = None
# best_error = float('inf')

# # 初期値ごとに最適化を行う
# for initial_weights in initial_weights_candidates:
#     result = minimize(ensemble_error, initial_weights, method='SLSQP', bounds=bounds, constraints=cons)
#     print(f"Initial weights: {initial_weights}, MAPE: {result.fun}, Success: {result.success}")
#     if result.success and result.fun < best_error:
#         best_error = result.fun
#         best_result = result

# # 最良の結果を表示
# print("Best result:")
# print(best_result)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def visualize_by_category(df):
    """Scatter-plot a 2-D PCA of the non-category columns, once per category column.

    The columns that are not of category dtype are median-imputed, standardised
    and projected onto two principal components. One figure is drawn for each
    category-dtype column, using that column's values (as strings) as the hue.

    Args:
        df: Feature frame with category-dtype columns to colour by. It is not
            modified.

    Returns:
        None. Each figure is shown with ``plt.show()``; nothing is drawn when
        ``df`` has no category-dtype columns.
    """
    # Keep only the non-category columns.
    numerical_df = df.select_dtypes(exclude='category')

    # Fill missing values with the column medians. fillna() returns a new frame,
    # which avoids an in-place write on a selection derived from df.
    numerical_df = numerical_df.fillna(numerical_df.median())

    # Standardise.
    numerical_scaled = StandardScaler().fit_transform(numerical_df)

    # The projection depends only on the numeric columns, so it is the same for
    # every categorical column: fit it once instead of once per plot.
    pca_result = PCA(n_components=2).fit_transform(numerical_scaled)

    # Columns to colour by.
    categorical_columns = df.select_dtypes(include='category').columns.tolist()

    for col in categorical_columns:
        # Frame for plotting: the two components plus the category labels as strings.
        plot_df = pd.DataFrame({
            'Principal Component 1': pca_result[:, 0],
            'Principal Component 2': pca_result[:, 1],
            col: df[col].astype(str)
        })

        # Visualization
        plt.figure(figsize=(10, 8))
        sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue=col, data=plot_df, palette='viridis')
        plt.title(f'PCA Visualization by {col}')
        plt.show()

# Call the function to draw one PCA scatter plot per category-dtype column of train_feat_df.
visualize_by_category(train_feat_df)
