# ベースラインモデル（線形回帰）

このノートブックでは、前処理済みデータを用いて線形回帰によるベースラインモデルを構築し、評価します。

- 目的変数: `price_actual`
- モデル: 線形回帰（LinearRegression）
- 評価指標: RMSE


## 1. ライブラリのインポートとデータ読み込み

In [1]:
# LightGBM特有のエラー対策
#!brew install libomp
#!uv pip uninstall lightgbm

In [2]:
!uv pip install lightgbm
import lightgbm as lgb

[2mAudited [1m1 package[0m [2min 20ms[0m[0m


In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
import japanize_matplotlib

# データディレクトリ
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data'
print(DATA_DIR)
# 前処理済みデータの読み込み
train = pd.read_csv(DATA_DIR / 'train_processed.csv', index_col=0)
test = pd.read_csv(DATA_DIR / 'test_processed.csv', index_col=0)

print('train shape:', train.shape)
print('test shape:', test.shape)

/Users/m0122wt/Desktop/02.プライベート/01.ノウハウ/07.データ分析/notebook/signate_smbc_202506/data
train shape: (26280, 113)
test shape: (8760, 112)


## 2. 特徴量・目的変数の設定

In [4]:
# 目的変数
target_col = 'price_actual'

# 説明変数（目的変数とtime列以外）
drop_cols = ['time', target_col] if target_col in train.columns else ['time']
feature_cols = [col for col in train.columns if col not in drop_cols]

X = train[feature_cols]
y = train[target_col] if target_col in train.columns else train.iloc[:, -1]  # 念のため

print('Features:', feature_cols)
print('Target:', target_col)
print('X shape:', X.shape)
print('y shape:', y.shape)

Features: ['generation_fossil_brown_coal/lignite', 'generation_fossil_gas', 'generation_fossil_hard_coal', 'generation_fossil_oil', 'generation_hydro_pumped_storage_consumption', 'generation_hydro_run_of_river_and_poundage', 'generation_hydro_water_reservoir', 'generation_nuclear', 'generation_other', 'generation_other_renewable', 'generation_solar', 'generation_waste', 'generation_wind_onshore', 'total_load_actual', 'valencia_temp', 'valencia_temp_min', 'valencia_temp_max', 'valencia_pressure', 'valencia_humidity', 'valencia_wind_speed', 'valencia_wind_deg', 'valencia_rain_1h', 'valencia_rain_3h', 'valencia_snow_3h', 'valencia_clouds_all', 'valencia_weather_id', 'valencia_weather_main', 'valencia_weather_description', 'valencia_weather_icon', 'madrid_temp', 'madrid_temp_min', 'madrid_temp_max', 'madrid_pressure', 'madrid_humidity', 'madrid_wind_speed', 'madrid_wind_deg', 'madrid_rain_1h', 'madrid_rain_3h', 'madrid_snow_3h', 'madrid_clouds_all', 'madrid_weather_id', 'madrid_weather_mai

## 3. 学習・検証データ分割

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train:', X_train.shape, 'X_valid:', X_valid.shape)

# LightGBMで学習するためのデータ形式に変換
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

X_train: (21024, 111) X_valid: (5256, 111)


## 4. 線形回帰モデルの学習と予測

In [6]:
#モデルパラメータの設定
params = {'metric' : 'rmse'}
model = lgb.train(params, dtrain)

# 検証データで予測
y_pred = model.predict(X_valid)

# RMSEで評価
rmse = root_mean_squared_error(y_valid, y_pred)
print('Validation RMSE:', rmse)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13051
[LightGBM] [Info] Number of data points in the train set: 21024, number of used features: 95
[LightGBM] [Info] Start training from score 56.098172
Validation RMSE: 4.178685995274353


## 5. テストデータへの予測と保存

In [7]:
test.index

Index([-1.3311267651068943, -1.2978361024799772, -1.2867392149376713,
        -1.320029877564589,  -1.253448552310754,  -1.320029877564589,
        -1.253448552310754,  -1.320029877564589, -1.2867392149376713,
       -1.2756423273953656,
       ...
        -1.364417427733812, -1.3866112028184234,  -1.353320540191506,
       -1.4531925280722582, -1.5419676284107044, -1.4753863031568697,
       -1.3755143152761176, -1.3755143152761176, -1.2867392149376713,
       -1.2756423273953656],
      dtype='float64', name='generation_biomass', length=8760)

In [10]:
# テストデータの予測
X_test = test[feature_cols]
test_pred = model.predict(X_test)

# 予測結果の保存
submission = test[['time']].copy()
submission['price_actual_pred'] = test_pred
submission.to_csv(DATA_DIR / 'submission_baseline.csv', index=False, header=False)
print('Saved: submission_baseline.csv')

Saved: submission_baseline.csv


In [None]:
submission