In [5]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# 特徴量と目的変数に分ける
X_train, X_test, y_train, y_test = train_test_split(train.drop("price_range", axis=1), train["price_range"], test_size=0.2, random_state=1)

# ランダムフォレストのモデルをフィット
rf_model = RandomForestClassifier(n_estimators=200, random_state=1)
rf_model.fit(X_train, y_train)

# 特徴量の選択
sfm = SelectFromModel(rf_model)
X_train_sfm = sfm.fit_transform(X_train, y_train)
X_test_sfm = sfm.transform(X_test)

# 特徴量のスケーリング
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sfm)
X_test_scaled = scaler.transform(X_test_sfm)

# 新しい特徴量の生成
X_train_new = np.c_[X_train_scaled, X_train_sfm[:, 0]*X_train_sfm[:, 1]]
X_test_new = np.c_[X_test_scaled, X_test_sfm[:, 0]*X_test_sfm[:, 1]]

# モデルのアンサンブル
rf_model_new = RandomForestClassifier(n_estimators=200, random_state=1)
xgb_model = XGBClassifier(n_estimators=200, random_state=1)
voting_model = VotingClassifier(estimators=[('rf_new', rf_model_new), ('xgb', xgb_model)], voting='soft')
voting_model.fit(X_train_new, y_train)

# 予測と評価
y_pred = voting_model.predict(X_test_new)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 予測結果の出力
output = pd.DataFrame({"id": test.index, "price_range": y_pred})
output.to_csv("./submission/submission_rf-xgb_v1.csv", index=False)

Accuracy: 0.45
Confusion Matrix:
 [[17  8  5 10]
 [ 9 24 11 25]
 [ 5  9 50  9]
 [11 20 10 17]]
[[ 1.16486040e-01  6.39028983e-01  1.20300454e-01 ... -6.92709305e-01
  -1.43111017e+00  1.88105389e+03]
 [ 2.44984438e-01 -9.37539856e-01  1.37934188e+00 ... -1.16085498e+00
   7.05237496e-01  5.79064058e+02]
 [ 1.22608707e+00 -9.37531153e-01  1.37934188e+00 ... -1.16085498e+00
  -6.54256472e-01  8.60876992e+02]
 ...
 [ 1.54906953e+00  1.75967216e+00 -8.52595192e-01 ...  1.17987340e+00
   1.22597224e-01  4.98273555e+03]
 [ 8.16281369e-01  1.66763814e+00 -1.59657422e+00 ... -6.92709305e-01
   1.28787777e+00  3.77585245e+03]
 [-6.00673936e-01 -9.37562302e-01  1.37934188e+00 ... -1.16085498e+00
  -6.54256472e-01  3.36154567e+02]]


ValueError: All arrays must be of the same length

ValueError: array length 240 does not match index length 800

In [2]:
output

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
