In [6]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Read the training data and the data to be predicted for the submission.
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# Separate the "price_range" target from the remaining columns and hold out
# 20% of the rows.
# NOTE(review): every non-target column becomes a feature; if train.csv also
# carries an "id" column it is fed to the models -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns="price_range"),
    train["price_range"],
    test_size=0.2,
    random_state=1,
)

# Random-forest-based feature selection.
# SelectFromModel (default prefit=False) fits its own clone of the estimator,
# so fitting the forest separately beforehand only trains the same model twice.
# Fit once through the selector instead.
rf_model = RandomForestClassifier(n_estimators=200, random_state=1)
sfm = SelectFromModel(rf_model)
X_train_sfm = sfm.fit_transform(X_train, y_train)

# Keep `rf_model` pointing at a fitted forest (the selector's fitted clone),
# as it was when the forest was fitted explicitly.
rf_model = sfm.estimator_

# Apply the same column selection to the hold-out split.
X_test_sfm = sfm.transform(X_test)

# Standardise the selected features; the scaler is fitted on the training
# split only and then reused for the hold-out split.
scaler = StandardScaler().fit(X_train_sfm)
X_train_scaled = scaler.transform(X_train_sfm)
X_test_scaled = scaler.transform(X_test_sfm)

# Engineered feature: product of the first two selected columns, computed from
# the unscaled values and appended as the last column of the scaled matrix.
train_interaction = X_train_sfm[:, 0] * X_train_sfm[:, 1]
holdout_interaction = X_test_sfm[:, 0] * X_test_sfm[:, 1]
X_train_new = np.column_stack((X_train_scaled, train_interaction))
X_test_new = np.column_stack((X_test_scaled, holdout_interaction))

# Soft-voting ensemble of a random forest and an XGBoost classifier, trained
# on the engineered feature matrix.
rf_model_new = RandomForestClassifier(n_estimators=200, random_state=1)
xgb_model = XGBClassifier(n_estimators=200, random_state=1)
voting_model = VotingClassifier(
    estimators=[('rf_new', rf_model_new), ('xgb', xgb_model)],
    voting='soft',
)
voting_model.fit(X_train_new, y_train)

# Score the ensemble on the hold-out split; without this the split (and the
# imported metrics) were never used and the model went out unvalidated.
y_test_pred = voting_model.predict(X_test_new)
print("hold-out accuracy:", accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

# Predict on the submission data.
# Select the training feature columns by name so the fitted selector receives
# exactly the columns (and column order) it was fitted on, even if test.csv
# has extra or reordered columns. A missing column fails here with a KeyError
# naming it, rather than silently misaligning features.
test_features = test[X_train.columns]
test_sfm = sfm.transform(test_features)
test_scaled = scaler.transform(test_sfm)

# Same engineered feature as for training: product of the first two selected
# (unscaled) columns appended after the scaled ones.
test_new = np.c_[test_scaled, test_sfm[:, 0] * test_sfm[:, 1]]
y_pred = voting_model.predict(test_new)

# Assemble the submission (id + predicted price_range) and write it as CSV
# with neither a header row nor the index column.
submission_columns = {"id": test["id"], "price_range": y_pred}
output = pd.DataFrame(submission_columns)
output.to_csv(
    "./submission/submission_rf-xgb_v1.csv",
    header=False,
    index=False,
)

In [5]:
# Display the submission frame (columns: id, price_range) for a visual check.
output

Unnamed: 0,id,price_range
0,0,1
1,1,0
2,2,3
3,3,1
4,4,0
...,...,...
795,795,0
796,796,1
797,797,3
798,798,2
