In [None]:
!pip install numpy pandas scikit-learn xgboost

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier

In [None]:
# Hyperparameters of the XGBoost models that make up the ensemble. These
# values were produced via Bayesian hyperparameter optimization in Azure
# AutoML.
#
# Each key maps to a list with one entry per model: position i of every list
# describes model i (create_ensemble reads the lists column by column).
models_parameters = {
    "colsample_bylevel": [1, 1, 1, 1, 0.9, 1, 1, 1],
    "colsample_bytree": [0.6, 0.5, 0.7, 0.7, 1, 0.8, 0.9, 0.5],
    # Same values as "learning_rate" below (XGBoost documents `eta` as an
    # alias of `learning_rate`).
    "eta": [0.3, 0.4, 0.4, 0.3, 0.2, 0.3, 0.3, 0.5],
    "gamma": [1, 1, 0.1, 0, 0.01, 0, 0, 0],
    "learning_rate": [0.3, 0.4, 0.4, 0.3, 0.2, 0.3, 0.3, 0.5],
    "max_depth": [7, 9, 6, 10, 6, 9, 7, 9],
    "max_leaves": [31, 127, 31, 31, 0, 7, 0, 3],
    "n_estimators": [600, 600, 400, 800, 600, 400, 400, 400],
    "reg_alpha": [0.2083, 0, 2.1875, 1.25, 2.2916, 0, 1.77083, 2.5],
    "reg_lambda": [0.4167, 2.5, 1.25, 0.52083, 1.6667, 1.35417, 0.83, 0.3125],
    "subsample": [1, 1, 1, 1, 1, 0.6, 0.9, 0.5],
    "tree_method": ["auto", "auto", "auto", "hist", "auto", "auto", "auto", "auto"],
    # Per-model voting weight handed to the VotingClassifier in
    # create_ensemble; the weights sum to 1.
    "weights": [2/11, 2/11, 2/11, 1/11, 1/11, 1/11, 1/11, 1/11]
}

def create_model(kwargs):
  """Build one unfitted XGBoost classifier for the ensemble.

  Args:
    kwargs: Dict of keyword arguments forwarded to ``XGBClassifier`` in
      addition to the settings every model shares.

  Returns:
    The configured ``XGBClassifier`` instance.
  """
  # Settings shared by every model, independent of the tuned hyperparameters.
  shared_settings = dict(
      base_score=0.5,
      importance_type='gain',
      objective='multi:softprob',
      random_state=42,
      validate_parameters=1,
      verbosity=3,
  )
  classifier = XGBClassifier(**shared_settings, **kwargs)
  return classifier

def create_ensemble():
  """Build the weighted soft-voting ensemble described by models_parameters.

  One XGBoost classifier is created per position of the lists in
  ``models_parameters``; position i of every hyperparameter list configures
  the estimator named ``model_i``.

  Returns:
    An unfitted ``VotingClassifier`` that soft-votes over the XGBoost models
    using ``models_parameters["weights"]`` as the per-model weights.
  """
  voting_weights = models_parameters["weights"]

  estimators = []
  # Derive the number of models from the table instead of hard-coding it, so
  # adding or removing a column keeps the ensemble consistent.
  for i in range(len(voting_weights)):
    # "weights" configures the VotingClassifier, not XGBoost; keep it out of
    # the per-model keyword arguments so it is not forwarded to
    # XGBClassifier as an unknown parameter.
    kwargs = {
        key: values[i]
        for key, values in models_parameters.items()
        if key != "weights"
    }
    estimators.append((f"model_{i}", create_model(kwargs)))

  return VotingClassifier(
    estimators=estimators,
    flatten_transform=False,
    verbose=True,
    voting="soft",
    weights=voting_weights
  )

# Instantiate the (still unfitted) ensemble; it is trained in a later cell.
model = create_ensemble()

In [None]:
# Dataset preprocessing: load the training data, hold out a stratified
# validation split, then one-hot encode the categorical columns and
# standardize the numerical ones.

X = pd.read_csv("train_values.csv", index_col="building_id")
y = pd.read_csv("train_labels.csv", index_col="building_id")

# Seed the split so the validation set (and therefore the reported score) is
# the same after every "Restart & Run All"; the models are seeded with 42 too.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

categorical_features = X.select_dtypes(include=object).columns.tolist()
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

preprocessor = ColumnTransformer(
    transformers = [
        # A category that is absent from the training split but present in
        # the validation/test data is encoded as all zeros instead of making
        # transform() raise.
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("numerical", StandardScaler(), numerical_features)
    ],
    remainder="passthrough"
)

# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformer on the validation split to avoid leaking validation statistics.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_validation_preprocessed = preprocessor.transform(X_validation)

In [None]:
# Fit the ensemble on the preprocessed training split; the label frame is
# flattened into a 1-D array first.
train_labels = y_train.values.flatten()
model.fit(X_train_preprocessed, train_labels)

# Score the held-out validation split with the micro-averaged F1 metric.
# The bare last expression is displayed as the cell output.
validation_labels = y_validation.values.flatten()
validation_predictions = model.predict(X_validation_preprocessed)
f1_score(validation_labels, validation_predictions, average="micro")

In [7]:
# Create submission.

X_test = pd.read_csv("test_values.csv", index_col="building_id")
# Reuse the preprocessor fitted on the training split.
X_test_preprocessed = preprocessor.transform(X_test)

predictions = model.predict(X_test_preprocessed)

submission_format = pd.read_csv("submission_format.csv", index_col='building_id')

# Reuse the template's column names and building_id index for the
# predictions. NOTE(review): predictions follow the row order of
# test_values.csv; this assumes submission_format.csv lists the building_ids
# in the same order -- confirm.
submission = pd.DataFrame(
    data=predictions,
    columns=submission_format.columns,
    index=submission_format.index
)

# Write the building_id index as the first column. Dropping it
# (index=False) would leave rows that cannot be matched to a building.
submission.to_csv("submission.csv")