In [18]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)


ERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    xgboost from https://files.pythonhosted.org/packages/66/88/11cbea9c7a4ebae26c16ef20c14bfca3fa5aded4b0de27ccd8482429e7bf/xgboost-1.6.1-py3-none-win_amd64.whl#sha256=3adcb7e4ccf774d5e0128c01e5c381303c3799910ab0f2e996160fe3cd23b7fc:
        Expected sha256 3adcb7e4ccf774d5e0128c01e5c381303c3799910ab0f2e996160fe3cd23b7fc
             Got        e46c3c309b116b734d8f6fab11a4fe49fd44fbb1e18d1f7ebc6edc8badd8f36c



In [19]:
import numpy as np 
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import RandomOverSampler
from matplotlib import pyplot as plt

ModuleNotFoundError: No module named 'xgboost'

In [1]:
def display_all(df):
    """Display ``df`` with pandas' row and column display limits raised to 1000.

    The limits are changed only for the duration of the call; pandas restores
    the previous option values when the context manager exits.
    """
    widened = pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    )
    with widened:
        display(df)

## Read Training Data

In [None]:
# Load the training table; decimal="," tells pandas to read a comma as the
# decimal separator.
data = pd.read_csv("train.csv", decimal=",")

# Plain ndarray copy of the table, used for positional column slicing below.
ds = data.to_numpy()
ds

In [None]:
# Column 94 holds the label; columns 1..93 are the features. Column 0 is left
# out (presumably a row id; confirm against the CSV header).
y_trdata = ds[:, 94]
x_trdata = ds[:, 1:94]

In [None]:
# Replace the raw labels with integer codes. The fitted encoder is kept in
# `le` so the codes can be mapped back to the original labels.
le = LabelEncoder()
le.fit(y_trdata)
y_trdata = le.transform(y_trdata).astype(int)
y_trdata

In [None]:
seed = 326
ts = 0.20

# One stratified shuffle split: `ts` of the rows go to validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=ts, random_state=seed)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(sss.split(x_trdata, y_trdata))

x_train, y_train = x_trdata[train_idx], y_trdata[train_idx]
x_val, y_val = x_trdata[test_idx], y_trdata[test_idx]

## Quick Check on Data

In [None]:
# Total number of missing cells across the whole training table.
data.isna().sum().sum()

In [None]:
# Bar chart of how many rows fall into each value of the "target" column.
data["target"].value_counts().plot(kind="bar")

In [None]:
# Summary statistics for the training table, shown without column truncation.
display_all(df=data.describe())

## XGBoost Baseline

In [None]:
# Baseline: XGBoost with default hyper-parameters, fitted on the training split.
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(X=x_train, y=y_train)

In [None]:
# Class probabilities on the validation split, scored with log loss.
pred = xgb.predict_proba(x_val)
print(
    "Validation data logloss: {}".format(
        log_loss(y_true=y_val, y_pred=pred)
    )
)

In [None]:
# Hard class predictions on the validation split (note: rebinds `pred`).
pred = xgb.predict(x_val)
accu = accuracy_score(y_true=y_val, y_pred=pred)
print("Accuracy: %.2f%%" % (accu * 100.0))

## XGBoost Parameter Tuning

#### learning_rate

In [None]:
# Fit one model per candidate learning rate and record its validation log
# loss; lr_scores[i] corresponds to learning_rates[i].
lr_scores = []
learning_rates = [0.345, 0.350, 0.355]

for lr in learning_rates:
    tune_xgb = XGBClassifier(use_label_encoder=False, learning_rate=lr)
    tune_xgb.fit(x_train, y_train)
    pred = tune_xgb.predict_proba(x_val)
    # Score once and reuse the value for both the record and the printout.
    score = log_loss(y_val, pred)
    lr_scores.append(score)
    print("learning_rate=%.3f, logloss: %.5f" % (lr, score))

In [None]:
# Validation log loss for each candidate learning rate.
plt.plot(learning_rates, lr_scores, 'o-')
# The label must be a string; passing the log_loss function itself would put
# its repr on the axis.
plt.ylabel("log_loss")
plt.xlabel("learning_rate")
# Lower log loss is better, so the best candidate is the one at argmin.
print("best learning_rate {}".format(learning_rates[np.argmin(lr_scores)]))

## XGBoost Fine-tune Params

In [None]:
# Keyword arguments for the tuned classifier.
# 'multi:softprob' is XGBoost's multiclass objective that outputs per-class
# probabilities (the original spelling 'multi:sofprob' is not a valid name).
pa = {'objective': 'multi:softprob',
      'learning_rate': 0.35,
     }

In [None]:
# Rebuild the classifier from the tuned parameters (rebinds `xgb`) and wrap it
# in 5-fold isotonic probability calibration before fitting.
xgb = XGBClassifier(use_label_encoder=False, **pa)
model = CalibratedClassifierCV(xgb, method="isotonic", cv=5)
model.fit(x_train, y_train)

In [None]:
# Validation log loss of the calibrated model.
pred = model.predict_proba(x_val)
print(
    "Validation data logloss: {}".format(
        log_loss(y_true=y_val, y_pred=pred)
    )
)

## Predict with Test Data

In [None]:
# Load the test table with the same comma-as-decimal setting as training.
# NOTE(review): the training CSV is read from the working directory while this
# one is read from ../input/...; confirm both paths exist in the target setup.
test = pd.read_csv("../input/otto-group-product-classification-challenge/test.csv", decimal=",")

# Rebinds `ds`: from here on it holds the test table, not the training table.
ds = test.to_numpy()
ds

In [None]:
# (rows, columns) of the test table.
test.shape

In [None]:
# Same column slice (1..93) that was used for the training features.
x_test = ds[:, 1:94]
x_test

In [None]:
# Class probabilities for the test rows from the calibrated model
# (rebinds `pred`, which previously held validation predictions).
pred = model.predict_proba(x_test)
pred

In [None]:
# One probability column per class, in the column order of `pred`.
output = pd.DataFrame(
    pred,
    columns=[
        "Class_1", "Class_2", "Class_3",
        "Class_4", "Class_5", "Class_6",
        "Class_7", "Class_8", "Class_9",
    ],
)

# Shift the default 0-based index so it starts at 1, and name it "id"
# (assumes the test rows are numbered 1..n in file order; confirm).
output.index = output.index + 1
output.index.name = "id"
output

In [None]:
# Write the submission file; the index is written too, as the "id" column.
output.to_csv("./otto_submission.csv")