<a href="https://colab.research.google.com/github/wileyw/DeepLearningDemos/blob/master/TabularXGBoost/TabularDataXGBoostTutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Update to latest xgboost
!python -m pip install --upgrade xgboost

# Toy tutorial: XGBoost regression on the Boston housing data ([DataCamp tutorial](https://www.datacamp.com/tutorial/xgboost-in-python))

In [None]:
# `load_boston` was removed in scikit-learn 1.2, where importing it raises
# ImportError. Use it when it is still available; otherwise rebuild an
# equivalent object from the original data source so the cells that read
# `boston.data`, `boston.target`, `boston.feature_names` and `boston.DESCR`
# keep working unchanged.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    # In the raw file each record is split across two physical lines after a
    # 22-line header: the even rows hold the first 11 values, the odd rows
    # hold the remaining 2 features followed by the target.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]),
        DESCR=(
            "Boston house prices dataset, rebuilt from "
            "http://lib.stat.cmu.edu/datasets/boston because "
            "sklearn.datasets.load_boston is unavailable in this "
            "scikit-learn version."
        ),
    )
print(boston.keys())

In [None]:
# Dimensions of the feature array as (rows, columns).
print(boston.data.shape)

In [None]:
# Feature names; a later cell uses them as the DataFrame's column labels.
print(boston.feature_names)

In [None]:
# Print the description text bundled with the dataset object.
print(boston.DESCR)

In [None]:
import pandas as pd

# Wrap the raw feature array in a DataFrame, labelling each column with its
# feature name in one step.
data = pd.DataFrame(boston.data, columns=boston.feature_names)

In [None]:
# Append the regression target as a new column. It becomes the last column,
# which the later X/y split (everything but the last column vs. the last
# column) relies on.
data['PRICE'] = boston.target

In [None]:
# Summarize the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [None]:
# Features are every column except the last; the target is the last column.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# DMatrix over the full dataset (not just a train split); the
# cross-validation and xgb.train cells further down consume it.
data_dmatrix = xgb.DMatrix(X, label=y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Squared-error regressor with a linear booster.
# 'reg:squarederror' is the current name of the objective formerly spelled
# 'reg:linear'; the old alias is deprecated, and this notebook upgrades to
# the latest xgboost in its first cell.
# NOTE(review): colsample_bytree and max_depth are tree-booster parameters;
# with booster='gblinear' they are presumably ignored -- confirm whether the
# linear booster is intended here.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='gblinear',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)

In [None]:
# Fit the regressor on the training split.
xg_reg.fit(X_train,y_train)

# Predict on the held-out test split; the next cell scores these predictions.
preds = xg_reg.predict(X_test)

In [None]:
# Root-mean-squared error of the test-split predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=preds))
print("RMSE: %f" % rmse)

In [None]:
# Run k-fold validation
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias for the same
# squared-error objective. No booster is specified here, so xgboost's default
# booster is used (unlike the XGBRegressor cell above).
params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}

# 3-fold CV for up to 50 rounds, stopping early when the RMSE metric has not
# improved for 10 consecutive rounds; the result is a pandas DataFrame.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

In [None]:
# Show the first rows of the cross-validation results.
cv_results.head()

In [None]:
# Mean test RMSE from the last row of the cross-validation results.
print((cv_results["test-rmse-mean"]).tail(1))

In [None]:
# Train for 10 boosting rounds on the full dataset with the same params used
# for cross-validation.
# NOTE(review): this rebinds `xg_reg`, which earlier held the XGBRegressor,
# to the object returned by xgb.train; the plotting cells below use this one.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [None]:
import matplotlib.pyplot as plt

# Size the figure before drawing. The previous version set
# rcParams['figure.figsize'] after plot_tree had already created its figure,
# so the setting had no effect on this plot and carried over to the next
# figure instead. An explicit Axes also avoids mutating global rcParams.
fig, ax = plt.subplots(figsize=(30, 30))
# The first tree (index 0) is plotted by default, so no tree index is passed.
xgb.plot_tree(xg_reg, ax=ax)
plt.show()

In [None]:
# Size the figure before drawing. Setting rcParams['figure.figsize'] after
# plot_importance has already created its figure cannot affect this plot, so
# the size is applied through an explicit Axes instead of global rcParams.
fig, ax = plt.subplots(figsize=(5, 5))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()

# XGBoost Eye Movement Dataset

In [None]:
# Download eye movements dataset (https://www.openml.org/search?type=data&sort=runs&id=1044&status=active )
!wget https://www.openml.org/data/download/53927/eye_movements.arff

In [None]:
import xgboost as xgb
# Display the installed xgboost version as the cell output.
xgb.__version__

In [None]:
# Load data
from scipy.io import arff
import pandas as pd
# Parse the ARFF file downloaded in the previous cell.
# NOTE(review): this rebinds `data`, which an earlier cell used for the
# Boston DataFrame.
data = arff.loadarff('eye_movements.arff')
# Element 0 of the loader's result is used as the records for the DataFrame.
df = pd.DataFrame(data[0])

In [None]:
# Preview the first rows of the eye-movement data.
df.head()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [None]:
# Inspect column dtypes before the casts in the next cell.
print(df.dtypes)

In [None]:
# Cast three feature columns to float.
# NOTE(review): presumably these are loaded from the ARFF file as non-numeric
# (object) columns -- confirm against the dtypes printed in the previous cell.
df['P1stFixation'] = df['P1stFixation'].astype(float)
df['P2stFixation'] = df['P2stFixation'].astype(float)
df['nextWordRegress'] = df['nextWordRegress'].astype(float)
# Convert the label to float first, then to a categorical whose categories
# are those float values.
df['label'] = df['label'].astype(float)
df['label'] = df['label'].astype('category')
# Print the dtypes again to show the result of the casts.
print(df.dtypes)

In [None]:
# Features: the 22 columns at positions 2..23 (the first two columns and the
# columns between position 24 and the last one are left out).
eye_features = df.iloc[:,2:24]
# Target: the last column of the frame.
labels = df.iloc[:, -1]
# NOTE(review): df.columns[27] names the same column as iloc[:, -1] only if
# df has exactly 28 columns -- TODO confirm.
print(eye_features.columns, df.columns[27])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

enc = LabelBinarizer()

data_dmatrix = xgb.DMatrix(data=eye_features, label=labels)
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(eye_features, labels, test_size=0.2, random_state=123)
# Learn the class -> indicator-column mapping from the training labels only.
y_train = enc.fit_transform(y_train)
# Reuse the fitted mapping for the test labels. Calling fit_transform here
# would refit the encoder on the test split, so the column order or count
# could differ from the training encoding if the two splits do not contain
# the same set of classes.
y_test = enc.transform(y_test)

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Each one-vs-rest sub-problem is a binary classification over one indicator
# column, so the base learner uses the binary logistic objective. The
# regression objective used previously ('reg:squarederror') fits unbounded
# real values rather than class probabilities.
xg_classifier = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.5)
clf = OneVsRestClassifier(xg_classifier)

# y_train is the binarized label matrix, one column per class.
clf.fit(X_train, y_train)

# Predictions for the held-out test features; the metric cells below score
# them against y_test.
preds = clf.predict(X_test)

In [None]:
# Root-mean-squared error between the binarized test labels and the
# classifier's predictions.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=preds))
print("RMSE: %f" % rmse)

In [None]:
# Accuracy: fraction of rows where the arg-max column of the predictions
# matches the arg-max column of the binarized true labels.
acc = np.mean(np.argmax(y_test, axis=1) == np.argmax(preds, axis=1))
print("Accuracy: %f" % acc)