In [6]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

## Read Data

In [11]:
# Load the train and test samples from the working directory
train, test = (pd.read_csv(name) for name in ("train_sample.csv", "test_sample.csv"))
# Report (rows, columns) for each frame as a quick sanity check
print(train.shape, test.shape)

(500, 8) (500, 7)


In [12]:
# Preview the first rows of the training frame (head() defaults to 5 rows)
train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,class
0,0.575166,0.492272,0.075526,0.746659,0.999195,0.603935,0.547301,1
1,0.743337,0.576061,0.117977,0.134242,0.512801,0.799901,0.107309,0
2,0.129573,0.1812,0.834259,0.919458,0.648286,0.009957,0.219813,0
3,0.429477,0.658689,0.298046,0.670946,0.292823,0.535506,0.186334,0
4,0.089494,0.921228,0.382613,0.929615,0.043172,0.249534,0.017286,0


In [13]:
# Preview the first rows of the test frame (head() defaults to 5 rows)
test.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,0.700851,0.096902,0.458938,0.691299,0.746814,0.018623,0.479225
1,0.223784,0.215223,0.627635,0.369224,0.654303,0.463932,0.669979
2,0.182544,0.676885,0.016264,0.299162,0.217877,0.933902,0.579752
3,0.084935,0.740668,0.086598,0.212974,0.261713,0.805061,0.113849
4,0.33071,0.647138,0.486327,0.112941,0.051484,0.612233,0.46949


In [28]:
# Split the frames into model inputs and the target
feature_columns = [f'X{i}' for i in range(1, 8)]  # 'X1' ... 'X7'

# Target is the 'class' column; X_train keeps every other column of train
y_train = train['class']
X_train = train.drop(columns='class')

# The test frame has no target, so pick the feature columns explicitly
X_test = test.loc[:, feature_columns]

## XGBoost

In [29]:
# Booster settings: binary logistic objective, evaluated with AUC
param = dict(objective='binary:logistic', eval_metric='auc')

# Wrap the training data once; the same DMatrix feeds both CV and the final fit
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Stratified 5-fold cross-validation over at most 1000 rounds, with early
# stopping after 10 rounds and a fixed seed for the fold split
cv_results = xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    metrics='auc',
    early_stopping_rounds=10,
    seed=0,
)

# Number of rows in the CV history is used as the boosting-round budget
n_best_rounds = len(cv_results)

# Refit on the full training set, then score the test features
model = xgb.train(params=param, dtrain=dtrain, num_boost_round=n_best_rounds)
dtest = xgb.DMatrix(data=X_test)
prediction = model.predict(dtest)

## Random Forest

In [30]:
# Fit a random forest with a fixed random_state and read its importances
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
importances = rf.feature_importances_

# 1-based position, within X_train's columns, of the highest-importance feature
rf_most_important = 1 + importances.argmax()

## Save Answer

In [31]:
# Save the most important feature index and the test predictions to a .npz file
np.savez('submission.npz', rf_most_important=rf_most_important, prediction=prediction)

# Reload the archive and check its contents. np.load returns an NpzFile that
# keeps the underlying file open, so use it as a context manager to release
# the handle; otherwise re-running this cell can leak descriptors or fail to
# overwrite the file on platforms that lock open files.
with np.load('submission.npz') as npzfile:
    print(npzfile['rf_most_important'], npzfile['prediction'].shape)

7 (500,)
