This notebook will help you get started solving the given problem. In this challenge, we have to predict the attack type. 

In [3]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 1000)

In [4]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [5]:
print('The train data has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('The test data has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

The train data has 169307 rows and 43 columns
The test data has 91166 rows and 42 columns


In [6]:
## check target class
train['target'].value_counts(normalize=True)

0    0.583957
2    0.217676
1    0.198367
Name: target, dtype: float64

There aren't any missing values. Let's jump to building models to get some baseline score.

In [7]:
## check missing values
train.isnull().sum(axis=0) ## there are no missing values.

connection_id    0
cont_1           0
cont_2           0
cont_3           0
cont_4           0
cont_5           0
cont_6           0
cont_7           0
cont_8           0
cont_9           0
cont_10          0
cont_11          0
cont_12          0
cont_13          0
cont_14          0
cont_15          0
cont_16          0
cont_17          0
cont_18          0
cat_1            0
cat_2            0
cat_3            0
cat_4            0
cat_5            0
cat_6            0
cat_7            0
cat_8            0
cat_9            0
cat_10           0
cat_11           0
cat_12           0
cat_13           0
cat_14           0
cat_15           0
cat_16           0
cat_17           0
cat_18           0
cat_19           0
cat_20           0
cat_21           0
cat_22           0
cat_23           0
target           0
dtype: int64

### Model - 0 (Majority Class)

In [8]:
## lets make a submission with all 0s
sub = pd.read_csv('sample_submission.csv')
sub['target'] = 0
sub.to_csv('sub0.csv', index=False) ## 0.58

### Model 1 - XGB

In [9]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost as xgb



In [10]:
feature_names = [x for x in train.columns if x not in ['connection_id','target']]
target = train['target']

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(train, target, train_size = 0.7, stratify = target, random_state = 2017)



In [12]:
## function for multi-accuracy
from sklearn.metrics import accuracy_score
def multAcc(pred, dtrain):
    label = dtrain.get_label()
    acc = accuracy_score(label, pred)
    return 'maccuracy', acc

In [13]:
# default parameters
params = {'objective':'multi:softmax',
          'num_class':3,
          # 'eval_metric':'merror'
         }

In [14]:
dtrain = xgb.DMatrix(data=X_train[feature_names], label=y_train)
dvalid = xgb.DMatrix(data=X_valid[feature_names], label=y_valid)
dtest = xgb.DMatrix(data=test[feature_names])
watchlist = [(dtrain, 'train'),(dvalid, 'eval')]

In [15]:
clf1 = xgb.train(params, dtrain, 1000, watchlist, maximize=True, verbose_eval=20, early_stopping_rounds=1000, feval=multAcc)

[0]	train-maccuracy:0.992024	eval-maccuracy:0.366262
Multiple eval metrics have been passed: 'eval-maccuracy' will be used for early stopping.

Will train until eval-maccuracy hasn't improved in 40 rounds.
[20]	train-maccuracy:0.995054	eval-maccuracy:0.36699
[40]	train-maccuracy:0.995199	eval-maccuracy:0.36699
[60]	train-maccuracy:0.995319	eval-maccuracy:0.36699
[80]	train-maccuracy:0.995384	eval-maccuracy:0.367051
Stopping. Best iteration:
[44]	train-maccuracy:0.995223	eval-maccuracy:0.367051



In [16]:
pred = clf1.predict(dtest)

In [17]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub1.csv', index=False)

### Model 2 - Decision Tree

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [74]:
## set up model
clf2 = GradientBoostingClassifier(n_estimators = 130,learning_rate=0.083,max_depth=4, min_samples_split=6)

In [75]:
## train model
clf2.fit(train[feature_names], target)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.083, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=6,
              min_weight_fraction_leaf=0.0, n_estimators=130,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [76]:
## make prediction
pred2 = clf2.predict(test[feature_names])

In [77]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred2
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub2.csv', index=False)

### Model 3 - Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
clf3 = LogisticRegression()

In [25]:
clf3.fit(train[feature_names], target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
pred3 = clf3.predict(test[feature_names])

In [27]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred3
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub3.csv', index=False)