This notebook will help you get started solving the given problem. In this challenge, we have to predict the attack type. 

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 1000)

In [3]:
print('The train data has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('The test data has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

The train data has 169307 rows and 43 columns
The test data has 91166 rows and 42 columns


In [4]:
## check target class
train['target'].value_counts(normalize=True)

0    0.583957
2    0.217676
1    0.198367
Name: target, dtype: float64

There aren't any missing values. Let's jump to building models to get some baseline score.

In [6]:
## check missing values
train.isnull().sum(axis=0) ## there are no missing values.

connection_id    0
cont_1           0
cont_2           0
cont_3           0
cont_4           0
cont_5           0
cont_6           0
cont_7           0
cont_8           0
cont_9           0
cont_10          0
cont_11          0
cont_12          0
cont_13          0
cont_14          0
cont_15          0
cont_16          0
cont_17          0
cont_18          0
cat_1            0
cat_2            0
cat_3            0
cat_4            0
cat_5            0
cat_6            0
cat_7            0
cat_8            0
cat_9            0
cat_10           0
cat_11           0
cat_12           0
cat_13           0
cat_14           0
cat_15           0
cat_16           0
cat_17           0
cat_18           0
cat_19           0
cat_20           0
cat_21           0
cat_22           0
cat_23           0
target           0
dtype: int64

### Model - 0 (Majority Class)

In [48]:
## lets make a submission with all 0s
sub = pd.read_csv('sample_submission.csv')
sub['target'] = 0
sub.to_csv('sub0.csv', index=False) ## 0.58

### Model 1 - XGB

In [12]:
from sklearn.model_selection import train_test_split
import xgboost as xgb



In [16]:
feature_names = [x for x in train.columns if x not in ['connection_id','target']]
target = train['target']

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(train, target, train_size = 0.7, stratify = target, random_state = 2017)

In [25]:
## function for multi-accuracy
from sklearn.metrics import accuracy_score
def multAcc(pred, dtrain):
    label = dtrain.get_label()
    acc = accuracy_score(label, pred)
    return 'maccuracy', acc

In [23]:
# default parameters
params = {'objective':'multi:softmax',
          'num_class':3,
          # 'eval_metric':'merror'
         }

In [19]:
dtrain = xgb.DMatrix(data=X_train[feature_names], label=y_train)
dvalid = xgb.DMatrix(data=X_valid[feature_names], label=y_valid)
dtest = xgb.DMatrix(data=test[feature_names])
watchlist = [(dtrain, 'train'),(dvalid, 'eval')]

In [26]:
clf1 = xgb.train(params, dtrain, 1000, watchlist, maximize=True, verbose_eval=20, early_stopping_rounds=40, feval=multAcc)

[0]	train-maccuracy:0.778887	eval-maccuracy:0.778729
Multiple eval metrics have been passed: 'eval-maccuracy' will be used for early stopping.

Will train until eval-maccuracy hasn't improved in 40 rounds.
[20]	train-maccuracy:0.781773	eval-maccuracy:0.781013
[40]	train-maccuracy:0.782591	eval-maccuracy:0.781033
Stopping. Best iteration:
[1]	train-maccuracy:0.778684	eval-maccuracy:0.778198



In [27]:
pred = clf1.predict(dtest)

In [32]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub1.csv', index=False)

### Model 2 - Decision Tree

In [40]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [41]:
## set up model
clf2 = DecisionTreeClassifier(max_depth=8, min_samples_split=7, max_features='sqrt')

In [42]:
## train model
clf2.fit(train[feature_names], target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=7, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [44]:
## make prediction
pred2 = clf2.predict(test[feature_names])

In [47]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred2
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub2.csv', index=False)

### Model 3 - Logistic Regression

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
clf3 = LogisticRegression()

In [51]:
clf3.fit(train[feature_names], target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
pred3 = clf3.predict(test[feature_names])

In [54]:
## make submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred3
sub['target'] = sub['target'].astype(int)
sub.to_csv('sub3.csv', index=False)