In [6]:
import xgboost as xgb
import sys
import IPython
import numpy as np
import pandas as pd
import sklearn as sk

print ('Python version: %s.%s.%s' % sys.version_info[:3])
print ('IPython version:', IPython.__version__)
print ('numpy version:', np.__version__)
print ('pandas version:', pd.__version__)
print ('scikit-learn version:', sk.__version__)
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
import feather
## Usando OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#data = pd.read_csv("../data/train.csv")
data = feather.read_dataframe('../data/transformed_data.feather')
data['Weekday'] = data['Weekday'].map({"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5,
"saturday": 6, "sunday": 7})
for each in ['TripType', 'Weekday']:
    data[each] = data[each].astype(int)


def add_category_counts(data):
    alist = []
    for array in np.asarray(data.iloc[:,11:]):
        count = 0
        for item in array:
            if item > 0:
                count += 1
        alist.append(count)
    cat_counts = pd.DataFrame(alist)
    cat_counts = cat_counts.rename(columns={0:"CategoryCount"})
    cat_counts = cat_counts.set_index(data.index)
    data.insert(11, 'CategoryCounts', cat_counts)
    return data

dummies = pd.get_dummies(data.DepartmentDescription)
data[dummies.columns] = dummies
data_dummies = data[dummies.columns] 
data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])
data.loc[data.ScanCount < 0, 'Return'] = 1
data.loc[data.Return != 1, 'Return'] = 0
add_category_counts(data)
grouped = data.groupby("VisitNumber")
elset = set(data.DepartmentDescription)
aggregate = {'Weekday': np.max, "TripType": np.max, 'numItems': np.sum, 'Return': np.max, 'CategoryCounts': np.max}
for each in list(elset):
    if each != np.nan:
        aggregate[each] = np.sum
data = grouped.agg(aggregate).reset_index()


label_encoder = LabelEncoder() 
 
data['TripType_l'] = label_encoder.fit_transform(data['TripType'])

mytrain, mytest = train_test_split(data, test_size = .7)
features = ['VisitNumber', 'Weekday', 'Return', 'CategoryCounts'] + list(elset)
print(data.head())
train = data

Python version: 3.6.3
IPython version: 6.2.1
numpy version: 1.13.3
pandas version: 0.21.0
scikit-learn version: 0.19.1
  VisitNumber  Weekday  TripType  numItems  Return  CategoryCounts  \
0           5        5       999         1     0.0               1   
1           7        5        30         4     0.0               1   
2           8        5        26       529     0.0               1   
3           9        5         8         9     0.0               1   
4          10        5         8         9     0.0               1   

   frozen foods  pharmacy  seafood  electronics     ...      \
0             0         0        0            0     ...       
1             0         0        0            0     ...       
2             0         0        0            0     ...       
3             0         0        0            0     ...       
4             0         0        0            0     ...       

   financial services  sheer hosiery  paint and accessories  automotive  \
0     

In [7]:
mytrain, mytest = train_test_split(data, test_size = .3)

dtrain = xgb.DMatrix(np.asarray(mytrain[features]), label = np.asarray(mytrain.TripType_l))
dtest = xgb.DMatrix(np.asarray(mytest[features]), label = np.asarray(mytest.TripType_l))
num_round = 150
param = {'objective': 'multi:softprob', 'num_class':len(set(mytrain.TripType_l)), 
     'eval_metric': 'mlogloss', "max_delta_step": 5}
watchlist = [(dtrain,'train'), (dtest, 'eval')]

%time bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=3)


[0]	train-mlogloss:2.63329	eval-mlogloss:2.65786
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:2.21439	eval-mlogloss:2.25639
[2]	train-mlogloss:1.98769	eval-mlogloss:2.04151
[3]	train-mlogloss:1.82774	eval-mlogloss:1.89187
[4]	train-mlogloss:1.70782	eval-mlogloss:1.78095
[5]	train-mlogloss:1.61352	eval-mlogloss:1.69477
[6]	train-mlogloss:1.53689	eval-mlogloss:1.62586
[7]	train-mlogloss:1.47406	eval-mlogloss:1.5703
[8]	train-mlogloss:1.42065	eval-mlogloss:1.52408
[9]	train-mlogloss:1.37574	eval-mlogloss:1.48581
[10]	train-mlogloss:1.33716	eval-mlogloss:1.45379
[11]	train-mlogloss:1.30335	eval-mlogloss:1.42622
[12]	train-mlogloss:1.27324	eval-mlogloss:1.40242
[13]	train-mlogloss:1.24721	eval-mlogloss:1.38197
[14]	train-mlogloss:1.22314	eval-mlogloss:1.36402
[15]	train-mlogloss:1.20193	eval-mlogloss:1.348
[16]	train-mlogloss:1.18288	eval-mlogloss:1.33461
[17]	train-ml

In [8]:
def get_predictions(predictions):
    return [np.argmax(each) for each in predictions]
        
predictions = get_predictions(bst.predict(dtest))

#predictions = label_encoder.inverse_transform(predictions)
accuracy_score(mytest.TripType_l, predictions)

0.58854475141971219

In [9]:

data = feather.read_dataframe('../data/test_transformed_data.feather')
data['Weekday'] = data['Weekday'].map({"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5,
"saturday": 6, "sunday": 7})
for each in ['Weekday']:
    data[each] = data[each].astype(int)


dummies = pd.get_dummies(data.DepartmentDescription)
data[dummies.columns] = dummies
data_dummies = data[dummies.columns] 
data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])

data.loc[data.ScanCount < 0, 'Return'] = 1
data.loc[data.Return != 1, 'Return'] = 0


grouped = data.groupby("VisitNumber")
elset = set(data.DepartmentDescription)
aggregate = {'Weekday': np.max, 'numItems': np.sum, 'Return': np.max}
for each in list(elset):
    if each != np.nan:
        aggregate[each] = np.sum
data = grouped.agg(aggregate).reset_index()
add_category_counts(data)

data['health & beauty'] = np.zeros(len(data))

test = data
data_test = xgb.DMatrix(np.asarray(test[features]))

In [10]:
%time predictions = get_predictions(bst.predict(data_test))
predictions = label_encoder.inverse_transform(predictions)

CPU times: user 1min 1s, sys: 267 ms, total: 1min 2s
Wall time: 18.5 s


In [13]:
test['TripType'] = predictions
answers = test[['TripType', 'VisitNumber']]
answers = pd.get_dummies(answers, columns =['TripType'])
answers['TripType_14'] = np.zeros(len(answers))
answers['TripType_4'] = np.zeros(len(answers))
answers.to_csv('../data/final_submission.csv', index=False)
len(answers)

95674