In [1]:
import pandas
import datetime
import numpy as np
import xgboost as xgb
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import *
from sklearn.grid_search import GridSearchCV
from collections import OrderedDict

In [2]:
import sys
# Put the author's local helper directory on the import path; the reload outputs further down
# show load_data and base_models being loaded from this directory.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
sys.path.append('/home/huang_anli/kaggle_tools/')

In [3]:
import load_data

In [4]:
# Project helper; presumably splits train.csv into the folds/<k>_train.csv / folds/<k>_test.csv
# files read in the next cell — TODO confirm against load_data.mk_nfold.
load_data.mk_nfold('train.csv')

In [160]:
# Fold 0 supplies both the training split and the held-out split used for every score below.
train_file = 'folds/0_train.csv'
test_file = 'folds/0_test.csv'
# Reload load_data so edits to the helper module are picked up without restarting the kernel.
# NOTE(review): `imp` is deprecated in favour of importlib; the name is kept because later cells call imp.reload.
import imp
imp.reload(load_data)

<module 'load_data' from '/home/huang_anli/kaggle_tools/load_data.py'>

In [6]:
# The same fill_dict object is passed to both calls; presumably init_data records fill values
# during the 'top' call and reuses them for the held-out fold — TODO confirm in load_data.init_data.
fill_dict = {}
train_data = load_data.init_data(train_file, 'top', fill_dict)
test_data = load_data.init_data(test_file, '', fill_dict)

In [7]:
# Sanity check: list any held-out rows whose SexuponOutcome is still missing after loading.
sex_is_missing = test_data.SexuponOutcome.isnull()
test_data.SexuponOutcome[sex_is_missing]

Series([], Name: SexuponOutcome, dtype: object)

In [8]:
# Sanity check: list any held-out rows whose AgeuponOutcome is still missing after loading.
age_is_missing = test_data.AgeuponOutcome.isnull()
test_data.AgeuponOutcome[age_is_missing]

Series([], Name: AgeuponOutcome, dtype: object)

In [9]:
def parse_age(x):
    """Convert an age string such as ``'2 years'`` into an approximate age in days.

    The string must be ``"<integer> <unit>"`` where the unit starts with
    ``day``, ``week``, ``month`` or ``year`` (so plurals are accepted).
    Months count as 30 days and years as 365 days.

    Args:
        x: age description, e.g. ``'3 weeks'`` or ``'1 year'``.

    Returns:
        int: the age expressed in days.

    Raises:
        ValueError: if the string does not have exactly two whitespace-separated
            parts, the count is not an integer, or the unit is not recognised.
            (Previously an unknown unit silently returned ``None``, which only
            failed later as a ``TypeError`` in the callers' ``// 7``.)
    """
    # split() without an argument tolerates repeated or surrounding whitespace.
    time, resolution = x.split()
    time = int(time)
    resolution = resolution.lower()
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))
    for prefix, days in days_per_unit:
        if resolution.startswith(prefix):
            return time * days
    raise ValueError('unrecognised age unit in {!r}'.format(x))

In [10]:
from feature_engineer import FeatureEncoder, one_hot

In [11]:
import pdb
class MyFeatureEncoder(FeatureEncoder):
    """Feature builder for the tree / linear models in this notebook.

    ``fit`` registers one callable per output column in ``self.feature_dict``
    (insertion order is the column order of the transformed frame) and
    ``transform`` evaluates those callables on a raw frame.

    The raw frame must provide the columns ``AnimalType``, ``DateTime``
    (strings formatted ``%Y-%m-%d %H:%M:%S``), ``AgeuponOutcome``,
    ``SexuponOutcome``, ``Breed`` and ``Color``.
    """

    def __init__(self):
        # NOTE(review): super(FeatureEncoder, self) starts the MRO lookup *after*
        # FeatureEncoder, so FeatureEncoder.__init__ itself is skipped. Left unchanged
        # because that base class is not visible from this notebook.
        super(FeatureEncoder, self).__init__()
        self.feature_dict = OrderedDict()

    def fit(self, raw_data):
        """Register the feature callables, learning encodings from ``raw_data``.

        The ``AnimalType`` label encoding is fitted here, once, so that every
        later ``transform`` call maps the same animal type to the same integer.
        (It used to be re-fitted on whatever frame was being transformed.)
        A type unseen during ``fit`` therefore makes ``transform`` raise
        instead of being silently renumbered.
        """
        def stamps(frame):
            # Parse the DateTime strings of the frame being transformed.
            return [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S")
                    for xi in frame.DateTime]

        le = LabelEncoder()
        le.fit(raw_data.AnimalType)
        self.feature_dict['AnimalType'] = lambda x: le.transform(x.AnimalType)

        # Calendar features derived from the outcome timestamp.
        self.feature_dict['month'] = lambda x: [t.month for t in stamps(x)]
        self.feature_dict['season'] = lambda x: [t.month // 3 for t in stamps(x)]
        self.feature_dict['10_day'] = lambda x: [t.day // 10 for t in stamps(x)]
        self.feature_dict['week'] = lambda x: [t.weekday() for t in stamps(x)]
        self.feature_dict['hour'] = lambda x: [t.hour for t in stamps(x)]
        self.feature_dict['minute'] = lambda x: [t.minute for t in stamps(x)]
        self.feature_dict['Age_weeks'] = lambda x: [parse_age(i) // 7 for i in x.AgeuponOutcome]

        # one_hot is a project helper; presumably it registers keys of the form
        # '<column>__<value>', which is the naming transform() dispatches on — TODO confirm.
        one_hot(self.feature_dict, 'AgeuponOutcome', raw_data)
        one_hot(self.feature_dict, 'SexuponOutcome', raw_data)
        one_hot(self.feature_dict, 'Breed', raw_data)
        one_hot(self.feature_dict, 'Color', raw_data)

        self.feature_dict['is_breed_mix'] = lambda x: [int(i.find('Mix') > 0) for i in x.Breed]
        self.feature_dict['is_color_mix'] = lambda x: [int(i.find('/') > 0) for i in x.Color]

    def transform(self, raw_data):
        """Evaluate every registered feature on ``raw_data``.

        Keys containing ``'__'`` are called as ``func(raw_data[column], value)``
        with the two parts of the key; all other keys are called as
        ``func(raw_data)``.

        Returns:
            pandas.DataFrame: one column per key of ``self.feature_dict``.
        """
        transed_data = pandas.DataFrame()
        for key, func in self.feature_dict.items():
            if key.find('__') > 0:
                parts = key.split('__')
                transed_data[key] = func(raw_data[parts[0]], parts[1])
            else:
                transed_data[key] = func(raw_data)
        return transed_data

    def fit_transform(self, raw_data):
        """Run ``fit`` on ``raw_data`` and return ``transform(raw_data)``."""
        self.fit(raw_data)
        return self.transform(raw_data)
    

In [12]:
# Single encoder instance: fitted on the training fold below and reused for every other frame.
feature_encoder = MyFeatureEncoder()

In [13]:
# Register the feature callables from the training fold and build its feature matrix.
train_feature = feature_encoder.fit_transform(train_data)

In [16]:
# Integer-encode the target and append it as the LAST column: later cells rely on
# `.iloc[:,:-1]` selecting every feature column except this one.
le = LabelEncoder()
train_feature['OutcomeType'] = le.fit_transform(train_data['OutcomeType'])

In [17]:
# Build the held-out feature matrix with the encoder fitted on the training fold.
test_feature = feature_encoder.transform(test_data)
# Use transform, not fit_transform: re-fitting `le` on the held-out labels would overwrite the
# training-fold mapping and silently renumber the classes if the held-out fold lacks one.
test_feature['OutcomeType'] = le.transform(test_data['OutcomeType'])

In [18]:
# Project-local model helpers (gen_best_xgboost / gen_best_rf are used below); reloaded so that
# edits to the module take effect without a kernel restart.
import base_models
imp.reload(base_models)

<module 'base_models' from '/home/huang_anli/kaggle_tools/base_models.py'>

In [20]:
from sklearn.metrics import log_loss
# Held-out log loss of the logistic-regression baseline.
# NOTE(review): `lr_clf` is not defined in any visible cell (the execution count jumps past it),
# so this cell fails on Restart & Run All — the cell that built lr_clf needs to be restored.
log_loss(test_feature.OutcomeType, lr_clf.predict_proba(test_feature.iloc[:,:-1]))

0.86551272681179248

In [None]:
# Tune an XGBoost classifier on the training fold, refit it, and report held-out log loss.
xg_param = dict(scoring='log_loss', objective='multi:softprob')
xg_clf = base_models.gen_best_xgboost(
    train_feature.iloc[:,:-1],
    train_feature.OutcomeType,
    **xg_param
)
xg_clf.fit(train_feature.iloc[:,:-1], train_feature.OutcomeType)
log_loss(
    test_feature.OutcomeType,
    xg_clf.predict_proba(test_feature.iloc[:,:-1]),
)

In [31]:
# Held-out log loss of the tuned XGBoost model (same expression as the last line of the tuning cell).
log_loss(test_feature.OutcomeType, xg_clf.predict_proba(test_feature.iloc[:,:-1]))

0.77916592648540972

In [81]:
# Tune a random forest on the training fold, refit it, and report held-out log loss.
rf_param = dict(scoring='log_loss')
rf_clf = base_models.gen_best_rf(
    train_feature.iloc[:,:-1],
    train_feature.OutcomeType,
    **rf_param
)
rf_clf.fit(train_feature.iloc[:,:-1], train_feature.OutcomeType)
log_loss(
    test_feature.OutcomeType,
    rf_clf.predict_proba(test_feature.iloc[:,:-1]),
)

1.1315390432716415

In [102]:
import math
class NormalizedEncoder(FeatureEncoder):
    """Feature builder whose numeric columns are scaled to small ranges for the FFM export.

    Same mechanism as the other encoder in this notebook: ``fit`` registers one
    callable per output column in ``self.feature_dict`` (insertion order is the
    column order) and ``transform`` evaluates them. Calendar features are
    divided by constants and the age in weeks is log-scaled.

    The raw frame must provide the columns ``AnimalType``, ``DateTime``
    (strings formatted ``%Y-%m-%d %H:%M:%S``), ``AgeuponOutcome``,
    ``SexuponOutcome``, ``Breed`` and ``Color``.
    """

    def __init__(self):
        # NOTE(review): super(FeatureEncoder, self) starts the MRO lookup *after*
        # FeatureEncoder, so FeatureEncoder.__init__ itself is skipped. Left unchanged
        # because that base class is not visible from this notebook.
        super(FeatureEncoder, self).__init__()
        self.feature_dict = OrderedDict()

    def fit(self, raw_data):
        """Register the feature callables, learning encodings from ``raw_data``.

        The ``AnimalType`` label encoding is fitted here, once, so that every
        later ``transform`` call maps the same animal type to the same integer.
        (It used to be re-fitted on whatever frame was being transformed.)
        A type unseen during ``fit`` therefore makes ``transform`` raise
        instead of being silently renumbered.
        """
        def stamps(frame):
            # Parse the DateTime strings of the frame being transformed.
            return [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S")
                    for xi in frame.DateTime]

        le = LabelEncoder()
        le.fit(raw_data.AnimalType)
        self.feature_dict['AnimalType'] = lambda x: le.transform(x.AnimalType)

        # Calendar features, each divided by a constant to keep values near [0, 1].
        self.feature_dict['month'] = lambda x: [t.month / 12.0 for t in stamps(x)]
        self.feature_dict['season'] = lambda x: [t.month // 4 / 3.0 for t in stamps(x)]
        self.feature_dict['10_day'] = lambda x: [t.day // 10 / 3.0 for t in stamps(x)]
        self.feature_dict['week'] = lambda x: [t.weekday() / 7.0 for t in stamps(x)]
        # +1 keeps log() defined for animals younger than one week.
        self.feature_dict['Age_weeks'] = lambda x: [
            math.log(parse_age(i) // 7 + 1) / 10.0 for i in x.AgeuponOutcome]

        # one_hot is a project helper; presumably it registers keys of the form
        # '<column>__<value>', which is the naming transform() dispatches on — TODO confirm.
        one_hot(self.feature_dict, 'AgeuponOutcome', raw_data)
        one_hot(self.feature_dict, 'SexuponOutcome', raw_data)
        one_hot(self.feature_dict, 'Breed', raw_data)
        one_hot(self.feature_dict, 'Color', raw_data)

        self.feature_dict['is_breed_mix'] = lambda x: [int(i.find('Mix') > 0) for i in x.Breed]
        self.feature_dict['is_color_mix'] = lambda x: [int(i.find('/') > 0) for i in x.Color]

    def transform(self, raw_data):
        """Evaluate every registered feature on ``raw_data``.

        Keys containing ``'__'`` are called as ``func(raw_data[column], value)``
        with the two parts of the key; all other keys are called as
        ``func(raw_data)``.

        Returns:
            pandas.DataFrame: one column per key of ``self.feature_dict``.
        """
        transed_data = pandas.DataFrame()
        for key, func in self.feature_dict.items():
            if key.find('__') > 0:
                parts = key.split('__')
                transed_data[key] = func(raw_data[parts[0]], parts[1])
            else:
                transed_data[key] = func(raw_data)
        return transed_data

    def fit_transform(self, raw_data):
        """Run ``fit`` on ``raw_data`` and return ``transform(raw_data)``."""
        self.fit(raw_data)
        return self.transform(raw_data)

In [103]:
# Encoder instance shared by the FFM train and test exports below (and read as a global by FFMClassifier).
nfe = NormalizedEncoder()

In [104]:
# Build the normalised training matrix for the FFM model.
ffm_train_feature = nfe.fit_transform(train_data)
# `le` is (re-)fitted on the training labels here, then reused for the FFM test export.
le.fit(train_data.OutcomeType)
# The integer class id is divided by 5.0 before export; presumably 5 is the number of
# outcome classes so the label lands in [0, 1) — TODO confirm.
ffm_train_feature['OutcomeType'] = le.transform(train_data.OutcomeType) / 5.0

In [105]:
# Dump the frame to the fixed scratch file 'ffm_features', run the external conversion script on it
# (presumably it writes 'ffm_features.transed' in libffm format — TODO confirm), then rename the result.
# NOTE(review): os.system return codes are only displayed, not checked; a failed step goes unnoticed.
ffm_train_feature.to_csv('ffm_features', index=False)
os.system('bash /home/huang_anli/kaggle_tools/ffm_trans.sh OutcomeType')
os.system('mv ffm_features.transed ffm_feature.train')

0

In [106]:
# Same export pipeline for the held-out fold, reusing the encoder and label mapping fitted on train.
# The scratch file 'ffm_features' is overwritten, so this must run after the training export has been renamed.
ffm_test_feature = nfe.transform(test_data)
ffm_test_feature['OutcomeType'] = le.transform(test_data.OutcomeType) / 5.0
ffm_test_feature.to_csv('ffm_features', index=False)
os.system('bash /home/huang_anli/kaggle_tools/ffm_trans.sh OutcomeType')
os.system('mv ffm_features.transed ffm_feature.test')

0

In [107]:
# Train the FFM model with the libffm CLI; the next cell reads the result as 'ffm_feature.train.model'.
# NOTE(review): the held-out fold is passed via -p together with --auto-stop, so it appears to influence
# when training stops — scores on that fold are then optimistic. Confirm against the libffm README.
os.system('/home/huang_anli/kaggle_tools/libffm/ffm-train --auto-stop -p ffm_feature.test -s 15 ffm_feature.train')

0

In [108]:
# Score the held-out fold with the trained model; arguments are <input> <model> <output file>.
os.system('/home/huang_anli/kaggle_tools/libffm/ffm-predict ffm_feature.test ffm_feature.train.model ffm_feature.test.res')

0

In [139]:
class FFMClassifier(object):
    """Minimal classifier-like wrapper around the libffm command-line tools.

    All data exchange goes through fixed scratch files in the working directory
    ('ffm_features', 'ffm_features.transed', 'ffm_feature.train',
    'ffm_feature.train.res'), so instances must not be used concurrently.

    Args:
        model: path of the libffm model file read by ``predict_proba`` and
            written by ``fit``.
        executor: path of the ``ffm-predict`` binary.
    """

    def __init__(self,
                 model='ffm_feature.train.model',
                 executor='/home/huang_anli/kaggle_tools/libffm/ffm-predict'):
        super().__init__()
        self.model = model
        self.executor = executor
        # Set by fit(); while they are None, predict_proba falls back to the
        # notebook-level `nfe` / `le` objects, as the original code always did.
        self.encoder = None
        self.label_encoder = None

    def fit(self, raw_data):
        """Encode ``raw_data``, export it, and train a model with ``ffm-train``.

        Fixes over the previous version: the label encoder is now actually
        fitted (``transform`` on a fresh ``LabelEncoder`` always raised), the
        fitted encoders are kept on the instance for ``predict_proba``, and the
        model is written to ``self.model`` rather than the tool's default path.

        Args:
            raw_data: raw frame including an ``OutcomeType`` column.

        Returns:
            FFMClassifier: ``self``.
        """
        encoder = NormalizedEncoder()
        label_encoder = LabelEncoder()
        raw_feature = encoder.fit_transform(raw_data)
        raw_feature['OutcomeType'] = label_encoder.fit_transform(raw_data.OutcomeType) / 5.0
        raw_feature.to_csv('ffm_features', index=False)
        os.system('bash /home/huang_anli/kaggle_tools/ffm_trans.sh OutcomeType')
        os.system('mv ffm_features.transed ffm_feature.train')
        # ffm-train takes the output model path as an optional second positional argument.
        os.system('/home/huang_anli/kaggle_tools/libffm/ffm-train -s 15 ffm_feature.train {model}'.format(
            model=self.model))
        self.encoder = encoder
        self.label_encoder = label_encoder
        return self

    def predict_proba(self, raw_data):
        """Score ``raw_data`` with ``ffm-predict`` and return one float per row.

        Args:
            raw_data: raw frame; it must still contain ``OutcomeType`` because
                the export script is invoked with that column name.

        Returns:
            list of float: the values written by ``ffm-predict``, in row order.
        """
        res_file = 'ffm_feature.train.res'
        # Prefer the encoders learned by fit(); otherwise use the notebook globals.
        encoder = self.encoder if self.encoder is not None else nfe
        label_encoder = self.label_encoder if self.label_encoder is not None else le
        raw_feature = encoder.transform(raw_data)
        raw_feature['OutcomeType'] = label_encoder.transform(raw_data.OutcomeType) / 5.0
        raw_feature.to_csv('ffm_features', index=False)
        os.system('bash /home/huang_anli/kaggle_tools/ffm_trans.sh OutcomeType')

        os.system('{executor} ffm_features.transed {model} {res_file}'.format(executor=self.executor,
                                                                       model=self.model,
                                                                       res_file=res_file))
        # Context manager so the result file handle is closed deterministically.
        with open(res_file, 'r') as handle:
            return [float(line.rstrip()) for line in handle]
ffm_clf = FFMClassifier()

In [128]:
# Base models whose predictions become the stacking features; the key is used as the column-name prefix.
# NOTE(review): lr_clf is not defined in any visible cell.
clfs = {
    'lr' : lr_clf, 
    'xg' : xg_clf, 
    'rf' : rf_clf,
    }

In [129]:
# Level-1 training frame: one column per (base model, class index) probability.
stacking_feature = pandas.DataFrame()
for name, clf in clfs.items():
    print(name)
    res = clf.predict_proba(train_feature.iloc[:,:-1])
    if not isinstance(res[0], float):
        # 2-D probability matrix: one '<model>__<class index>' column per class.
        for i, class_column in enumerate(res.T):
            stacking_feature['{}__{}'.format(name, i)] = class_column
        continue
    # 1-D score vector: a single column named after the model.
    stacking_feature[name] = res

lr
xg
rf


In [140]:
# FFM scores for the training fold (one float per row), produced through the CLI wrapper.
res = ffm_clf.predict_proba(train_data)

In [142]:
# Depends on `res` still holding the FFM output of the previous cell (the name is reused elsewhere).
stacking_feature['ffm'] = res

In [155]:
# Target goes in last so `.iloc[:,:-1]` selects exactly the stacking features.
stacking_feature['OutcomeType'] = le.transform(train_data.OutcomeType)

In [158]:
# Level-2 model trained on the stacked base-model predictions.
# NOTE(review): despite the `stack_lr_clf` name this is built by gen_best_rf (the displayed repr is a
# RandomForestClassifier); the name is kept because a later cell refers to it.
# NOTE(review): the stacking features are base-model predictions on the same rows those models were
# trained on, not out-of-fold predictions.
rf_param = {
    'scoring' : 'log_loss'
}

stack_lr_clf = base_models.gen_best_rf(stacking_feature.iloc[:,:-1], stacking_feature.OutcomeType, **rf_param)
stack_lr_clf.fit(stacking_feature.iloc[:,:-1], stacking_feature.OutcomeType)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.05, n_estimators=200, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [151]:
# Level-1 frame for the held-out fold, built the same way as the training stacking frame.
stacking_test = pandas.DataFrame()
for name, clf in clfs.items():
    print(name)
    res = clf.predict_proba(test_feature.iloc[:,:-1])
    if isinstance(res[0], float):
        # 1-D score vector: a single column named after the model.
        key = name
        stacking_test[key] = res
    else:
        # 2-D probability matrix: one '<model>__<class index>' column per class.
        for i in range(len(res[0])):
            key = name + '__' + str(i)
            stacking_test[key] = res[:, i]
res = ffm_clf.predict_proba(test_data)
stacking_test['ffm'] = res
# Encode the target exactly like the training stacking frame does. Assigning the raw string Series
# would both mix label types between the two frames and align on test_data's index instead of row position.
stacking_test['OutcomeType'] = le.transform(test_data.OutcomeType)

lr
xg
rf


In [159]:
# Held-out log loss of the level-2 model built by gen_best_rf.
log_loss(stacking_test.OutcomeType, stack_lr_clf.predict_proba(stacking_test.iloc[:,:-1]))

0.80258591490842834

In [161]:
# Tune an XGBoost level-2 model on the stacking features, reusing the xg_param dict defined earlier.
stack_xg_clf = base_models.gen_best_xgboost(stacking_feature.iloc[:,:-1], stacking_feature.OutcomeType, **xg_param)

best tree params:  {'min_child_weight': 5, 'max_depth': 3}
best score currently:  -0.570264624595
best gamma params:  {'gamma': 0.7}
best score currently:  -0.570012736827
best subsample:  {'subsample': 0.8}
best score currently:  -0.570012736827
best reg:  {'reg_alpha': 0.005}
best score currently:  -0.569949310677


In [163]:
# Refit the tuned level-2 XGBoost on all stacking rows and report held-out log loss.
stack_xg_clf.fit(stacking_feature.iloc[:,:-1], stacking_feature.OutcomeType)
log_loss(stacking_test.OutcomeType, stack_xg_clf.predict_proba(stacking_test.iloc[:,:-1]))

0.85803073229098392

In [168]:
# Pick up edits to the load_data helper before loading the submission set.
imp.reload(load_data)

<module 'load_data' from '/home/huang_anli/kaggle_tools/load_data.py'>

In [173]:
# Score the competition test set with the tuned XGBoost model and write the submission file.
# NOTE(review): `train_data` / `train_feature` are re-bound to the *submission* set here; the names
# are kept because a later cell reads `train_data.ID`, but re-running earlier cells afterwards will
# silently operate on the wrong data.
import write_data  # never imported in any visible cell; imp.reload below needs the name bound
train_data = load_data.init_data('test.csv', "", fill_dict)
train_feature = feature_encoder.transform(train_data)
res = xg_clf.predict_proba(train_feature)
imp.reload(write_data)
write_data.gen_ans(res, train_data.ID, le.classes_, 'xgboost_2.csv')

ID do not have default value


AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

0

In [177]:
# Re-run of the submission write after reloading write_data; presumably the helper was edited to
# address the AttributeError shown above — TODO confirm. Relies on `res` and `train_data` from the previous cell.
imp.reload(write_data)
write_data.gen_ans(res, train_data.ID, le.classes_, 'xgboost_2.csv')

0