In [1]:
import pandas
import datetime
import numpy as np
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import *
from sklearn.grid_search import GridSearchCV
from collections import OrderedDict

In [2]:
import sys
sys.path.append('/home/huang_anli/kaggle_tools/')

In [3]:
import load_data

In [4]:
load_data.mk_nfold('train.csv')

In [5]:
mlb = MultiLabelBinarizer()
le = LabelEncoder()

In [6]:
le.fit(data.OutcomeType)

NameError: name 'data' is not defined

In [11]:
le.classes_

array(['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'], dtype=object)

In [12]:
breed_count = data.Breed.value_counts()
color_count = data.Color.value_counts()

In [13]:
def parse_age(x):
    time, resolution = x.split(' ')
    time = int(time)
    if resolution.startswith('day'):
        return int(time)
    elif resolution.startswith('week'):
        return int(time * 7)
    elif resolution.startswith('month'):
        return int(time * 30)
    elif resolution.startswith('year'):
        return int(time * 365)
    else:
        print(x)

In [14]:
for i in data.index:
    print(data.loc[i])
    break

AnimalID                        A671945
Name                            Hambone
DateTime            2014-02-12 18:22:00
OutcomeType             Return_to_owner
OutcomeSubtype                      NaN
AnimalType                          Dog
SexuponOutcome            Neutered Male
AgeuponOutcome                   1 year
Breed             Shetland Sheepdog Mix
Color                       Brown/White
Name: 0, dtype: object


In [18]:
import pdb
class FeatureEncoder(object):
    def __init__(self):
        super(FeatureEncoder, self).__init__()
        self.feature_dict = OrderedDict()
    
    def fit(self, raw_data):
        self.feature_dict['AnimalType'] = lambda x: le.fit_transform(x.AnimalType)
        self.feature_dict['month'] = lambda x: \
        [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S").date().month \
         for xi in x.DateTime]
        self.feature_dict['season'] = lambda x: \
        [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S").date().month // 4 \
         for xi in x.DateTime]
        
        self.feature_dict['10_day'] = lambda x: \
        [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S").date().day // 10 \
         for xi in x.DateTime]
        self.feature_dict['week'] = lambda x: \
        [datetime.datetime.strptime(xi, "%Y-%m-%d %H:%M:%S").date().month // 4 \
         for xi in x.DateTime]
        self.feature_dict['Age_weeks'] = lambda x:[parse_age(i) // 7 for i in x.AgeuponOutcome]
        """
        mlb.fit([[xi] for xi in raw_data['AgeuponOutcome']])
        for i in mlb.classes_:
            self.feature_dict['AgeuponOutcome__' + i] = lambda x,i:[int(j == i) for j in x.AgeuponOutcome]
        """
        mlb.fit([[xi] for xi in raw_data['SexuponOutcome']])
        for i in mlb.classes_:
            self.feature_dict['SexuponOutcome__' + i] = lambda x,i:[int(j == i) for j in x.SexuponOutcome]
        
        breed_count = raw_data.Breed.value_counts()
        mlb.fit([[xi] for xi in raw_data['Breed']])
        for i in mlb.classes_:
            if breed_count.loc[i] > 10:
                self.feature_dict['Breed__' + i] = lambda x,i:[int(j == i) for j in x.Breed]
        del breed_count
    
        color_count = raw_data.Color.value_counts()
        mlb.fit([[xi] for xi in raw_data['Color']])
        for i in mlb.classes_:
            if color_count.loc[i] > 10:
                self.feature_dict['Color__' + i] = lambda x,i:[int(j == i) for j in x.Color]
        del color_count
   
        self.feature_dict['is_breed_mix'] = lambda x: [int(i.find('Mix') > 0) for i in x.Breed]
        self.feature_dict['is_color_mix'] = lambda x: [int(i.find('/') > 0) for i in x.Color]
        
    def transform(self, raw_data):
        transed_data = pandas.DataFrame()
        for key, func in self.feature_dict.items():
            if key.find('__')>0:
                transed_data[key] = func(raw_data, key.split('__')[1])
            else:
                transed_data[key] = func(raw_data)
        return transed_data
        
    def fit_transform(self, raw_data):
        self.fit(raw_data)
        return self.transform(raw_data)
    

In [20]:
feature_encoder = FeatureEncoder()

In [21]:
train_data = feature_encoder.fit_transform(data)

In [22]:
train_data['OutcomeType'] = le.fit_transform(data['OutcomeType'])

In [23]:
data_to_predict = pandas.read_csv('test.csv')
data_to_predict['SexuponOutcome'].fillna(inplace = True, value = data['SexuponOutcome'].describe().top)
data_to_predict['AgeuponOutcome'].fillna(inplace = True, value = data['AgeuponOutcome'].describe().top)
test_data = feature_encoder.transform(data_to_predict)

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import ExtraTreeRegressor

In [25]:
dt = DecisionTreeClassifier()

In [26]:
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(train_data.iloc[:,:-1], train_data.OutcomeType)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [31]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators = 100, 
    criterion='entropy', 
    max_depth=30,
    min_samples_split=5, 
    min_weight_fraction_leaf=0.2, 
    max_features='auto', 
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False, 
    n_jobs=4, 
    random_state=None, 
    verbose=1, 
    warm_start=False, 
    class_weight=None
)

In [152]:
kf = KFold(len(train_data), 5, shuffle=True)
for train_idx, test_idx in kf:
    X_train = train_data.iloc[train_idx]
    X_test = train_data.iloc[test_idx]
    clf.fit(X_train.iloc[:,:-1], X_train.OutcomeType)
    ans = clf.predict_proba(X_test.iloc[:,:-1])
    print(log_loss(X_test.OutcomeType, ans))
    


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


1.19533340698


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


1.21644030331


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


1.20938783681


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


1.22742939446


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


1.21654673543


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [40]:
clf.fit(train_data.iloc[:,:-1], train_data.OutcomeType)
gen_output(clf.predict_proba(train_data.iloc[:,:-1]), data, 'simple_train_randomForest.csv')

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


In [59]:
def gen_output(res, reference, file_name):
    le = LabelEncoder()
    le.fit(data['OutcomeType'])
    res_data = pandas.DataFrame()
    res_data['ID'] = reference.AnimalID
    for i in range(0, 5):
        res_data[le.classes_[i]] = res[:,i]
    res_data.to_csv(file_name, index = False)

In [38]:
data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


In [51]:
predicts = None
n_jobs = 4
def model_fit(alg, dtrain, predictors, cv_folds=5, early_stopping_rounds=50):
    xgb_param = alg.get_xgb_params()
    xgb_param['num_class'] = 5
    #Fit the algorithm on the data
    kf = KFold(len(dtrain), cv_folds, shuffle=True)
    for train_idx, test_idx in kf:
        X_train = dtrain.iloc[train_idx]
        X_test = dtrain.iloc[test_idx]
        alg.fit(X_train.iloc[:,:-1],
                X_train.OutcomeType, 
                eval_metric='mlogloss',
               verbose=True)
        try:
            print(alg.evals_result())
        except:
            pass
        #Predict training set:
        dtrain_predictions = alg.predict(X_test.iloc[:,:-1])
        global predicts
        predicts = alg.predict_proba(X_test.iloc[:,:-1])
        #Print model report:
        print("Model Report")
        print("Accuracy : %.4g" % accuracy_score(X_test.OutcomeType, dtrain_predictions))
        print("LogLoss : %.4g" % log_loss(X_test.OutcomeType, predicts))
        break
                            

In [52]:
import xgboost as xgb
evals_res = {}
base_model = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=140,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 nthread=8,
 scale_pos_weight=1,
 seed=27)


In [53]:
model_fit(base_model, train_data, None)

Model Report
Accuracy : 0.6356
LogLoss : 0.8722


In [54]:
print(datetime.datetime.today())
param_test1 = {
    'max_depth' : list(range(3, 10, 2)),
    'min_child_weight' : list(range(1, 6, 2))
}

gsearch1 = GrikkdSearchCV(estimator = xgb.XGBClassifier( 
        learning_rate =0.1, 
        n_estimators=140, 
        gamma=0,
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'multi:softprob', 
        nthread=6, 
        scale_pos_weight=1, 
        seed=27), 
        param_grid = param_test1, 
        scoring='log_loss',
        n_jobs=3,
        iid=True,
        cv=5)
gsearch1.fit(train_data.iloc[:,:-1],train_data.OutcomeType)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
print(datetime.datetime.today())

2016-05-24 16:55:59.760670
[mean: -0.87682, std: 0.00603, params: {'min_child_weight': 1, 'max_depth': 3}, mean: -0.87699, std: 0.00554, params: {'min_child_weight': 3, 'max_depth': 3}, mean: -0.87690, std: 0.00567, params: {'min_child_weight': 5, 'max_depth': 3}, mean: -0.87098, std: 0.00527, params: {'min_child_weight': 1, 'max_depth': 5}, mean: -0.87080, std: 0.00500, params: {'min_child_weight': 3, 'max_depth': 5}, mean: -0.87125, std: 0.00528, params: {'min_child_weight': 5, 'max_depth': 5}, mean: -0.87198, std: 0.00507, params: {'min_child_weight': 1, 'max_depth': 7}, mean: -0.87211, std: 0.00446, params: {'min_child_weight': 3, 'max_depth': 7}, mean: -0.87263, std: 0.00482, params: {'min_child_weight': 5, 'max_depth': 7}, mean: -0.87616, std: 0.00391, params: {'min_child_weight': 1, 'max_depth': 9}, mean: -0.87631, std: 0.00393, params: {'min_child_weight': 3, 'max_depth': 9}, mean: -0.87669, std: 0.00505, params: {'min_child_weight': 5, 'max_depth': 9}] {'min_child_weight': 3, 

In [36]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -0.87483, std: 0.00638, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: -0.87515, std: 0.00593, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: -0.87528, std: 0.00583, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: -0.87000, std: 0.00572, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: -0.87022, std: 0.00543, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: -0.87062, std: 0.00538, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: -0.87184, std: 0.00464, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: -0.87171, std: 0.00438, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: -0.87190, std: 0.00493, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: -0.87623, std: 0.00371, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: -0.87599, std: 0.00411, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: -0.87580, std: 0.00462, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 5, 'min_chil

In [55]:
param_test1 = {
    'max_depth' : [4,5,6],
    'min_child_weight' : [1,2,3]
}

gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( 
        learning_rate =0.1, 
        n_estimators=140,  
        gamma=0,
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'multi:softprob', 
        nthread=6, 
        scale_pos_weight=1, 
        seed=27), 
        param_grid = param_test1, 
        scoring='log_loss',
        n_jobs=3,
        iid=False,
        cv=5)
gsearch1.fit(train_data.iloc[:,:-1],train_data.OutcomeType)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -0.87267, std: 0.00538, params: {'min_child_weight': 1, 'max_depth': 4},
  mean: -0.87270, std: 0.00522, params: {'min_child_weight': 2, 'max_depth': 4},
  mean: -0.87253, std: 0.00540, params: {'min_child_weight': 3, 'max_depth': 4},
  mean: -0.87098, std: 0.00527, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: -0.87108, std: 0.00535, params: {'min_child_weight': 2, 'max_depth': 5},
  mean: -0.87080, std: 0.00500, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: -0.87131, std: 0.00503, params: {'min_child_weight': 1, 'max_depth': 6},
  mean: -0.87163, std: 0.00480, params: {'min_child_weight': 2, 'max_depth': 6},
  mean: -0.87116, std: 0.00440, params: {'min_child_weight': 3, 'max_depth': 6}],
 {'max_depth': 5, 'min_child_weight': 3},
 -0.87079996810794458)

In [56]:
param_test3 = {
 'gamma':[i/10.0 for i in range(1,10)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( 
        learning_rate =0.1, 
        n_estimators=140, 
        max_depth=5,
        min_child_weight=2, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'multi:softprob', 
        nthread=5, 
        scale_pos_weight=1,
        seed=27), 
        param_grid = param_test3, 
        scoring='log_loss',
        n_jobs=3,
        iid=False, 
        cv=5)
gsearch3.fit(train_data.iloc[:,:-1],train_data.OutcomeType)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: -0.87111, std: 0.00520, params: {'gamma': 0.1},
  mean: -0.87116, std: 0.00511, params: {'gamma': 0.2},
  mean: -0.87119, std: 0.00523, params: {'gamma': 0.3},
  mean: -0.87130, std: 0.00503, params: {'gamma': 0.4},
  mean: -0.87108, std: 0.00515, params: {'gamma': 0.5},
  mean: -0.87090, std: 0.00494, params: {'gamma': 0.6},
  mean: -0.87105, std: 0.00499, params: {'gamma': 0.7},
  mean: -0.87122, std: 0.00503, params: {'gamma': 0.8},
  mean: -0.87123, std: 0.00495, params: {'gamma': 0.9}],
 {'gamma': 0.6},
 -0.87089871108098438)

In [20]:
param_test3 = {
 'subsample':[i/100.0 for i in range(84, 90, 1)],
}

gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( 
        learning_rate =0.1, 
        n_estimators=140, 
        max_depth=6,
        min_child_weight=3, 
        gamma=0.2, 
        objective= 'multi:softprob', 
        nthread=4, 
        colsample_bytree=0.8,
        scale_pos_weight=1,
        seed=27), 
        param_grid = param_test3, 
        scoring='log_loss',
        n_jobs=n_jobs,
        iid=False, 
        cv=5)
gsearch3.fit(train_data.iloc[:,:-1],train_data.OutcomeType)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: -0.85816, std: 0.00585, params: {'subsample': 0.84},
  mean: -0.85832, std: 0.00567, params: {'subsample': 0.85},
  mean: -0.85841, std: 0.00626, params: {'subsample': 0.86},
  mean: -0.85805, std: 0.00596, params: {'subsample': 0.87},
  mean: -0.85779, std: 0.00598, params: {'subsample': 0.88},
  mean: -0.85809, std: 0.00568, params: {'subsample': 0.89}],
 {'subsample': 0.88},
 -0.85778583487594973)

In [64]:
import xgboost as xgb 
modified_model = xgb.XGBClassifier( 
        learning_rate =0.05, 
        n_estimators=140, 
        max_depth=5,
        min_child_weight=1, 
        gamma=0.6, 
        objective= 'multi:softprob', 
        nthread=6, 
        colsample_bytree=0.8,
        subsample = 0.82,
        scale_pos_weight=1,
        seed=27)

In [65]:
model_fit(modified_model, train_data, None)

Model Report
Accuracy : 0.6347
LogLoss : 0.8796


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
clf = LogisticRegression(multi_class='ovr')

In [55]:
cross_val_score(clf, train_data.iloc[:,:-1], train_data.OutcomeType, scoring='log_loss', cv=5, n_jobs=4, verbose=1)

[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.8s finished


array([-0.95144958, -0.94608002, -0.93631268, -0.94104232, -0.9340475 ])

In [60]:
clf.fit(train_data.iloc[:,:-1], train_data.OutcomeType)
res = clf.predict_proba(train_data.iloc[:,:-1])

In [65]:
gen_output(res, data, 'simple_train_LR.csv')

In [70]:
data.OutcomeType.value_counts().items()

<zip at 0x7fbfa3e49908>

In [71]:
for key, value in data.OutcomeType.value_counts().items():
    print(key, value)

Adoption 10769
Transfer 9422
Return_to_owner 4786
Euthanasia 1555
Died 197
