In [19]:
import pandas
import datetime
import numpy as np
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import *
from sklearn.grid_search import GridSearchCV

In [2]:
# Load the training records; expects train.csv in the current working directory.
data = pandas.read_csv('train.csv')

In [3]:
# Shared encoder instances.  The feature-building functions below refit them
# for every column they process, so their fitted state only reflects the most
# recent fit call.
mlb = MultiLabelBinarizer()
le = LabelEncoder()

In [4]:
# Occurrence counts per distinct Breed / Color value in the training data;
# gen_feature uses them to keep only values seen more than 30 times.
breed_count = data.Breed.value_counts()
color_count = data.Color.value_counts()

In [5]:
def parse_age(x):
    """Convert an age string such as ``'2 weeks'`` into a whole number of days.

    ``x`` must look like ``'<integer> <unit>'`` with exactly one space.  The
    unit is matched by prefix (``day``, ``week``, ``month``, ``year``), with a
    month counted as 30 days and a year as 365.  For any other unit the raw
    input is printed and ``None`` is returned.
    """
    amount, unit = x.split(' ')
    amount = int(amount)
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))
    for prefix, factor in days_per_unit:
        if unit.startswith(prefix):
            return amount * factor
    # Unknown unit: surface the offending value and fall through to None.
    print(x)

In [6]:
# Show the first raw record (if any) to see which columns are available.
for i in data.index[:1]:
    print(data.loc[i])

AnimalID                        A671945
Name                            Hambone
DateTime            2014-02-12 18:22:00
OutcomeType             Return_to_owner
OutcomeSubtype                      NaN
AnimalType                          Dog
SexuponOutcome            Neutered Male
AgeuponOutcome                   1 year
Breed             Shetland Sheepdog Mix
Color                       Brown/White
Name: 0, dtype: object


In [7]:
# Start from an empty feature frame every time this cell runs.  Rebinding the
# name already discards any previous frame, so the former `del` guarded by a
# bare `except` (which could hide unrelated errors) is not needed.
train_data = pandas.DataFrame()


def _add_one_hot(raw_data, transed_data, column, keep=None):
    """Append one 0/1 indicator column per distinct value of ``raw_data[column]``.

    Columns are named ``'<column>_<value>'`` with spaces in the value replaced
    by underscores.  When ``keep`` is given, only values for which
    ``keep(value)`` is true get a column.  Refits the shared ``mlb`` encoder.
    """
    indicators = mlb.fit_transform([[value] for value in raw_data[column]])
    for value, flags in zip(mlb.classes_, indicators.T):
        if keep is None or keep(value):
            transed_data[column + '_' + value.replace(' ', '_')] = flags


def gen_feature(raw_data, transed_data):
    """Build the training feature columns in ``transed_data`` from ``raw_data``.

    Args:
        raw_data: frame with the columns AnimalType, SexuponOutcome,
            AgeuponOutcome, DateTime ("%Y-%m-%d %H:%M:%S" strings), Breed and
            Color, and optionally OutcomeType.  Missing SexuponOutcome /
            AgeuponOutcome values are filled IN PLACE with the column's most
            frequent value.
        transed_data: frame that receives the generated columns (mutated).

    Every per-row feature is derived from ``raw_data`` itself.  Earlier
    revisions read Age_days, is_breed_mix, is_color_mix and the fill values
    from the notebook-global ``data``, which only worked because the function
    was called with that same frame.

    Relies on the notebook globals ``le``, ``mlb``, ``parse_age``,
    ``breed_count`` and ``color_count``; the two count tables must contain
    every Breed / Color value present in ``raw_data``.
    """
    if 'OutcomeType' in raw_data:
        transed_data['OutcomeType'] = le.fit_transform(raw_data['OutcomeType'])
    transed_data['AnimalType'] = le.fit_transform(raw_data['AnimalType'])

    # Fill missing values with the mode.  Assigning the column back avoids
    # `fillna(inplace=True)` on a column selection, which is chained assignment.
    for column in ('SexuponOutcome', 'AgeuponOutcome'):
        raw_data[column] = raw_data[column].fillna(raw_data[column].describe().top)

    # Calendar features.
    date_list = [datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S") for x in raw_data['DateTime']]
    transed_data['month'] = [d.month for d in date_list]
    # Coarse bucket month // 4 (1-3 -> 0, 4-7 -> 1, 8-11 -> 2, 12 -> 3); kept
    # as is so it stays identical to what convert_feature produces.
    transed_data['season'] = [d.month // 4 for d in date_list]
    transed_data['10_day'] = [d.day // 10 for d in date_list]
    transed_data['week'] = [d.isoweekday() for d in date_list]

    # Age at several resolutions; `.values` assigns by position like every
    # other column here, independent of raw_data's index labels.
    transed_data['Age_days'] = raw_data['AgeuponOutcome'].apply(parse_age).values
    transed_data['Age_weeks'] = transed_data['Age_days'] // 7
    transed_data['Age_months'] = transed_data['Age_days'] // 30

    # One-hot encodings; rare breeds and colours (30 occurrences or fewer in
    # the global count tables) get no column.
    _add_one_hot(raw_data, transed_data, 'AgeuponOutcome')
    _add_one_hot(raw_data, transed_data, 'SexuponOutcome')
    _add_one_hot(raw_data, transed_data, 'Breed', keep=lambda v: breed_count.loc[v] > 30)
    _add_one_hot(raw_data, transed_data, 'Color', keep=lambda v: color_count.loc[v] > 30)

    # `find(...) > 0` ignores a match at position 0; the test left unchanged so
    # training and test features are computed with the same rule.
    transed_data['is_breed_mix'] = raw_data['Breed'].apply(lambda x: int(x.find('Mix') > 0)).values
    transed_data['is_color_mix'] = raw_data['Color'].apply(lambda x: int(x.find('/') > 0)).values


In [8]:
import pdb
def _one_hot_like_training(raw_data, transed_data, column, referer):
    """Add 0/1 columns for ``raw_data[column]`` using the training categories.

    The candidate categories are the distinct values of the notebook-global
    training frame ``data[column]``; a column ``'<column>_<value>'`` is created
    only when that name is present in ``referer``.  Refits the shared ``mlb``.
    """
    mlb.fit([[value] for value in data[column]])
    for value in mlb.classes_:
        key = column + '_' + value.replace(' ', '_')
        if key not in referer:
            continue
        # Build the whole indicator at once instead of the chained assignment
        # `transed_data[key][ids] = 1`, which may write to a temporary copy.
        transed_data[key] = (raw_data[column] == value).values.astype(int)


def convert_feature(raw_data, transed_data, referer):
    """Build features for unseen rows so they line up with the training columns.

    Args:
        raw_data: frame with AnimalType, SexuponOutcome, AgeuponOutcome,
            DateTime ("%Y-%m-%d %H:%M:%S" strings), Breed and Color.  Missing
            SexuponOutcome / AgeuponOutcome values are filled IN PLACE with
            the most frequent value of the training frame ``data``.
        transed_data: frame that receives the generated columns (mutated).
        referer: collection of training column names; only one-hot columns
            named here are created, and any name still missing afterwards
            (except "OutcomeType") is added.

    Fixes relative to the earlier revision:
      * ``is_breed_mix`` / ``is_color_mix`` are computed from ``raw_data``;
        they used to be read from the training frame ``data``, so each row
        received the value of an unrelated training row.
      * ``AnimalType`` is encoded with an encoder fitted on the training
        frame, like the other categorical columns, rather than on the rows
        being converted.
      * Columns named ``'<breed>$$<weeks>'`` (the naming used by ``high_deg``)
        are recomputed from ``raw_data`` instead of being filled with zeros.

    Relies on the notebook globals ``data``, ``le``, ``mlb`` and ``parse_age``.
    """
    le.fit(data['AnimalType'])
    transed_data['AnimalType'] = le.transform(raw_data['AnimalType'])

    # Fill missing values with the training mode.
    for column in ('SexuponOutcome', 'AgeuponOutcome'):
        raw_data[column] = raw_data[column].fillna(data[column].describe().top)

    date_list = [datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S") for x in raw_data['DateTime']]
    transed_data['Age_days'] = raw_data['AgeuponOutcome'].apply(parse_age).values
    transed_data['Age_weeks'] = transed_data['Age_days'] // 7
    transed_data['Age_months'] = transed_data['Age_days'] // 30
    transed_data['month'] = [d.month for d in date_list]
    # Same month // 4 bucket as gen_feature.
    transed_data['season'] = [d.month // 4 for d in date_list]
    transed_data['10_day'] = [d.day // 10 for d in date_list]
    transed_data['week'] = [d.isoweekday() for d in date_list]

    for column in ('AgeuponOutcome', 'SexuponOutcome', 'Breed', 'Color'):
        _one_hot_like_training(raw_data, transed_data, column, referer)

    num = len(raw_data)
    for name in referer:
        if name == "OutcomeType" or name in transed_data:
            continue
        breed, sep, weeks = name.rpartition('$$')
        if sep and weeks.isdigit():
            # Breed x age-in-weeks interaction created by high_deg.
            transed_data[name] = (
                (raw_data['Breed'] == breed).values
                & (transed_data['Age_weeks'] == int(weeks)).values
            ).astype(int)
        else:
            # Category that never occurs in these rows: all zeros (1-D).
            transed_data[name] = np.zeros(num, dtype=int)

    # Same `find(...) > 0` rule as gen_feature (a match at position 0 is not
    # counted), now evaluated on the rows being converted.
    transed_data['is_breed_mix'] = raw_data['Breed'].apply(lambda x: int(x.find('Mix') > 0)).values
    transed_data['is_color_mix'] = raw_data['Color'].apply(lambda x: int(x.find('/') > 0)).values

In [9]:
high_deg_thre = 20
import pdb


def high_deg(raw_data, transed_data):
    """Add indicator columns for frequent (Breed, Age_weeks) combinations.

    Each pair is keyed as ``'<breed>$$<weeks>'``.  Pairs occurring more than
    ``high_deg_thre`` times get a 0/1 column of that name in ``transed_data``,
    added in order of decreasing frequency.
    """
    breeds = raw_data.Breed
    weeks = transed_data.Age_weeks

    # Tally every breed/age pair.
    pair_counts = {}
    for breed, week in zip(breeds, weeks):
        pair = breed + '$$' + str(week)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Most frequent first; the stable sort keeps first-seen order among ties.
    ranked = sorted(pair_counts.items(), key=lambda item: item[1], reverse=True)
    frequent = [pair for pair, count in ranked if count > high_deg_thre]
    del pair_counts, ranked

    for pair in frequent:
        breed_name, week_text = pair.split('$$')
        week_value = int(week_text)
        transed_data[pair] = [
            int(breed == breed_name and week == week_value)
            for breed, week in zip(breeds, weeks)
        ]

In [10]:
# Populate train_data with the engineered features (also fills missing
# SexuponOutcome / AgeuponOutcome values in `data` in place).
gen_feature(data, train_data)

In [11]:
# Append the frequent Breed x Age_weeks interaction columns to train_data.
high_deg(data, train_data)

In [12]:
# Naive Bayes estimator; not referenced by any other cell shown in this notebook.
mnb = MultinomialNB()

In [13]:
# Load the rows to score; expects test.csv in the current working directory.
data_to_predict = pandas.read_csv('test.csv')

In [14]:
# Empty frame that convert_feature fills with the test-set features.
data_test = pandas.DataFrame()
# Shuffled 10-fold splitter over the training rows.  No random_state is passed,
# so the folds differ from run to run.  Only the disabled experiment cell
# further down iterates over it.
kf = KFold(len(train_data), 10, shuffle=True)
# Build the test features, restricted to / padded with the training columns.
convert_feature(data_to_predict, data_test, train_data.columns)

In [15]:
def gen_output(res, file_name):
    """Write per-class probabilities to ``file_name`` as a CSV file.

    Args:
        res: 2-D array indexed as ``res[:, k]``; column ``k`` is written under
            the k-th class name, in the order a LabelEncoder fitted on the
            training ``OutcomeType`` column reports its classes.
        file_name: destination path handed to ``DataFrame.to_csv``.

    The output has an ``ID`` column followed by one column per outcome class.
    Relies on the notebook globals ``data`` (training frame, for the class
    names) and ``data_to_predict`` (test frame, for the IDs).
    """
    # A local encoder leaves the shared module-level `le` untouched.
    encoder = LabelEncoder()
    encoder.fit(data['OutcomeType'])
    res_data = pandas.DataFrame()
    res_data['ID'] = data_to_predict.ID
    # One column per class actually present in the labels, rather than a
    # hard-coded count of five.
    for k, label in enumerate(encoder.classes_):
        res_data[label] = res[:, k]
    res_data.to_csv(file_name, index=False)

In [16]:
# Disabled experiment kept as a bare string literal, so nothing in it runs; the
# cell merely evaluates to the text.  It one-hot encodes the calendar features
# and scores a naive Bayes model over the `kf` folds.
# NOTE(review): the text imports GaussianNB but instantiates BernoulliNB, which
# is not imported anywhere in the visible notebook -- fix before re-enabling.
"""
from sklearn.naive_bayes import GaussianNB
gnb = BernoulliNB()
bi_train_data = train_data.copy()
del bi_train_data['month']
del bi_train_data['season']
del bi_train_data['10_day']
del bi_train_data['week']

tmp = mlb.fit_transform([[x] for x in train_data['month']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['month_' + str(i)] = val

tmp = mlb.fit_transform([[x] for x in train_data['season']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['season_' + str(i)] = val
    
tmp = mlb.fit_transform([[x] for x in train_data['10_day']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['10_day_' + str(i)] = val
    
tmp = mlb.fit_transform([[x] for x in train_data['week']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['week_' + str(i)] = val
del tmp

for train_idx, test_idx in kf:
    X_train = bi_train_data.iloc[train_idx]
    X_test = bi_train_data.iloc[test_idx]
    gnb.fit(X_train.iloc[:,1:], X_train.OutcomeType)
    w = gnb.predict_proba(X_test.iloc[:,1:])
    print(log_loss(X_test.OutcomeType, w))
"""

"\nfrom sklearn.naive_bayes import GaussianNB\ngnb = BernoulliNB()\nbi_train_data = train_data.copy()\ndel bi_train_data['month']\ndel bi_train_data['season']\ndel bi_train_data['10_day']\ndel bi_train_data['week']\n\ntmp = mlb.fit_transform([[x] for x in train_data['month']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['month_' + str(i)] = val\n\ntmp = mlb.fit_transform([[x] for x in train_data['season']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['season_' + str(i)] = val\n    \ntmp = mlb.fit_transform([[x] for x in train_data['10_day']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['10_day_' + str(i)] = val\n    \ntmp = mlb.fit_transform([[x] for x in train_data['week']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['week_' + str(i)] = val\ndel tmp\n\nfor train_idx, test_idx in kf:\n    X_train = bi_train_data.iloc[train_idx]\n    X_test = bi_train_data.iloc[test_idx]\n    gnb.fit(X_train.iloc[:,1:], X_train.OutcomeType

In [23]:
# Class probabilities from the most recent model_fit call (validation fold).
predicts = None
# Worker count reused by the grid-search cells.
n_jobs = 3


def model_fit(alg, dtrain, predictors, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on one shuffled train/validation split and print a report.

    Args:
        alg: classifier exposing ``fit``, ``predict``, ``predict_proba`` and
            ``evals_result`` (an xgboost sklearn-style estimator in this
            notebook).  It is left fitted on the training part of the split.
        dtrain: frame with an ``OutcomeType`` label column; every column
            except the first is used as a feature, which matches the layout
            gen_feature produces (label written first).
        predictors: accepted for interface compatibility; not used.
        cv_folds: number of folds for the splitter; only the first split is
            evaluated, so roughly ``1 / cv_folds`` of the rows are held out.
        early_stopping_rounds: accepted for interface compatibility; not used.

    Side effects: prints accuracy and log loss for the held-out fold and
    stores the fold's class probabilities in the module-level ``predicts``.

    The earlier revision also built a parameter dict via
    ``alg.get_xgb_params()`` and set ``num_class`` on it, but never passed it
    anywhere; that dead code is gone.
    """
    global predicts

    kf = KFold(len(dtrain), cv_folds, shuffle=True)
    for train_idx, test_idx in kf:
        X_train = dtrain.iloc[train_idx]
        X_test = dtrain.iloc[test_idx]
        alg.fit(X_train.iloc[:, 1:],
                X_train.OutcomeType,
                eval_metric='mlogloss',
                verbose=True)
        try:
            print(alg.evals_result())
        except Exception:
            # Best effort: no eval_set is passed to fit, so there may be no
            # recorded history to show.  Narrowed from a bare `except` so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        # Predict on the held-out fold (not on the training rows).
        fold_labels = alg.predict(X_test.iloc[:, 1:])
        predicts = alg.predict_proba(X_test.iloc[:, 1:])
        # Print model report:
        print("Model Report")
        print("Accuracy : %.4g" % accuracy_score(X_test.OutcomeType, fold_labels))
        print("LogLoss : %.4g" % log_loss(X_test.OutcomeType, predicts))
        # Deliberately stop after the first split: a single hold-out check.
        break
                            

In [41]:
import xgboost as xgb
# Placeholder dict; none of the cells shown here writes to it.
evals_res = {}
# Baseline multi-class booster with default-ish tree parameters.
base_model = xgb.XGBClassifier(
    objective='multi:softprob',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=6,
    seed=27,
)


In [42]:
# Fit the baseline on one shuffled split and print its hold-out metrics.
model_fit(base_model, train_data, None)

Model Report
Accuracy : 0.6339
LogLoss : 0.8979


In [43]:
# Coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

gsearch1 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='multi:softprob',
        learning_rate=0.1,
        n_estimators=140,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='log_loss',
    n_jobs=3,
    iid=False,
    cv=5,
)
# Features are every column after the first (the OutcomeType label).
gsearch1.fit(train_data.iloc[:, 1:], train_data.OutcomeType)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -0.86359, std: 0.00677, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: -0.86418, std: 0.00679, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: -0.86422, std: 0.00667, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: -0.85787, std: 0.00617, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: -0.85789, std: 0.00631, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: -0.85861, std: 0.00670, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: -0.86006, std: 0.00555, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: -0.85998, std: 0.00582, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: -0.85947, std: 0.00621, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: -0.86370, std: 0.00668, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: -0.86291, std: 0.00615, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: -0.86101, std: 0.00592, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 5, 'min_chil

In [45]:
# Finer grid around the best coarse setting (max_depth 5, min_child_weight 1).
param_test1 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [1, 2, 3],
}

gsearch1 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='multi:softprob',
        learning_rate=0.1,
        n_estimators=140,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='log_loss',
    n_jobs=n_jobs,
    iid=False,
    cv=5,
)
# Features are every column after the first (the OutcomeType label).
gsearch1.fit(train_data.iloc[:, 1:], train_data.OutcomeType)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -0.85917, std: 0.00642, params: {'min_child_weight': 1, 'max_depth': 4},
  mean: -0.85902, std: 0.00658, params: {'min_child_weight': 2, 'max_depth': 4},
  mean: -0.85918, std: 0.00670, params: {'min_child_weight': 3, 'max_depth': 4},
  mean: -0.85787, std: 0.00617, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: -0.85794, std: 0.00612, params: {'min_child_weight': 2, 'max_depth': 5},
  mean: -0.85789, std: 0.00631, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: -0.85821, std: 0.00579, params: {'min_child_weight': 1, 'max_depth': 6},
  mean: -0.85880, std: 0.00596, params: {'min_child_weight': 2, 'max_depth': 6},
  mean: -0.85854, std: 0.00625, params: {'min_child_weight': 3, 'max_depth': 6}],
 {'max_depth': 5, 'min_child_weight': 1},
 -0.85787098134490736)

In [21]:
# Grid over gamma in steps of 0.1 from 0.1 to 0.9, with depth and child weight
# fixed at 5 and 1.
param_test3 = {'gamma': [i / 10.0 for i in range(1, 10)]}

gsearch3 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='multi:softprob',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=5,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='log_loss',
    n_jobs=3,
    iid=False,
    cv=5,
)
# Features are every column after the first (the OutcomeType label).
gsearch3.fit(train_data.iloc[:, 1:], train_data.OutcomeType)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: -0.85806, std: 0.00580, params: {'gamma': 0.1},
  mean: -0.85844, std: 0.00575, params: {'gamma': 0.2},
  mean: -0.85828, std: 0.00585, params: {'gamma': 0.3},
  mean: -0.85821, std: 0.00584, params: {'gamma': 0.4},
  mean: -0.85834, std: 0.00570, params: {'gamma': 0.5},
  mean: -0.85826, std: 0.00585, params: {'gamma': 0.6},
  mean: -0.85808, std: 0.00575, params: {'gamma': 0.7},
  mean: -0.85828, std: 0.00580, params: {'gamma': 0.8},
  mean: -0.85808, std: 0.00610, params: {'gamma': 0.9}],
 {'gamma': 0.1},
 -0.85805963432154253)

In [24]:
# Grid over the row subsampling ratio, 0.84 .. 0.89 in steps of 0.01.
# NOTE(review): the recorded output below this cell lists 0.80 .. 0.84, so it
# was produced by an earlier version of this grid -- re-run to refresh it.
param_test3 = {'subsample': [i / 100.0 for i in range(84, 90, 1)]}

gsearch3 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='multi:softprob',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=6,
        min_child_weight=3,
        gamma=0.1,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='log_loss',
    n_jobs=n_jobs,
    iid=False,
    cv=5,
)
# Features are every column after the first (the OutcomeType label).
gsearch3.fit(train_data.iloc[:, 1:], train_data.OutcomeType)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: -0.85846, std: 0.00597, params: {'subsample': 0.8},
  mean: -0.85782, std: 0.00600, params: {'subsample': 0.81},
  mean: -0.85824, std: 0.00578, params: {'subsample': 0.82},
  mean: -0.85827, std: 0.00555, params: {'subsample': 0.83},
  mean: -0.85768, std: 0.00572, params: {'subsample': 0.84}],
 {'subsample': 0.84},
 -0.85768184353624477)

In [22]:
import xgboost as xgb
# Tuned booster used for the final predictions.
# NOTE(review): gamma=1.3 lies outside the 0.1 .. 0.9 range explored by the
# gamma grid-search cell in this notebook -- confirm it is intentional.
modified_model = xgb.XGBClassifier(
    objective='multi:softprob',
    learning_rate=0.1,
    n_estimators=140,
    max_depth=5,
    min_child_weight=1,
    gamma=1.3,
    subsample=0.82,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

In [23]:
# Fit the tuned model on one shuffled split and print its hold-out metrics.
# The model stays fitted on the training part of that split only.
model_fit(modified_model, train_data, None)

Model Report
Accuracy : 0.651
LogLoss : 0.8541


In [32]:
# Score the test rows with the tuned model and write the submission file.

# Select the test features in exactly the training column order, minus the
# label, in one step instead of copying the frame column by column.
feature_cols = [col for col in train_data.columns if col != 'OutcomeType']
data_data = data_test[feature_cols]
res = modified_model.predict_proba(data_data)

# Only the fitted class names are needed, so fit() replaces the former
# fit_transform() call whose result was discarded.
le.fit(data['OutcomeType'])
res_data = pandas.DataFrame()
res_data['ID'] = data_to_predict.ID
# One probability column per class, in encoder order (no hard-coded count).
for k, label in enumerate(le.classes_):
    res_data[label] = res[:, k]
res_data.to_csv('modified_xgboost.csv', index=False)

In [27]:
# Inspect the training columns (the OutcomeType label plus the features).
train_data.columns

Index(['OutcomeType', 'AnimalType', 'month', 'season', '10_day', 'week',
       'Age_days', 'AgeuponOutcome_0_years', 'AgeuponOutcome_1_day',
       'AgeuponOutcome_1_month',
       ...
       'Color_White/Cream', 'Color_White/Gray', 'Color_White/Orange_Tabby',
       'Color_White/Red', 'Color_White/Tan', 'Color_White/Tricolor',
       'Color_Yellow', 'Color_Yellow/White', 'is_breed_mix', 'is_color_mix'],
      dtype='object', length=212)

In [28]:
# Inspect the test feature columns for comparison with the training frame.
data_test.columns

Index(['AnimalType', 'Age_days', 'month', 'season', '10_day', 'week',
       'AgeuponOutcome_0_years', 'AgeuponOutcome_1_day',
       'AgeuponOutcome_1_month', 'AgeuponOutcome_1_week',
       ...
       'Color_White/Cream', 'Color_White/Gray', 'Color_White/Orange_Tabby',
       'Color_White/Red', 'Color_White/Tan', 'Color_White/Tricolor',
       'Color_Yellow', 'Color_Yellow/White', 'is_breed_mix', 'is_color_mix'],
      dtype='object', length=211)