In [1]:
import pandas
import datetime
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import *

In [2]:
data = pandas.read_csv('train.csv')

In [3]:
mlb = MultiLabelBinarizer()
le = LabelEncoder()

In [4]:
breed_count = data.Breed.value_counts()
color_count = data.Color.value_counts()

In [5]:
def parse_age(x):
    time, resolution = x.split(' ')
    time = int(time)
    if resolution.startswith('day'):
        return int(time)
    elif resolution.startswith('week'):
        return int(time * 7)
    elif resolution.startswith('month'):
        return int(time * 30)
    elif resolution.startswith('year'):
        return int(time * 365)
    else:
        print(x)

In [6]:
try:
    del train_data
except:
    pass
train_data = pandas.DataFrame()


def gen_feature(raw_data, transed_data):
    if 'OutcomeType' in raw_data:
        transed_data['OutcomeType'] = le.fit_transform(raw_data['OutcomeType'])
    transed_data['AnimalType'] = le.fit_transform(raw_data['AnimalType'])
    #众数填充
    raw_data['SexuponOutcome'].fillna(inplace = True, value = raw_data['SexuponOutcome'].describe().top)
    raw_data['AgeuponOutcome'].fillna(inplace = True, value = raw_data['AgeuponOutcome'].describe().top)
    date_list = [datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S") for x in raw_data['DateTime']]
    transed_data['month'] = [x.date().month for x in date_list]
    transed_data['season'] = [int(x.date().month / 4) for x in date_list]
    transed_data['10_day'] = [int(x.date().day / 10) for x in date_list]
    transed_data['week'] = [x.date().isoweekday() for x in date_list]
    train_data['Age_days'] = data.AgeuponOutcome.apply(parse_age) 
    age_cat = mlb.fit_transform([[x] for x in raw_data['AgeuponOutcome']])
    for i, val in zip(mlb.classes_, age_cat.T):
        transed_data['AgeuponOutcome_' + i.replace(' ','_')] = val
    del age_cat
    
    sex_cat = mlb.fit_transform([[x] for x in raw_data['SexuponOutcome']])
    for i, val in zip(mlb.classes_, sex_cat.T):
        transed_data['SexuponOutcome_' + i.replace(' ','_')] = val
    del sex_cat
   
    breed_cat = mlb.fit_transform([[x] for x in raw_data['Breed']])
    for i, val in zip(mlb.classes_, breed_cat.T):
        if breed_count.loc[i] > 30:
            transed_data['Breed_' + i.replace(' ', '_')] = val
    del breed_cat
    color_cat = mlb.fit_transform([[x] for x in raw_data['Color']])
    for i, val in zip(mlb.classes_, color_cat.T):
        if color_count.loc[i] > 30:
            transed_data['Color_' + i.replace(' ','_')] = val
    del color_cat
   
    transed_data['is_breed_mix'] = data.Breed.apply(lambda x: int(x.find('Mix') > 0))
    transed_data['is_color_mix'] = data.Color.apply(lambda x: int(x.find('/') > 0))


In [49]:
def convert_feature(raw_data, transed_data, referer):
    transed_data['AnimalType'] = le.fit_transform(raw_data['AnimalType'])
    #众数填充
    raw_data['SexuponOutcome'].fillna(inplace = True, value = raw_data['SexuponOutcome'].describe().top)
    raw_data['AgeuponOutcome'].fillna(inplace = True, value = raw_data['AgeuponOutcome'].describe().top)
    date_list = [datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S") for x in raw_data['DateTime']]
    train_data['Age_days'] = data.AgeuponOutcome.apply(parse_age) 
    transed_data['month'] = [x.date().month for x in date_list]
    transed_data['season'] = [int(x.date().month / 4) for x in date_list]
    transed_data['10_day'] = [int(x.date().day / 10) for x in date_list]
    transed_data['week'] = [x.date().isoweekday() for x in date_list]
    age_cat = mlb.fit_transform([[x] for x in raw_data['AgeuponOutcome']])
    for i, val in zip(mlb.classes_, age_cat.T):
        if (i in referer):
            transed_data['AgeuponOutcome_' + i.replace(' ','_')] = val
    del age_cat
    
    sex_cat = mlb.fit_transform([[x] for x in raw_data['SexuponOutcome']])
    for i, val in zip(mlb.classes_, sex_cat.T):
        if (i in referer):
            transed_data['SexuponOutcome_' + i.replace(' ','_')] = val
    del sex_cat
   
    breed_cat = mlb.fit_transform([[x] for x in raw_data['Breed']])
    for i, val in zip(mlb.classes_, breed_cat.T):
        if (i in referer):
            transed_data['Breed_' + i.replace(' ', '_')] = val
    del breed_cat
    color_cat = mlb.fit_transform([[x] for x in raw_data['Color']])
    for i, val in zip(mlb.classes_, color_cat.T):
        if (i in referer):
            transed_data['Color_' + i.replace(' ','_')] = val
    del color_cat
    
    num = len(transed_data)
    for i in referer:
        if i not in transed_data and i != "OutcomeType":
            transed_data[i] = np.zeros((num, 1)).astype(int)
    transed_data['is_breed_mix'] = data.Breed.apply(lambda x: int(x.find('Mix') > 0))
    transed_data['is_color_mix'] = data.Color.apply(lambda x: int(x.find('/') > 0))

In [8]:
gen_feature(data, train_data)

In [9]:
train_data.head()

Unnamed: 0,OutcomeType,AnimalType,month,season,10_day,week,Age_days,AgeuponOutcome_0_years,AgeuponOutcome_1_day,AgeuponOutcome_1_month,...,Color_White/Cream,Color_White/Gray,Color_White/Orange_Tabby,Color_White/Red,Color_White/Tan,Color_White/Tricolor,Color_Yellow,Color_Yellow/White,is_breed_mix,is_color_mix
0,3,1,2,0,1,3,365,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2,0,10,2,1,7,365,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,1,0,3,6,730,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,4,0,7,1,1,5,21,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,1,11,2,1,5,730,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
mnb = MultinomialNB()

In [11]:
kf = KFold(len(train_data), 10, shuffle=True)

In [12]:
data_to_predict = pandas.read_csv('test.csv')

In [50]:
data_test = pandas.DataFrame()
kf = KFold(len(train_data), 10, shuffle=True)
convert_feature(data_to_predict, data_test, train_data.columns)


In [14]:
#result 1.2
res_data = pandas.DataFrame()
le.fit_transform(data['OutcomeType'])
res_data['ID'] = data_to_predict.ID
for i in range(0,5):
    res_data[le.classes_[i]] = res[:,i]
res_data.to_csv('base_ans.csv', index=False)

NameError: name 'res' is not defined

In [123]:
"""
from sklearn.naive_bayes import GaussianNB
gnb = BernoulliNB()
bi_train_data = train_data.copy()
del bi_train_data['month']
del bi_train_data['season']
del bi_train_data['10_day']
del bi_train_data['week']

tmp = mlb.fit_transform([[x] for x in train_data['month']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['month_' + str(i)] = val

tmp = mlb.fit_transform([[x] for x in train_data['season']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['season_' + str(i)] = val
    
tmp = mlb.fit_transform([[x] for x in train_data['10_day']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['10_day_' + str(i)] = val
    
tmp = mlb.fit_transform([[x] for x in train_data['week']])
for i, val in zip(mlb.classes_, tmp.T):
    bi_train_data['week_' + str(i)] = val
del tmp

for train_idx, test_idx in kf:
    X_train = bi_train_data.iloc[train_idx]
    X_test = bi_train_data.iloc[test_idx]
    gnb.fit(X_train.iloc[:,1:], X_train.OutcomeType)
    w = gnb.predict_proba(X_test.iloc[:,1:])
    print(log_loss(X_test.OutcomeType, w))
"""

"\nfrom sklearn.naive_bayes import GaussianNB\ngnb = BernoulliNB()\nbi_train_data = train_data.copy()\ndel bi_train_data['month']\ndel bi_train_data['season']\ndel bi_train_data['10_day']\ndel bi_train_data['week']\n\ntmp = mlb.fit_transform([[x] for x in train_data['month']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['month_' + str(i)] = val\n\ntmp = mlb.fit_transform([[x] for x in train_data['season']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['season_' + str(i)] = val\n    \ntmp = mlb.fit_transform([[x] for x in train_data['10_day']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['10_day_' + str(i)] = val\n    \ntmp = mlb.fit_transform([[x] for x in train_data['week']])\nfor i, val in zip(mlb.classes_, tmp.T):\n    bi_train_data['week_' + str(i)] = val\ndel tmp\n\nfor train_idx, test_idx in kf:\n    X_train = bi_train_data.iloc[train_idx]\n    X_test = bi_train_data.iloc[test_idx]\n    gnb.fit(X_train.iloc[:,1:], X_train.OutcomeType

In [18]:
predicts = None
X_trian = None
X_test = None
def model_fit(alg, dtrain, predictors, cv_folds=5, early_stopping_rounds=50):
    xgb_param = alg.get_xgb_params()
    xgb_param['num_class'] = 5
    #Fit the algorithm on the data
    kf = KFold(len(dtrain), cv_folds, shuffle=True)
    for train_idx, test_idx in kf:
        global X_train, X_test
        X_train = dtrain.iloc[train_idx]
        X_test = dtrain.iloc[test_idx]
        alg.fit(X_train.iloc[:,1:],
                X_train.OutcomeType, 
                eval_metric='mlogloss',
               verbose=True)

        #Predict training set:
        dtrain_predictions = alg.predict(X_test.iloc[:,1:])
        global predicts
        predicts = alg.predict_proba(X_test.iloc[:,1:])
        #Print model report:
        print("Model Report")
        print("Accuracy : %.4g" % accuracy_score(X_test.OutcomeType, dtrain_predictions))
        print("LogLoss : %.4g" % log_loss(X_test.OutcomeType, predicts))
        break
                            

In [42]:
import xgboost as xgb
evals_res = {}
base_model = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softprob',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


In [46]:
model_fit(base_model, train_data, None)

  preds = preds.reshape(nrow, preds.size / nrow)


Model Report
Accuracy : 0.6382


ValueError: Found arrays with inconsistent numbers of samples: [   5 5346]

In [47]:
print("LogLoss : %.4g" % log_loss(X_test.OutcomeType, predicts))

LogLoss : 0.8889


In [54]:
base_xgb_res = base_model.predict_proba(data_test)

  preds = preds.reshape(nrow, preds.size / nrow)


In [58]:
res_data = pandas.DataFrame()
le.fit_transform(raw_data['OutcomeType'])
res_data['ID'] = data_to_predict.ID
for i in range(0,5):
    res_data[le.classes_[i]] = base_xgb_res[:,i]
res_data.to_csv('base_xgboost.csv', index=False)

NameError: name 'raw_data' is not defined