Build logistic regression model using aggregated deep learning features  
Deep learning features are extracted from each image with a pretrained AlexNet

In [29]:
import pandas as pd
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.svm import SVC

In [31]:
# Load the aggregated, restaurant-level deep learning features.
# The file is read without a header row (header=None).
train_feature_path = r'/res_deep_avg/feature_avg_train.csv'
res_feature_train = pd.read_csv(train_feature_path, header=None)

In [32]:
# Give column 0 the name 'restaurant', then preview the first three rows.
res_feature_train = res_feature_train.rename(columns={0: 'restaurant'})
res_feature_train.head(3)

Unnamed: 0,restaurant,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,2543,3.102137,4.487439,4.306456,3.356702,10.010436,10.902914,2.709549,2.561266,1.81099,...,6.122845,5.99863,7.808895,7.701561,9.571619,0.778068,5.207161,2.388407,2.398979,1.816032
1,3580,5.287529,4.185751,3.64574,6.329757,5.535403,3.977552,6.698079,7.391171,3.274138,...,4.463264,5.905305,9.519002,6.447963,5.724946,4.363743,3.568336,2.30606,1.239113,2.156794
2,3232,4.284318,5.195933,4.735827,5.612554,7.924834,6.714575,10.962032,6.058934,4.670422,...,5.934104,8.936571,13.937132,13.483907,9.842915,7.410868,4.746403,5.002751,8.546763,5.607031


In [33]:
# Load the training labels (a 'restaurant' column plus label_0 .. label_8,
# as shown in the preview) and display the first three rows.
label_path = r'/res_label_train.csv'
res_label = pd.read_csv(label_path)
res_label.head(3)

Unnamed: 0,restaurant,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8
0,1000,0,1,1,1,1,1,1,1,0
1,1001,1,1,0,0,0,0,1,0,1
2,100,0,1,1,0,1,1,1,1,0


In [34]:
# Attach the label columns to the feature rows by matching on 'restaurant'.
res_df = pd.merge(res_feature_train, res_label, on='restaurant')
res_df.head(3)

Unnamed: 0,restaurant,1,2,3,4,5,6,7,8,9,...,4096,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8
0,2543,3.102137,4.487439,4.306456,3.356702,10.010436,10.902914,2.709549,2.561266,1.81099,...,1.816032,1,1,1,1,0,1,1,1,1
1,3580,5.287529,4.185751,3.64574,6.329757,5.535403,3.977552,6.698079,7.391171,3.274138,...,2.156794,0,0,1,0,0,1,0,1,0
2,3232,4.284318,5.195933,4.735827,5.612554,7.924834,6.714575,10.962032,6.058934,4.670422,...,5.607031,0,1,1,0,1,1,1,1,0


In [35]:
# Column positions 1..4096 hold the features (position 0 is 'restaurant').
feature_positions = range(1, 4097)
train_x = res_df.iloc[:, feature_positions]

In [79]:
# L2-penalised logistic regression; C is searched over a coarse log-spaced
# grid with 10-fold cross-validation, scored by F1.
logit_model = LogisticRegression(penalty='l2')

c_candidates = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': c_candidates}
clf = GridSearchCV(
    estimator=logit_model,
    param_grid=param_grid,
    scoring='f1',
    cv=10,
    verbose=51
)

In [52]:
# Grid search the regularization parameter for a single label (label 0).
label_id = 0
target_column = 'label_%d' % label_id
train_y = res_df[target_column]

clf.fit(train_x, train_y)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] C=0.001 .........................................................
[CV] ................................ C=0.001, score=0.710744 -   1.1s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    1.1s
[CV] C=0.001 .........................................................
[CV] ................................ C=0.001, score=0.571429 -   1.0s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:    2.2s
[CV] C=0.001 .........................................................
[CV] ................................ C=0.001, score=0.614173 -   1.0s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:    3.4s
[CV] C=0.001 .........................................................
[CV] ................................ C=0.001, score=0.603448 -   1.0s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    4.5s
[CV] C=0.001 .........................................................
[CV] ................................ C=0.001, score

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=51)

In [53]:
# Parameter setting selected by the grid search for the label fitted above.
clf.best_params_

{'C': 0.1}

In [54]:
# Mean and standard deviation of the cross-validated score per candidate C.
clf.grid_scores_

[mean: 0.64174, std: 0.04021, params: {'C': 0.001},
 mean: 0.67381, std: 0.02453, params: {'C': 0.01},
 mean: 0.67431, std: 0.02649, params: {'C': 0.1},
 mean: 0.65220, std: 0.02159, params: {'C': 1},
 mean: 0.63987, std: 0.02359, params: {'C': 10},
 mean: 0.64181, std: 0.02356, params: {'C': 100},
 mean: 0.64181, std: 0.02690, params: {'C': 1000}]

In [58]:
# Print the selected C and the per-candidate score summary.
print(clf.best_params_)
print(clf.grid_scores_)

{'C': 0.05}
[mean: 0.67381, std: 0.02453, params: {'C': 0.01}, mean: 0.68038, std: 0.02311, params: {'C': 0.05}, mean: 0.67431, std: 0.02649, params: {'C': 0.1}, mean: 0.66404, std: 0.01499, params: {'C': 0.5}, mean: 0.65220, std: 0.02159, params: {'C': 1}]


In [36]:
# Narrower C grid for the logistic regression search, still 10-fold CV on F1.
logit_model = LogisticRegression(penalty='l2')

c_candidates = [0.01, 0.05, 0.1, 0.5, 1]
param_grid = {'C': c_candidates}
clf = GridSearchCV(
    estimator=logit_model,
    param_grid=param_grid,
    scoring='f1',
    cv=10
)

In [1]:
# Grid search the regularization parameter separately for each of the 9 labels
# and report the label id, the selected C and its best CV score.
for label_id in range(9):
    target_column = 'label_%d' % label_id
    train_y = res_df[target_column]
    clf.fit(train_x, train_y)
    print('%s %s %s' % (label_id, clf.best_params_, clf.best_score_))

In [30]:
# Same kind of search with an SVC (default settings), 5-fold CV scored by F1.
svm_model = SVC()

param_grid = {'C': [5, 10, 15, 20]}
clf = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5
)

# This loop only covers labels 5 through 8.
for label_id in range(5, 9):
    target_column = 'label_%d' % label_id
    train_y = res_df[target_column]
    clf.fit(train_x, train_y)
    print('%s %s %s' % (label_id, clf.best_params_, clf.best_score_))

5 {'C': 10} 0.895478850972
6 {'C': 15} 0.9421782506
7 {'C': 15} 0.777447640937
8 {'C': 10} 0.895570024626


grid scores  
logistic:  
0 {'C': 0.01} 0.691231952593  
1 {'C': 0.01} 0.836146927697  
2 {'C': 0.01} 0.881455959262  
3 {'C': 0.05} 0.676540232023  
4 {'C': 0.05} 0.804412156732  
5 {'C': 0.01} 0.895222627512  
6 {'C': 0.01} 0.942190358071  
7 {'C': 0.1} 0.764079570558  
8 {'C': 0.01} 0.895877965623  
  
SVM:  
0 {'C': 10} 0.700756701769  
1 {'C': 10} 0.833505743338  
2 {'C': 10} 0.877418129961  
3 {'C': 10} 0.685908697984  
4 {'C': 10} 0.797564315132  
5 {'C': 10} 0.895478850972  
6 {'C': 10} 0.94097531048  
7 {'C': 10} 0.770340699241  
8 {'C': 10} 0.895570024626  

In [9]:
# Load the deep learning features of the test set (no header row), name
# column 0 'restaurant', and preview the first three rows.
test_feature_path = r'/res_deep_avg/feature_avg_test.csv'
res_feature_test = pd.read_csv(test_feature_path, header=None)
res_feature_test = res_feature_test.rename(columns={0: 'restaurant'})
res_feature_test.head(3)

Unnamed: 0,restaurant,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,8i7dh,0.070079,0.049189,0.142998,0.643171,0.763074,1.105472,0.290251,0.397025,0.319553,...,0.763321,0.58819,1.425873,2.337215,1.057303,0.333964,0.266503,0.384997,0.122773,0.435602
1,blxg3,0.167815,0.256773,0.185716,0.24597,0.780929,0.562616,0.244036,0.177635,0.142654,...,0.584998,0.546046,0.625016,1.988779,0.874456,0.208558,0.269952,0.279896,0.086104,0.009007
2,wzvzs,0.172079,0.229586,0.142795,0.396277,0.818319,0.697299,0.363702,0.255878,0.246695,...,0.689928,0.370871,0.734514,1.825779,1.087935,0.166965,0.265202,0.174693,0.188443,0.143127


In [10]:
# Column positions 1..4096 hold the test features (position 0 is 'restaurant').
feature_positions = range(1, 4097)
test_x = res_feature_test.iloc[:, feature_positions]
test_x.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096
0,0.070079,0.049189,0.142998,0.643171,0.763074,1.105472,0.290251,0.397025,0.319553,0.388232,...,0.763321,0.58819,1.425873,2.337215,1.057303,0.333964,0.266503,0.384997,0.122773,0.435602
1,0.167815,0.256773,0.185716,0.24597,0.780929,0.562616,0.244036,0.177635,0.142654,0.351615,...,0.584998,0.546046,0.625016,1.988779,0.874456,0.208558,0.269952,0.279896,0.086104,0.009007
2,0.172079,0.229586,0.142795,0.396277,0.818319,0.697299,0.363702,0.255878,0.246695,0.429182,...,0.689928,0.370871,0.734514,1.825779,1.087935,0.166965,0.265202,0.174693,0.188443,0.143127


In [11]:
# Build a logistic regression model for one label (label 0) and predict on the
# test set. C_list holds one regularization value per label, in label order.
C_list = [0.01, 0.01, 0.01, 0.05, 0.05, 0.01, 0.01, 0.1, 0.01]

# Create the container for predictions here: nothing earlier defines `pred`,
# so the item assignment below would raise NameError on a fresh kernel.
pred = {}

label_id = 0
logit_model = LogisticRegression(penalty='l2', C=C_list[label_id])

train_y = res_df['label_%d' % label_id]
logit_model.fit(train_x, train_y)

# Use the builtin `str`; `np.str` was only an alias for it and is no longer
# available in newer NumPy releases.
pred[label_id] = logit_model.predict(test_x).astype(str)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# Start the submission frame as an independent copy of the 'restaurant' column.
output = res_feature_test.loc[:, ["restaurant"]].copy()

In [19]:
# Fit one logistic regression per label (with its own C) on the training data
# and store the test-set predictions as a 'label_<id>' column of `output`.
C_list = [0.01, 0.01, 0.01, 0.05, 0.05, 0.01, 0.01, 0.1, 0.01]
for label_id in range(9):
    logit_model = LogisticRegression(penalty='l2', C=C_list[label_id])

    train_y = res_df['label_%d' % label_id]
    logit_model.fit(train_x, train_y)
    y = logit_model.predict(test_x)

    # Assign the prediction array directly as a column. Unlike joining a new
    # frame, this is positional (no reliance on matching index labels) and
    # can be re-run: an existing column is overwritten instead of raising on
    # overlapping column names.
    output['label_%d' % label_id] = y

In [26]:
# Spot-check the restaurant id in the first row of the submission frame.
# Row 0 is addressed explicitly so the cell does not depend on a loop
# variable left behind by another cell.
output.iloc[0, 0]

'8i7dh'

In [27]:
# write to submission file
with open(r'/res_deep_avg/mean_submission.csv','w') as fw:
    fw.write('business_id,labels\n')
    for ind in output.index:
        pred = list(output.iloc[ind,1:])
        id=output.iloc[ind,0]
        label = [str(i) for i in range(9) if pred[i]!=0]
        line = id+','+' '.join(label)+'\n'
        fw.write(line)