Build logistic regression models on the percentage of photos falling in each cluster for every restaurant,  
and use them to predict the restaurant labels.

Each photo's cluster is assigned by a k-means model fitted on deep-learning image features.

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf

import numpy as np
import pandas as pd
import os, tempfile
import boto
import datetime
from collections import Counter

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

In [2]:
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from pyspark.mllib.clustering import KMeans, KMeansModel

In [3]:
# AWS S3 credentials.
# The keys are read from the environment so they never have to be typed into
# (and saved with) the notebook. The empty-string fallback equals the
# placeholder values that were hard-coded here before.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably the
# PySpark kernel provides it -- confirm before running on a plain kernel.
AWS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_KEY)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

In [4]:
# Restore the previously saved k-means model from S3.
path = ('s3n://amlyelp/subset/model/kmeans/'
        'deep_feature_2016-03-01_19_20_50.068026/')
clusters = KMeansModel.load(sc, path)

In [5]:
# Load the training CSV (photo_id, restaurant_id, then the label columns)
# and turn every data row into a list of string fields.
label_path = 's3n://amlyelp/subset/pic_label_subbybus_train.csv'
photo_res_label = sc.textFile(label_path)
first_line = photo_res_label.take(1)[0]  # header row
photo_res_label = (
    photo_res_label
    .filter(lambda row: row != first_line)  # drop every line equal to the header
    .map(lambda row: row.split(','))
)
photo_res_label.take(1)

[[u'5', u'3305', u'0', u'0', u'0', u'1', u'0', u'0', u'1', u'0', u'0']]

In [6]:
# One label list per restaurant: key every row by restaurant_id and keep the
# first label list encountered for that key (later ones are discarded).
res_label = (
    photo_res_label
    .map(lambda row: (row[1], row[2:]))
    .reduceByKey(lambda kept, ignored: kept)
)
res_label.take(2)

[(u'216', [u'1', u'0', u'0', u'1', u'0', u'1', u'0', u'0', u'1']),
 (u'1788', [u'1', u'0', u'0', u'0', u'0', u'1', u'0', u'0', u'1'])]

In [7]:
# Load the saved cluster assignment of every training picture as raw text lines.
cluster_path = ('s3n://amlyelp/subset/image_cluster_result/'
                'image_cluster_deep_feature_2016-03-01_19_21_17_105238/')

photo_cluster_map = sc.textFile(cluster_path)

In [8]:
# parse training picture clusters
def _parse_cluster_line(line):
    """Convert one saved text line into a (photo_id, cluster) pair of strings.

    The "(u", quote and ")" characters are stripped before splitting on the
    comma, which suggests each line is the text form of a Python tuple.
    """
    fields = line.replace("(u", '').replace("'", '').replace(')', '').split(',')
    return (fields[0].strip(), fields[1].strip())


photo_cluster_map = photo_cluster_map.map(_parse_cluster_line)
photo_cluster_map.take(2)

[(u'448995', u'32'), (u'414570', u'29')]

In [9]:
# pass picture cluster to restaurant
# An inner join is used on purpose: a left outer join would emit
# (restaurant_id, None) for any photo without a cluster assignment, and the
# int(cluster) conversion applied when the ratios are built cannot handle None.
res_cluster = (
    photo_res_label
    .map(lambda x: (x[0], x[1]))   # (photo_id, restaurant_id)
    .join(photo_cluster_map)       # (photo_id, (restaurant_id, cluster))
    .values()                      # (restaurant_id, cluster)
)
res_cluster.take(2)

[(u'227', u'35'), (u'3485', u'10')]

In [10]:
# combine clusters by restaurant and
# calculate ratio of each picture cluster under each restaurant as features
#
# groupByKey gathers all cluster ids of a restaurant without rebuilding a list
# on every merge. len(ids) is the number of pictures of that restaurant, so it
# serves as the denominator and no per-entry re-summing of the counts is needed.
agg_res_cluster = res_cluster.groupByKey().mapValues(
    lambda ids: {int(cluster): float(count) / len(ids)
                 for cluster, count in Counter(ids).items()})
agg_res_cluster.take(1)

[(u'3775',
  {0: 0.04,
   5: 0.16,
   8: 0.04,
   12: 0.04,
   13: 0.04,
   14: 0.04,
   16: 0.04,
   17: 0.04,
   19: 0.08,
   24: 0.24,
   31: 0.04,
   34: 0.04,
   40: 0.04,
   43: 0.04,
   46: 0.04,
   47: 0.04})]

In [11]:
# Attach the label list to each restaurant's cluster-ratio features and
# flatten the join result into (restaurant_id, ratios, labels) records.
res_cluster_label = (
    agg_res_cluster
    .leftOuterJoin(res_label)
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1]))
)
res_cluster_label.take(1)

[(u'590',
  {0: 0.07692307692307693,
   2: 0.02564102564102564,
   4: 0.02564102564102564,
   5: 0.07692307692307693,
   8: 0.02564102564102564,
   9: 0.05128205128205128,
   12: 0.02564102564102564,
   14: 0.02564102564102564,
   18: 0.02564102564102564,
   20: 0.05128205128205128,
   23: 0.1282051282051282,
   25: 0.10256410256410256,
   29: 0.02564102564102564,
   30: 0.02564102564102564,
   31: 0.02564102564102564,
   41: 0.07692307692307693,
   44: 0.05128205128205128,
   45: 0.10256410256410256,
   46: 0.02564102564102564,
   49: 0.02564102564102564},
  [u'1', u'1', u'0', u'0', u'0', u'1', u'1', u'0', u'1'])]

In [12]:
# sanity check: number of records (one per restaurant key) in the merged training set
res_cluster_label.count()

500

In [13]:
# Load the deep-learning features of the test pictures.
test_feature_path = 's3n://amlyelp/subset/test_subset_deep_feature.csv'
test_feature = sc.textFile(test_feature_path)
test_first = test_feature.take(1)[0]  # header row
test_feature = test_feature.filter(lambda line: line != test_first)


def _parse_test_feature(line):
    """Turn one CSV line into (photo_id, numpy array of feature values).

    Brackets and double quotes are removed first; the second comma-separated
    field holds the feature values separated by single spaces.
    """
    fields = line.replace('[', '').replace(']', '').replace('"', '').split(',')
    values = [float(token) for token in fields[1].split(' ')]
    return (fields[0], np.array(values))


test_feature_parsed = test_feature.map(_parse_test_feature)
test_feature_parsed.take(1)

[(u'245805',
  array([ 0.      ,  0.35613 ,  0.      , ...,  0.      ,  0.437566,  0.      ]))]

In [14]:
# Assign every test picture to a cluster with the loaded k-means model,
# keeping the photo_id as the key.
test_image_cluster = test_feature_parsed.mapValues(
    lambda features: clusters.predict(features))
test_image_cluster.take(2)

[(u'245805', 30), (u'363726', 49)]

In [15]:
# Load the test CSV (photo_id, restaurant_id, then the label columns)
# and turn every data row into a list of string fields.
test_label_path = 's3n://amlyelp/subset/pic_label_subbybus_test.csv'
test_photo_res_label = sc.textFile(test_label_path)
test_first_line = test_photo_res_label.take(1)[0]  # header row
test_photo_res_label = (
    test_photo_res_label
    .filter(lambda row: row != test_first_line)  # drop every line equal to the header
    .map(lambda row: row.split(','))
)
test_photo_res_label.take(1)

[[u'80', u'1114', u'0', u'1', u'1', u'0', u'1', u'1', u'1', u'1', u'0']]

In [16]:
# One label list per test restaurant: key every row by restaurant_id and keep
# the first label list encountered for that key (later ones are discarded).
test_res_label = (
    test_photo_res_label
    .map(lambda row: (row[1], row[2:]))
    .reduceByKey(lambda kept, ignored: kept)
)
test_res_label.take(1)

[(u'881', [u'1', u'1', u'0', u'1', u'0', u'1', u'0', u'0', u'1'])]

In [17]:
# pass test picture cluster to restaurant
# An inner join is used on purpose: a left outer join would emit
# (restaurant_id, None) for any photo without a predicted cluster, and the
# int(cluster) conversion applied when the ratios are built cannot handle None.
test_res_cluster = (
    test_photo_res_label
    .map(lambda x: (x[0], x[1]))   # (photo_id, restaurant_id)
    .join(test_image_cluster)      # (photo_id, (restaurant_id, cluster))
    .values()                      # (restaurant_id, cluster)
)
test_res_cluster.take(3)

[(u'1160', 48), (u'485', 16), (u'1114', 26)]

In [18]:
# combine clusters by restaurant and
# calculate ratio of each picture cluster under each restaurant as features
#
# groupByKey gathers all cluster ids of a restaurant without rebuilding a list
# on every merge. len(ids) is the number of pictures of that restaurant, so it
# serves as the denominator and no per-entry re-summing of the counts is needed.
test_agg_res_cluster = test_res_cluster.groupByKey().mapValues(
    lambda ids: {int(cluster): float(count) / len(ids)
                 for cluster, count in Counter(ids).items()})

In [19]:
# Attach the label list to each test restaurant's cluster-ratio features and
# flatten the join result into (restaurant_id, ratios, labels) records.
test_res_cluster_label = (
    test_agg_res_cluster
    .leftOuterJoin(test_res_label)
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1]))
)

In [20]:
# train logistic regression based on percentage of cluster under each restaurant
# make prediction on test set and show f1 score
#
# One binary model is trained per label column. For every label the loop prints
# "<label index> <accuracy> <f1>" and stores the predictions in `prediction`
# as column 'label_<index>'.
NUM_LABELS = 9     # number of label columns indexed from each record's label list
NUM_CLUSTERS = 50  # dimension of the sparse cluster-ratio feature vector

prediction = pd.DataFrame(test_res_cluster_label.map(lambda x: x[0]).collect(),columns=['restaurant_id'])
for i in range(NUM_LABELS):
    # The training set is cached because the optimizer makes several passes over it.
    train = res_cluster_label.map(
        lambda x: LabeledPoint(int(x[2][i]), SparseVector(NUM_CLUSTERS, x[1]))).cache()
    try:
        lrm = LogisticRegressionWithLBFGS.train(train, intercept=True, validateData=False)
    finally:
        # Release the cache; otherwise every iteration leaves another cached RDD behind.
        train.unpersist()

    # The test set is scanned once per label, so it is not cached. The
    # restaurant_id travels with each prediction so the result can be matched to
    # the rows of `prediction` by id instead of by collect() order.
    scored = test_res_cluster_label.map(
        lambda x: (x[0], lrm.predict(SparseVector(NUM_CLUSTERS, x[1])), int(x[2][i]))
    ).collect()

    predicted = [row[1] for row in scored]
    actual = [row[2] for row in scored]
    pred_by_restaurant = dict((row[0], row[1]) for row in scored)

    current_f1 = f1_score(y_true=actual, y_pred=predicted)
    current_accuracy = accuracy_score(y_true=actual, y_pred=predicted)
    prediction[('label_%d' % i)] = prediction['restaurant_id'].map(pred_by_restaurant)

    # To evaluate probabilities instead of hard labels, call lrm.clearThreshold()
    # before predicting and score the result with roc_auc_score.

    # Single pre-formatted string: prints the same text under Python 2 and 3.
    print("%d %s %s" % (i, current_accuracy, current_f1))

0 0.777777777778 0.666666666667
1 0.828282828283 0.841121495327
2 0.79797979798 0.818181818182
3 0.676767676768 0.589743589744
4 0.868686868687 0.793650793651
5 0.79797979798 0.857142857143
6 0.787878787879 0.857142857143
7 0.79797979798 0.72972972973
8 0.838383838384 0.859649122807


In [31]:
# combined f1 score for all labels
def mlb_f1(y_true, y_pred):
    """F1 score between two collections of labels, compared as sets.

    y_true: iterable of the true labels of one sample (duplicates are ignored).
    y_pred: iterable of the predicted labels of the same sample.

    Returns 2*p*r/(p+r) as a float, where p and r are precision and recall of
    the predicted set against the true set. Returns 0.0 when the two sets share
    no label, including when either of them is empty.
    """
    y_true = set(y_true)
    y_pred = set(y_pred)
    tp = len(y_true & y_pred)
    # Bail out before dividing: with no common label, precision and/or recall
    # may have a zero denominator (empty prediction or empty truth).
    if tp == 0:
        return 0.
    fp = len(y_pred - y_true)
    fn = len(y_true - y_pred)
    p = float(tp) / (tp + fp)
    r = float(tp) / (tp + fn)
    return 2 * p * r / (p + r)

def mean_f1(y_true, y_pred):
    """Average the per-sample label-set F1 (mlb_f1) over all samples.

    y_true, y_pred: parallel sequences; element k of each is the collection of
    labels of sample k. Pairs are formed with zip, so surplus entries of the
    longer sequence are ignored.

    Returns the mean score as a float, or 0.0 when there are no pairs at all.
    """
    # A real list (not a lazy map object) so len() works on Python 2 and 3 alike.
    score_list = [mlb_f1(true_labels, pred_labels)
                  for true_labels, pred_labels in zip(y_true, y_pred)]
    if not score_list:
        return 0.
    return sum(score_list) / len(score_list)

In [40]:
# Convert the 0/1 indicator rows into lists of active label indices.
y_pred = [[i for i, flag in enumerate(arr) if flag != 0] for arr in np.array(prediction.iloc[:,1:])]

# Look the true labels up by restaurant_id so that they line up with the rows of
# `prediction`; a second collect() is not guaranteed to return the restaurants
# in the same order as the one that filled `prediction`.
true_labels_by_restaurant = dict(test_res_cluster_label.map(lambda x: (x[0], x[2])).collect())
y_true_array = np.array(
    [true_labels_by_restaurant[res_id] for res_id in prediction['restaurant_id']]).astype(int)
y_true = [[i for i, flag in enumerate(arr) if flag != 0] for arr in y_true_array]

In [41]:
# overall f1 score: F1 between the true and predicted label sets of each
# test restaurant, averaged over all test restaurants
mean_f1(y_true, y_pred)

0.7235923448044658

In [29]:
# show w/ and w/o label ratio in test set
test_labels_array = np.array(test_res_cluster_label.map(lambda x: x[2]).collect())
# transpose so that each Counter covers one label column across all restaurants
test_labels_count = [Counter(column) for column in test_labels_array.T]
# Every column has one entry per row, so the denominator is the row count;
# it is computed once instead of re-summing the counts for every entry.
n_test_rows = float(len(test_labels_array))
test_labels_freq = [{value: count / n_test_rows for value, count in counts.items()}
                    for counts in test_labels_count]

In [30]:
# one dict per label column: fraction of test restaurants having each label value
test_labels_freq

[{u'0': 0.6363636363636364, u'1': 0.36363636363636365},
 {u'0': 0.43434343434343436, u'1': 0.5656565656565656},
 {u'0': 0.45454545454545453, u'1': 0.5454545454545454},
 {u'0': 0.5151515151515151, u'1': 0.48484848484848486},
 {u'0': 0.6868686868686869, u'1': 0.31313131313131315},
 {u'0': 0.32323232323232326, u'1': 0.6767676767676768},
 {u'0': 0.25252525252525254, u'1': 0.7474747474747475},
 {u'0': 0.6161616161616161, u'1': 0.3838383838383838},
 {u'0': 0.42424242424242425, u'1': 0.5757575757575758}]