Build a logistic regression model on the percentage of photos falling into each cluster for every restaurant,  
and use it to predict restaurant labels.

Photo clusters are predicted by a k-means model on image color features.

In [113]:
from pyspark import SparkContext
from pyspark import SparkConf

import numpy as np
import pandas as pd
import os, tempfile
import boto
import datetime
from collections import Counter

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

In [114]:
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from pyspark.mllib.clustering import KMeans, KMeansModel

In [115]:
# AWS S3 credentials for the Hadoop s3n:// filesystem used by sc.textFile / model loading.
# They are read from the standard AWS environment variables so that no secret
# is stored in the notebook; an unset variable falls back to the empty string.

AWS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_KEY)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

In [116]:
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Credentials for the boto S3 client, read from the standard AWS environment
# variables so that no secret is stored in the notebook (empty string if unset).
AWS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

conn = S3Connection(AWS_KEY, AWS_SECRET, host='s3.amazonaws.com')
pb = conn.get_bucket('amlyelp')

# Key handle on the bucket; used by the (currently disabled) upload cell at the end.
k = Key(pb)

Read data and saved model

In [117]:
# Load the previously saved k-means model from S3.
# The directory name indicates the color-feature model (color_feature_3).
path = 's3n://amlyelp/subset/model/kmeans/color_feature_3_2016-03-05_23_09_58.393414/'
clusters = KMeansModel.load(sc, path)

In [118]:
# Training photos and the cluster assigned to each of them.
cluster_path = 's3n://amlyelp/subset/image_cluster_result/image_cluster_color_feature_3_2016-03-05_23_12_29_238662/'
# cluster_path = 's3n://amlyelp/subset/image_cluster_result/image_cluster_deep_feature_2016-03-01_19_21_17_105238/'


def _parse_cluster_line(line):
    """Turn one saved text line into a (photo_id, cluster_id) pair of strings.

    The line is presumably the printed form of a tuple such as
    "(u'28034', 14)" -- TODO confirm against the writer notebook.
    The tuple decoration is removed, then the line is split on the comma.
    """
    for token in ("(u", "'", ')'):
        line = line.replace(token, '')
    fields = line.split(',')
    return (fields[0].strip(), fields[1].strip())


photo_cluster_map = sc.textFile(cluster_path).map(_parse_cluster_line)
photo_cluster_map.take(2)

[(u'28034', u'14'), (u'28863', u'31')]

In [119]:
# Sanity check: number of (photo_id, cluster_id) pairs that were loaded.
photo_cluster_map.count()

61718

In [120]:
# Read the test-set image features.
# NOTE(review): the original comment said "deep feature", but the file is the
# F3 feature file and the k-means model loaded above is the color-feature one.
test_feature_path = 's3n://amlyelp/subset/F3_testfinal.csv'
test_feature = sc.textFile(test_feature_path)

# Each line is "<photo_id>,<space separated numbers>"; the numbers become a
# float64 vector.  np.float64 replaces the np.float alias, which was merely the
# builtin float and has been removed from recent NumPy releases.
test_feature_parsed = test_feature.map(lambda x: x.split(','))\
                  .map(lambda x: (x[0], np.array(x[1].split(' ')).astype(np.float64)))
test_feature_parsed.take(1)

[(u'80', array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))]

Read labels

In [121]:
# Training labels: one CSV row per photo, split into
# [photo_id, res_id, label, label, ...].
label_path = 's3n://amlyelp/subset/pic_label_subbybus_train.csv'
train_label_lines = sc.textFile(label_path)
# The first line is the CSV header; every line equal to it is dropped.
first_line = train_label_lines.take(1)[0]
photo_res_label = train_label_lines\
    .filter(lambda row: row != first_line)\
    .map(lambda row: row.split(','))
photo_res_label.take(1)

[[u'5', u'3305', u'0', u'0', u'0', u'1', u'0', u'0', u'1', u'0', u'0']]

In [122]:
# Test labels: one CSV row per photo, split into
# [photo_id, res_id, label, label, ...].
test_label_path = 's3n://amlyelp/subset/pic_label_subbybus_test.csv'
test_label_lines = sc.textFile(test_label_path)
# The first line is the CSV header; every line equal to it is dropped.
test_first_line = test_label_lines.take(1)[0]
test_photo_res_label = test_label_lines\
    .filter(lambda row: row != test_first_line)\
    .map(lambda row: row.split(','))
test_photo_res_label.take(1)

[[u'80', u'1114', u'0', u'1', u'1', u'0', u'1', u'1', u'1', u'1', u'0']]

Process training data

In [123]:
# One label vector per training restaurant: key the rows by res_id and keep a
# single label list per key (whichever row is seen first; the others are
# discarded, presumably because all photos of a restaurant share its labels).
res_label = photo_res_label\
    .map(lambda row: (row[1], row[2:]))\
    .reduceByKey(lambda kept, _other: kept)
res_label.take(2)

[(u'216', [u'1', u'0', u'0', u'1', u'0', u'1', u'0', u'0', u'1']),
 (u'1788', [u'1', u'0', u'0', u'0', u'0', u'1', u'0', u'0', u'1'])]

In [124]:
# Attach each photo's cluster to its restaurant: (photo_id, res_id) joined with
# (photo_id, cluster_id) gives (res_id, cluster_id).
# An inner join is used instead of leftOuterJoin: a photo without a cluster
# assignment would otherwise contribute a None cluster, which fails later when
# the cluster ids are converted with int().
res_cluster = photo_res_label.map(lambda x: (x[0], x[1])).join(photo_cluster_map).values()
res_cluster.take(2)

[(u'393', u'27'), (u'3241', u'19')]

In [125]:
# Gather the cluster ids of all photos of each restaurant into one list.
agg_res_cluster = res_cluster.combineByKey(lambda x: [x],
                                           lambda u, v: u+[v],
                                           lambda u1,u2: u1+u2)
# Feature vector = share of the restaurant's photos in each cluster:
# {cluster_id: count / number_of_photos}.
# len(ids) equals the sum of the Counter values, so the denominator is no
# longer recomputed with np.sum for every key, and .items() keeps the cell
# independent of the Python-2-only iteritems().
agg_res_cluster = agg_res_cluster.mapValues(
    lambda ids: {int(c): float(n)/len(ids) for c, n in Counter(ids).items()})
agg_res_cluster.take(1)

[(u'692',
  {1: 0.06060606060606061,
   4: 0.030303030303030304,
   5: 0.10606060606060606,
   6: 0.06060606060606061,
   8: 0.030303030303030304,
   12: 0.030303030303030304,
   16: 0.015151515151515152,
   19: 0.13636363636363635,
   21: 0.030303030303030304,
   22: 0.030303030303030304,
   23: 0.015151515151515152,
   24: 0.015151515151515152,
   26: 0.030303030303030304,
   27: 0.015151515151515152,
   29: 0.015151515151515152,
   30: 0.015151515151515152,
   31: 0.07575757575757576,
   33: 0.06060606060606061,
   43: 0.045454545454545456,
   48: 0.18181818181818182})]

In [126]:
# Put features and labels side by side for every training restaurant:
# (res_id, {cluster_id: ratio}, [labels]).  The join yields
# (res_id, (features, labels)); the nested pair is flattened into a 3-tuple.
res_features_and_labels = agg_res_cluster.leftOuterJoin(res_label)
res_cluster_label = res_features_and_labels.map(lambda kv: (kv[0],) + kv[1])
res_cluster_label.take(1)

[(u'120',
  {1: 0.14545454545454545,
   4: 0.03636363636363636,
   5: 0.07272727272727272,
   8: 0.01818181818181818,
   9: 0.03636363636363636,
   12: 0.14545454545454545,
   14: 0.05454545454545454,
   21: 0.03636363636363636,
   22: 0.03636363636363636,
   24: 0.05454545454545454,
   26: 0.05454545454545454,
   27: 0.07272727272727272,
   29: 0.01818181818181818,
   30: 0.03636363636363636,
   33: 0.03636363636363636,
   37: 0.05454545454545454,
   39: 0.01818181818181818,
   43: 0.07272727272727272},
  [u'0', u'1', u'1', u'0', u'0', u'1', u'1', u'0', u'1'])]

In [127]:
# Sanity check: number of training restaurants with features and labels.
res_cluster_label.count()

500

Process test data

In [128]:
# Assign every test photo to its nearest k-means centre:
# (photo_id, feature_vector) -> (photo_id, cluster_id).
test_image_cluster = test_feature_parsed.mapValues(lambda features: clusters.predict(features))
test_image_cluster.take(2)

[(u'80', 8), (u'95', 6)]

In [129]:
# One label vector per test restaurant: key the rows by res_id and keep a
# single label list per key (whichever row is seen first).
test_res_label = test_photo_res_label\
    .map(lambda row: (row[1], row[2:]))\
    .reduceByKey(lambda kept, _other: kept)
test_res_label.take(1)

[(u'881', [u'1', u'1', u'0', u'1', u'0', u'1', u'0', u'0', u'1'])]

In [130]:
# Attach each test photo's predicted cluster to its restaurant:
# (photo_id, res_id) joined with (photo_id, cluster_id) gives (res_id, cluster_id).
# An inner join is used instead of leftOuterJoin: a labelled photo with no
# feature row would otherwise contribute a None cluster, which fails later when
# the cluster ids are converted with int().
test_res_cluster = test_photo_res_label.map(lambda x: (x[0], x[1])).join(test_image_cluster).values()
test_res_cluster.take(3)

[(u'1160', 43), (u'485', 33), (u'1114', 1)]

In [131]:
# Gather the cluster ids of all photos of each test restaurant, then turn them
# into the share of photos per cluster: {cluster_id: count / number_of_photos}.
# len(ids) equals the sum of the Counter values, so the denominator is no
# longer recomputed with np.sum for every key, and .items() keeps the cell
# independent of the Python-2-only iteritems().
test_agg_res_cluster = test_res_cluster.combineByKey(lambda x: [x],
                                           lambda u, v: u+[v],
                                           lambda u1,u2: u1+u2)\
                                .mapValues(lambda ids: {int(c): float(n)/len(ids)
                                                        for c, n in Counter(ids).items()})

In [132]:
# Put features and labels side by side for every test restaurant:
# (res_id, {cluster_id: ratio}, [labels]).  The join yields
# (res_id, (features, labels)); the nested pair is flattened into a 3-tuple.
test_features_and_labels = test_agg_res_cluster.leftOuterJoin(test_res_label)
test_res_cluster_label = test_features_and_labels.map(lambda kv: (kv[0],) + kv[1])

Building models

In [133]:
# Train one binary logistic-regression model per label and predict that label
# for every test restaurant.
N_LABELS = 9     # range of the original loop; one model per label column
N_CLUSTERS = 50  # size of the sparse cluster-ratio feature vector

# NOTE(review): rows of `prediction` are matched to the per-label predictions
# by position only, i.e. this relies on every collect() over
# test_res_cluster_label returning the restaurants in the same order.
prediction = pd.DataFrame(test_res_cluster_label.map(lambda x: x[0]).collect(),columns=['restaurant_id'])
for i in range(N_LABELS):
    # The lambdas read `i` when the RDD is evaluated; both RDDs are consumed
    # inside this iteration, so they see the current label index.
    train = res_cluster_label.map(lambda x: LabeledPoint(int(x[2][i]), SparseVector(N_CLUSTERS, x[1]))).cache()
    test = test_res_cluster_label.map(lambda x: LabeledPoint(int(x[2][i]), SparseVector(N_CLUSTERS, x[1]))).cache()
    lrm = LogisticRegressionWithLBFGS.train(train, intercept=True, validateData=False)

    # Despite its name, element 0 holds the predictions and element 1 the true
    # labels.  list(...) keeps the result indexable on Python 3 as well.
    labelsAndPreds = list(zip(*test.map(lambda p: (lrm.predict(p.features), p.label)).collect()))
    preds_i, labels_i = labelsAndPreds[0], labelsAndPreds[1]

    current_f1 = f1_score(y_true=labels_i, y_pred=preds_i)
    current_accuracy = accuracy_score(y_true=labels_i, y_pred=preds_i)
    prediction[('label_%d' % i)]=pd.Series(preds_i)

    # Release the cached RDDs; otherwise every iteration leaves two cached
    # datasets behind on the executors.
    train.unpersist()
    test.unpersist()

#     # if predict proba instead of labels
#     lrm.clearThreshold()
#     labelsAndProbs = zip(*test.map(lambda p: (lrm.predict(p.features), p.label)).collect())
#     current_auc = roc_auc_score(y_true=labelsAndProbs[1], y_score=labelsAndProbs[0])

    # Same text as the former `print i, acc, f1` statement, valid on Python 2 and 3.
    print('%s %s %s' % (i, current_accuracy, current_f1))

0 0.717171717172 0.588235294118
1 0.79797979798 0.818181818182
2 0.79797979798 0.811320754717
3 0.59595959596 0.574468085106
4 0.787878787879 0.655737704918
5 0.79797979798 0.84375
6 0.777777777778 0.845070422535
7 0.79797979798 0.705882352941
8 0.757575757576 0.796610169492


In [134]:
# score function
def mlb_f1(y_true, y_pred):
    """F1 score between two collections of labels, compared as sets.

    Parameters
    ----------
    y_true : iterable of hashable
        The true labels of one sample.
    y_pred : iterable of hashable
        The predicted labels of the same sample.

    Returns
    -------
    float
        2*p*r/(p+r) with p = precision and r = recall; 0.0 when the two sets
        share no label (which includes one or both of them being empty).
    """
    y_true = set(y_true)
    y_pred = set(y_pred)
    tp = len(y_true & y_pred)
    # Check for "no overlap" before dividing: with an empty y_true or y_pred
    # the precision/recall denominators are zero and would raise
    # ZeroDivisionError.
    if tp == 0:
        return 0.
    fp = len(y_pred-y_true)
    fn = len(y_true-y_pred)
    p = float(tp)/(tp+fp)
    r = float(tp)/(tp+fn)
    return 2*p*r/(p+r)
def mean_f1(y_true, y_pred):
    """Average of the per-sample mlb_f1 scores.

    Parameters
    ----------
    y_true : sequence of label collections
        True labels, one collection per sample.
    y_pred : sequence of label collections
        Predicted labels, paired with y_true by position.

    Returns
    -------
    float
        Mean of mlb_f1 over the paired samples; 0.0 when there are no samples
        (instead of raising ZeroDivisionError).
    """
    # A real list (rather than map) so that len() also works on Python 3.
    score_list = [mlb_f1(truth, pred) for truth, pred in zip(y_true, y_pred)]
    if not score_list:
        return 0.
    return sum(score_list)/float(len(score_list))

In [135]:
# Convert the 0/1 indicator rows into lists of active label indices, which is
# the input format of mean_f1.
predicted_rows = np.array(prediction.iloc[:,1:])  # column 0 is restaurant_id
y_pred = [[idx for idx, flag in enumerate(row) if flag != 0] for row in predicted_rows]
y_true_array = np.array(test_res_cluster_label.map(lambda x: x[2]).collect()).astype(int)
y_true = [[idx for idx, flag in enumerate(row) if flag != 0] for row in y_true_array]
mean_f1(y_true, y_pred)

0.7252071721768684

Baseline ratios

In [136]:
# Baseline: relative frequency of each label value ('0' / '1') per label column
# in the test set.
test_labels_array = np.array(test_res_cluster_label.map(lambda x: x[2]).collect())
test_labels_count = [Counter(arr) for arr in test_labels_array.T]
# Every column has one entry per restaurant, so the denominator is simply the
# number of rows; it is computed once instead of calling np.sum for every key,
# and .items() avoids the Python-2-only iteritems().
n_test_restaurants = float(len(test_labels_array))
test_labels_freq = [{value: count/n_test_restaurants for value, count in counts.items()}
                    for counts in test_labels_count]

In [137]:
# Display the per-label baseline frequencies.
test_labels_freq

[{u'0': 0.6363636363636364, u'1': 0.36363636363636365},
 {u'0': 0.43434343434343436, u'1': 0.5656565656565656},
 {u'0': 0.45454545454545453, u'1': 0.5454545454545454},
 {u'0': 0.5151515151515151, u'1': 0.48484848484848486},
 {u'0': 0.6868686868686869, u'1': 0.31313131313131315},
 {u'0': 0.32323232323232326, u'1': 0.6767676767676768},
 {u'0': 0.25252525252525254, u'1': 0.7474747474747475},
 {u'0': 0.6161616161616161, u'1': 0.3838383838383838},
 {u'0': 0.42424242424242425, u'1': 0.5757575757575758}]

Upload predictions to S3

In [138]:
# Disabled cell: writes the `prediction` table as CSV to a temporary file and
# uploads it to the bucket under subset/ via the boto key `k`.
# NOTE(review): the target file name says "deep", while this notebook uses the
# color-feature clusters -- confirm the name before re-enabling.
# s3_file = "deep_cluster_prediction.csv"
# s3_path = "subset"
# file_name_to_use_in_s3 = "%s/%s"%(s3_path, s3_file)
# k.name = file_name_to_use_in_s3
# with tempfile.TemporaryFile() as tmpf:
#     prediction.to_csv(tmpf, index=False)
#     tmpf.seek(0)
#     k.set_contents_from_file(tmpf)