Build logistic regression directly at the image level using deep learning features

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
# from pyspark.mllib.feature import HashingTF, IDF
# from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from StringIO import StringIO
from PIL import Image
import numpy as np
import csv
import os, tempfile
import boto
import datetime

In [2]:
# AWS S3 credentials.
# Read from the standard AWS environment variables when they are set, so the
# secrets do not have to be pasted into the notebook; the fallback is the
# original empty placeholder.
AWS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

# Pass the credentials to the Hadoop configuration used for s3n:// paths.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_KEY)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

In [23]:
# Read the deep learning (fc7) features of the training images from S3 as lines of text.
train_features_path = 's3n://amlyelp/fc7features/train_image_fc7features/'
features_train = sc.textFile(train_features_path)

In [24]:
def _split_train_line(line):
    """Split a 'key|values' text line on '|' into a tuple."""
    return tuple(line.split('|'))


def _parse_train_vector(csv_values):
    """Turn a comma-separated string of numbers into a float numpy array."""
    return np.array(csv_values.split(','), dtype=float)


# (image id, feature vector) pairs for the training images.
pid_features_train = features_train.map(_split_train_line).mapValues(_parse_train_vector)
pid_features_train.take(2)

[(u'11598', array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
          2.57181787,  0.        ])),
 (u'295391',
  array([ 0.       ,  0.       ,  0.       , ...,  1.5179913,  0.       ,  0.       ]))]

In [25]:
# Map each training image to its restaurant and its label vector.
label_lines = sc.textFile('s3n://amlyelp/pic_label_trainall.csv')
# The first line of the CSV is its header; every line equal to it is dropped below.
first_line = label_lines.take(1)[0]


def _to_labelled_record(row):
    """Parse one CSV row into (image id, {'restaurant': ..., 'labels': int array})."""
    fields = row.split(',')
    return (fields[0], {'restaurant': fields[1], 'labels': np.array(fields[2:11], dtype=int)})


train_label = label_lines.filter(lambda row: row != first_line).map(_to_labelled_record)
train_label.take(1)

[(u'204149',
  {'labels': array([0, 0, 0, 1, 0, 0, 0, 0, 1]), 'restaurant': u'3034'})]

In [27]:
# Join training image labels with their features, keyed by image id.
# An inner join keeps only images present in both RDDs. With leftOuterJoin an
# image without a feature row would carry None as its features, which the
# LabeledPoint construction used for training cannot convert to a vector.
id_label_feature_train = train_label.join(pid_features_train)
id_label_feature_train.take(2)

[(u'378466',
  ({'labels': array([0, 0, 0, 1, 0, 0, 0, 0, 0]), 'restaurant': u'227'},
   array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))),
 (u'35540',
  ({'labels': array([0, 1, 1, 0, 1, 1, 1, 1, 0]), 'restaurant': u'2611'},
   array([ 0.        ,  0.        ,  0.        , ...,  5.03832006,
           0.        ,  0.        ])))]

In [28]:
# Read the test image features from S3 and parse them into (image id, feature vector) pairs.
features_test = sc.textFile('s3n://amlyelp/fc7features/test_image_fc7features/')


def _split_test_line(line):
    """Split a 'key|values' text line on '|' into a tuple."""
    return tuple(line.split('|'))


def _parse_test_vector(csv_values):
    """Turn a comma-separated string of numbers into a float numpy array."""
    return np.array(csv_values.split(','), dtype=float)


pid_features_test = features_test.map(_split_test_line).mapValues(_parse_test_vector)
pid_features_test.take(2)

[(u'306310', array([ 0.,  0.,  0., ...,  0.,  0.,  0.])),
 (u'414079',
  array([ 1.7991451,  0.       ,  0.       , ...,  0.       ,  0.       ,  0.       ]))]

In [1]:
# Build the logistic regression training set for one label (label index i).
i = 0
# The map is evaluated lazily, so the label index is bound as a default
# argument; the closure then cannot pick up a later value of the global `i`.
train = id_label_feature_train.map(
    lambda x, label_idx=i: LabeledPoint(x[1][0]['labels'][label_idx], x[1][1])
).repartition(120).cache()
# train.take(2)

In [30]:
# Fit a logistic regression with L-BFGS on the training points, with an
# intercept term and with validateData turned off.
lrm = LogisticRegressionWithLBFGS.train(
    train,
    intercept=True,
    validateData=False
)

In [33]:
# Clear the model threshold before scoring the test images.
lrm.clearThreshold()


def _score_features(feature_vector):
    """Score one feature vector with the trained model."""
    return lrm.predict(feature_vector)


# (image id, model output) for every test image.
prediction = pid_features_test.mapValues(_score_features)
prediction.take(7)

[(u'306310', 0.1442900951215041),
 (u'414079', 0.7516735861272454),
 (u'235737', 0.04615771435067375),
 (u'6011', 0.0200634977117878),
 (u'349578', 0.42398592348472686),
 (u'42644', 0.041748559818473606),
 (u'358960', 0.11008557094520433)]

In [36]:
def _pair_to_csv_line(pair):
    """Render one (key, value) prediction pair as a comma-separated line."""
    return ','.join(np.array(pair, dtype=np.str))


# Write the predictions to S3 as text, one line per pair.
csv_lines = prediction.map(_pair_to_csv_line)
csv_lines.saveAsTextFile('s3n://amlyelp/fc7features_pred/prob.csv')

In [37]:
def _prediction_to_line(pair):
    """Render one (key, value) prediction pair as a comma-separated line."""
    return ','.join(np.array(pair, dtype=np.str))


# Bring all formatted prediction lines back to the driver as a Python list.
prediction_list = prediction.map(_prediction_to_line).collect()

In [39]:
# Concatenate the collected prediction lines into one newline-separated string.
tmpf = '\n'.join(prediction_list)

In [41]:
# Number of prediction lines collected to the driver.
len(prediction_list)

237152

In [42]:
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Read the credentials from the standard AWS environment variables when they
# are set, so the secrets do not have to be pasted into the notebook; the
# fallback is the original empty placeholder.
AWS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

# Open a boto connection to S3 and look up the project bucket.
conn = S3Connection(AWS_KEY, AWS_SECRET, host='s3.amazonaws.com')
pb = conn.get_bucket('amlyelp')

# Key object in that bucket; its name is assigned before each upload.
k = Key(pb)

In [43]:
# s3_file = "image_cluster_%s.csv" % str(datetime.datetime.now()).replace(' ', '_').replace('.','_')
# s3_path = "subset/image_cluster_result"
# file_name_to_use_in_s3 = "%s/%s"%(s3_path, s3_file)
# file_name_to_use_in_s3 = 'fc7features_pred/prob_trial.csv'
# k.name = file_name_to_use_in_s3
# k.set_contents_from_string(tmpf)

5211857

In [44]:
# Train and predict for each of the 9 labels, uploading one CSV of test-image
# predictions per label to S3.
for i in range(9):
    # The map is evaluated lazily, so the label index is bound as a default
    # argument; the closure then cannot depend on a later value of `i`.
    train = id_label_feature_train.map(
        lambda x, label_idx=i: LabeledPoint(x[1][0]['labels'][label_idx], x[1][1])
    ).repartition(120).cache()

    try:
        lrm = LogisticRegressionWithLBFGS.train(train, intercept=True, validateData=False)
    finally:
        # Release this label's cached training set explicitly once training is
        # over (or has failed), rather than relying on `del` and garbage
        # collection, so cached copies do not pile up across the 9 iterations.
        train.unpersist()

    lrm.clearThreshold()
    prediction = pid_features_test.mapValues(lambda x: lrm.predict(x))
    # Collect the "key,value" lines to the driver and join them into one string.
    prediction_list = prediction.map(lambda x: ','.join(np.array(x, dtype=np.str))).collect()
    tmpf = '\n'.join(prediction_list)
    del prediction_list

    # Upload this label's predictions through the shared boto Key object.
    file_name_to_use_in_s3 = 'fc7features_pred/prob_%d.csv' % i
    k.name = file_name_to_use_in_s3
    k.set_contents_from_string(tmpf)
    del tmpf
    del train
    del lrm