# PART3: Multilabel-Classification using scikit-learn SVM

## Load Data

In [176]:
import numpy as np
import pandas as pd 
import time
import os

In [181]:
def get_biz_features(biz_id):
    path='../dataset/business_features/'+str(biz_id)+'.features'
    if(not os.path.isfile(path)):
        return False

    fh=open(path)
    contents = [line.rstrip() for line in fh.readlines()]
    fh.close()
    return [round(float(contents[i]),9) for i in range(len(contents))]


In [178]:
train_labels = pd.read_csv('../dataset/train.csv').dropna()
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels.set_index('business_id', inplace=True)
biz_ids = train_labels.index.unique()
print ("Number of business: ", len(biz_ids) ,   "(4 business with missing labels are dropped)")

Number of business:  1996 (4 business with missing labels are dropped)


In [199]:
t= time.time()
## For each business, compute a feature vector 
df = pd.DataFrame(columns=['business','label','feature vector'])
index = 0
for biz in biz_ids:
    biz=int(biz)
    mean_feature=get_biz_features(biz)

    if(mean_feature):
        label = train_labels.loc[biz]['labels']
        mean_feature=get_biz_features(biz)
        df.loc[index] = [biz, label, mean_feature]
        index+=1
        if index%100==0:
            print ("Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

with open("../dataset/train_biz_fc7features.csv",'w') as f:  
    df.to_csv(f, index=False)

Buisness processed:  100 Time passed:  1.7 sec
Buisness processed:  200 Time passed:  3.2 sec
Buisness processed:  300 Time passed:  4.9 sec
Buisness processed:  400 Time passed:  6.6 sec
Buisness processed:  500 Time passed:  8.1 sec


In [205]:
# Check file content
train_business = pd.read_csv('../dataset/train_biz_fc7features.csv')
print (train_business.shape)
train_business[0:5]


(583, 3)


Unnamed: 0,business,label,feature vector
0,1001.0,"(0, 1, 6, 8)","[0.254280001, 0.311022013, 0.082138397, 0.2171..."
1,101.0,"(1, 2, 3, 4, 5, 6)","[0.356665164, 0.289317936, 0.308460176, 0.2959..."
2,1012.0,"(1, 2, 3, 5, 6)","[0.258892, 0.353178799, 0.575188816, 0.4061788..."
3,1015.0,"(1, 5, 6, 7)","[0.466114551, 0.394962043, 0.558969557, 0.1906..."
4,1022.0,"(0, 2, 3, 5, 8)","[0.419218779, 0.315308422, 0.484456778, 0.4717..."


In [206]:
train_df = pd.read_csv("../dataset/train_biz_fc7features.csv")
# test_df  = pd.read_csv(data_root+"test_biz_fc7features.csv")

y_train = train_df['label'].values
X_train = train_df['feature vector'].values
# X_test = test_df['feature vector'].values

def convert_label_to_array(str_label):
    str_label = str_label[1:-1]
    str_label = str_label.split(',')
    return [int(x) for x in str_label if len(x)>0]

def convert_feature_to_vector(str_feature):
    str_feature = str_feature[1:-1]
    str_feature = str_feature.split(',')
    return [ float(x) for x in str_feature]


y_train = np.array([convert_label_to_array(y) for y in train_df['label']])
X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']])
# X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])


In [207]:
print ("X_train: ", X_train.shape)
print ("y_train: ", y_train.shape)
# print ("X_test: ", X_test.shape)
print ("train_df:")
train_df[0:5]


X_train:  (583, 2048)
y_train:  (583,)
train_df:


Unnamed: 0,business,label,feature vector
0,1001.0,"(0, 1, 6, 8)","[0.254280001, 0.311022013, 0.082138397, 0.2171..."
1,101.0,"(1, 2, 3, 4, 5, 6)","[0.356665164, 0.289317936, 0.308460176, 0.2959..."
2,1012.0,"(1, 2, 3, 5, 6)","[0.258892, 0.353178799, 0.575188816, 0.4061788..."
3,1015.0,"(1, 5, 6, 7)","[0.466114551, 0.394962043, 0.558969557, 0.1906..."
4,1022.0,"(0, 2, 3, 5, 8)","[0.419218779, 0.315308422, 0.484456778, 0.4717..."


## Train a SVM using 80% training data, and assess performance(F1-score)

In [208]:
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import time
t=time.time()

In [210]:
mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state)
np.nan_to_num(X_ptrain)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_ptrain, y_ptrain)

y_ppredict = classifier.predict(X_ptest)

print ("Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Time passed:  44.6 sec


In [211]:
print ("Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3])
print ("\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3]))

Samples of predicted labels (in binary matrix):
 [[0 1 1 1 0 1 1 0 0]
 [0 1 1 1 0 1 1 1 1]
 [0 1 1 0 1 1 1 0 0]]

Samples of predicted labels:
 [(1, 2, 3, 5, 6), (1, 2, 3, 5, 6, 7, 8), (1, 2, 4, 5, 6)]


In [212]:
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,39,66,67,52,46,87,89,39,66,117
biz ratio,33%,56%,57%,44%,39%,74%,76%,33%,56%,100%


In [213]:
from sklearn.metrics import f1_score

print ("F1 score: ", f1_score(y_ptest, y_ppredict, average='micro') )
print ("Individual Class F1 score: ", f1_score(y_ptest, y_ppredict, average=None))

F1 score:  0.767200754006
Individual Class F1 score:  [ 0.61111111  0.82608696  0.79069767  0.43298969  0.69333333  0.87804878
  0.9039548   0.57142857  0.84848485]


In [218]:
print("P test",y_ptest.shape)
print("y_ppredict ",y_ppredict.shape)
print(X_ptest[0])
print(y_ptest[0])
print(y_ppredict[0])

P test (117, 9)
y_ppredict  (117, 9)
[ 0.27695349  0.39839849  0.33369449 ...,  0.96580249  0.47117549
  0.33008051]
[0 1 1 0 0 1 1 0 1]
[0 1 1 1 0 1 1 0 0]


## Re-Train a SVM using all training data, and make predictions on test set

In [8]:
## Uncomment if skip previous train
#from sklearn import svm
#from sklearn.preprocessing import label_binarize
#from sklearn.multiclass import OneVsRestClassifier
#from sklearn.preprocessing import MultiLabelBinarizer
#import time

t = time.time()

mlb = MultiLabelBinarizer()
y_train= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

print ("Time passed: ", "{0:.1f}".format(time.time()-t), "sec")

Time passed:  852.3 sec


In [9]:
X_test.shape

(10000, 4096)

In [10]:
test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv")
df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(data_root+"submission_fc7.csv",'w') as f:
    df.to_csv(f, index=False)    
    

In [11]:
statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_predict, axis=0), len(y_predict))
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_predict) 
statistics

Unnamed: 0,attribuite 0,attribuite 1,attribuite 2,attribuite 3,attribuite 4,attribuite 5,attribuite 6,attribuite 7,attribuite 8,num_biz
biz count,1408,6211,6991,5238,2034,8293,9043,1635,6104,10000
biz ratio,14%,62%,70%,52%,20%,83%,90%,16%,61%,100%


In [None]:
#LB score: 0.76437 (use fc7 layer)
#LB score: 0.73053 (use prob layer)