# Step 3 OneVsRest Classifier

In [62]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, hamming_loss,jaccard_similarity_score

    

In [35]:
train =  pd.read_csv("train.csv", index_col=None)
#train =  pd.read_csv("train_wo_ZeroLabels.csv", index_col=None)

In [36]:
len(train)

229907

In [37]:
train = train[['text','categories']]


In [38]:

#subset data to 10%
train = train.sample(frac=0.1)
len(train)

22991

In [39]:
X_train = np.array(train['text'].values.astype('U'))
y_train =  np.array(train['categories'])

In [40]:
classifier = Pipeline([
    #Convert a collection of text documents to a matrix of token counts
    ('vectorizer', CountVectorizer(ngram_range=(1,1))),
    #TFIDF
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])


In [41]:
classifier.fit(X_train,y_train )



Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [42]:
#Save model
import pickle
pickle.dump(classifier, open('OneVsRestClassifier_10%subset', 'wb'))

In [43]:
# load the model from disk
#classifier = pickle.load(open('OneVsRestClassifier_10%subset', 'rb'))


In [74]:
test =  pd.read_csv("test.csv", index_col=None)
#test =  pd.read_csv("test_wo_ZeroLabels.csv", index_col=None)
print(len(test))
test = test[['text','categories']]
#subset data to test model
test = test.sample(frac=0.1)
print(len(test))

X_test = np.array(test['text'].values.astype('U'))
y_test =  np.array(test['categories'])


10702
1070


In [75]:
predicted = classifier.predict(X_test)



In [76]:
predicted

array(["['Vietnamese', 'Restaurants']", "['Mexican', 'Restaurants']",
       "['Steakhouses', 'Restaurants']", ...,
       "['Auto Repair', 'Automotive']",
       "['Buffets', 'Chinese', 'Restaurants']",
       "['Arts & Entertainment', 'Art Galleries', 'Shopping']"],
      dtype='<U180')

In [77]:
y_test

array(["['Bars', 'Vietnamese', 'Nightlife', 'Restaurants']",
       "['Mexican', 'Restaurants']", "['Steakhouses', 'Restaurants']", ...,
       "['Auto Repair', 'Automotive']", "['Buffets', 'Restaurants']",
       "['Professional Services', 'Videographers', 'Video/Film Production', 'Event Planning & Services']"], dtype=object)

In [78]:
X_test

array([ "We tried to go to a Pho place that was recommended to us by a friend but was closed. Did a quick yelp search and found this place that had just opened. \n\nIt was pretty empty at 1pm with only 4 tables occupied. Service was good except the kitchen messed up on our orders which is hard to believe as, with most Pho places, it's order by number. No biggie, the fixed it. \n\nOn to the Pho itself. First the size. The regular and large are not much different and the large is smaller then most place I've gone. The broth itself was OK but not anything special. The meat was lacking, not in quality but quantity. Overall it was a decent bowl of Pho but not something I would drive to in order to have. If I was in the area and needed a quick Pho fix, sure, but nothing more.",
       "Went to Tacos Jaliscos today and ordered Pozole. Was told they were out of it. So I ordered the tacos at $1 each. I ordered 2 de pastor and 2 of lengua. I must say that they were not fresh at all and they did 

In [79]:
#Micro has implications -- better to predict and hit occasionally than to never predict at all
f1 = f1_score(y_test,predicted, average ='micro')
hamming = hamming_loss(y_test, predicted)
jaccard = jaccard_similarity_score(y_test, predicted)

In [80]:
print("f1: {}".format(f1))
print("hamming_loss: {}".format(hamming))
print("jaccard_similarity_score: {}".format(jaccard))



f1: 0.27102803738317754
hamming_loss: 0.7289719626168224
jaccard_similarity_score: 0.27102803738317754
