In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    

In [35]:
train =  pd.read_csv("train.csv", index_col=None)
#train =  pd.read_csv("train_wo_ZeroLabels.csv", index_col=None)

In [36]:
len(train)

229907

In [37]:
train = train[['text','categories']]


In [38]:

#subset data to test model
train = train.sample(frac=0.1)
len(train)

22991

In [39]:
X_train = np.array(train['text'].values.astype('U'))
y_train =  np.array(train['categories'])

In [40]:
classifier = Pipeline([
    #Convert a collection of text documents to a matrix of token counts
    ('vectorizer', CountVectorizer(ngram_range=(1,1))),
    #TFIDF
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])


In [41]:
classifier.fit(X_train,y_train )



Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [42]:
#Save model
import pickle
pickle.dump(classifier, open('OneVsRestClassifier_10%subset', 'wb'))

In [43]:
# load the model from disk
#classifier = pickle.load(open('OneVsRestClassifier_10%subset', 'rb'))


In [51]:
test =  pd.read_csv("test.csv", index_col=None)
#test =  pd.read_csv("test_wo_ZeroLabels.csv", index_col=None)
print(len(test))
test = test[['text','categories']]
#subset data to test model
#test = test.sample(frac=0.1)
print(len(test))

X_test = np.array(test['text'].values.astype('U'))
y_test =  np.array(test['categories'])


10702
10702


In [52]:
predicted = classifier.predict(X_test)



In [53]:
predicted

array(["['Burgers', 'Bars', 'American (New)', 'Nightlife', 'Restaurants']",
       "['Fast Food', 'Restaurants']",
       "['Burgers', 'Fast Food', 'Restaurants']", ...,
       "['Breakfast & Brunch', 'Restaurants']",
       "['Breakfast & Brunch', 'Restaurants']",
       "['American (Traditional)', 'Restaurants']"],
      dtype='<U180')

In [54]:
y_test

array(["['Burgers', 'Fast Food', 'Restaurants']",
       "['Burgers', 'Fast Food', 'Restaurants']",
       "['Burgers', 'Fast Food', 'Restaurants']", ...,
       "['Breakfast & Brunch', 'American (Traditional)', 'Restaurants']",
       "['Breakfast & Brunch', 'American (Traditional)', 'Restaurants']",
       "['Breakfast & Brunch', 'American (Traditional)', 'Restaurants']"], dtype=object)

In [55]:
X_test

array([ "I try not to eat this type of food because I don't want to be so unhealthy but sometimes I just don't have time and I have to grab something quick. I always had issues with this Mcdonald's and the others around the area that are also under the same management, you know the usual from Fast food restaurants, they give you the wrong orders at the drive-thru, they forget ketchup, rude but the biggest problem I had here was a few months ago, they were trying to charge me for extra ketchup, I was where is there a sign where it says that I have to pay extra for a package of ketchup? after that they made a rude comment and gave me the ketchup, I complained directly to management and a few days later they put signs in their locations that say they will charge for your condiments.   I don't know if all Mcdonald's are like that but the ones own by the owner of the ones in the area are so cheap, when did I have to pay extra just because I wanted 3 packs of ketchup for a meal of 6 bucks? t

In [56]:
#Micro has implications -- better to predict and hit occasionally than to never predict at all
f1 = f1_score(y_test,predicted, average ='micro')
recall =  recall_score(y_test, predicted, average= 'micro')
precission =precision_score(y_test, predicted, average='micro')

In [57]:
print("f1: {}".format(f1))
print("precission: {}".format(precission))
print("recall: {}".format(recall))

f1: 0.2876097925621379
precission: 0.2876097925621379
recall: 0.2876097925621379
