In [20]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# joblib is a standalone package; the copy vendored under sklearn.externals
# was removed in scikit-learn 0.23. Fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import initialize

In [21]:
# Load the labelled and unlabelled recipe sets from the working directory,
# then preview the first rows of the labelled one.
train, test = (pd.read_json(path) for path in ('./train.json', './test.json'))
train.head(5)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [22]:
# Map every cuisine name to a consecutive integer id (0, 1, 2, ...), numbered
# in order of first appearance in the training frame. Series.unique() returns
# values in order of appearance, so the ids match a first-seen scan of the column.
dict_cuisine = {
    cuisine: ind for ind, cuisine in enumerate(train['cuisine'].unique())
}

In [23]:
%%time
new_df = train.loc[initialize.create_subData(9000), :]

new_df['ingredients'] = new_df['ingredients'].apply(
    lambda ingredients: initialize.hash_data(ingredients))

new_df['ingredients'] = new_df['ingredients'].apply(lambda x: ' '.join(x))
new_df['cuisine'] = new_df['cuisine'].apply(lambda cuisine: dict_cuisine[cuisine])

cuisine_train, cuisine_test, y_train, y_test = train_test_split(new_df['ingredients'], new_df['cuisine'], test_size=0.1)

vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
df_train = vect.fit_transform(cuisine_train)

df_test  = vect.transform(cuisine_test)

print('shape train: ', np.shape(df_train))

shape train:  (35796, 6547)
CPU times: user 2.75 s, sys: 24 ms, total: 2.77 s
Wall time: 2.77 s


In [24]:
%%time
clf = LogisticRegression(C=10, penalty='l2', dual=False)
clf.fit(df_train, y_train)
print(metrics.classification_report(y_test, clf.predict(df_test)))

             precision    recall  f1-score   support

          0       0.78      0.62      0.69       132
          1       0.68      0.75      0.71       411
          2       0.75      0.62      0.68        79
          3       0.89      0.93      0.91       313
          4       0.78      0.68      0.73        53
          5       0.61      0.41      0.49       110
          6       0.82      0.90      0.86       815
          7       0.86      0.94      0.90       588
          8       0.76      0.85      0.80       248
          9       0.66      0.49      0.56        85
         10       0.79      0.78      0.78       157
         11       0.71      0.57      0.63        77
         12       0.84      0.74      0.79       192
         13       0.79      0.46      0.58        41
         14       0.63      0.60      0.61       283
         15       0.86      0.75      0.80       131
         16       0.63      0.52      0.57        65
         17       0.84      0.77      0.80   

In [25]:
# %%time
# svmс = LinearSVC(C=0.9, penalty='l1', dual=False)
# svmс.fit(df_train, y_train)
# print(metrics.classification_report(y_test, svmс.predict(df_test)))
# (Disabled experiment: L1-penalised LinearSVC alternative, kept commented out.)

In [27]:
%%time
svmс = SVC(C=100, # penalty parameter
	 			 kernel='rbf', # kernel type, rbf working fine here
	 			 degree=3, # default value
	 			 gamma=1, # kernel coefficient
	 			 coef0=1, # change to 1 from default value of 0.0
	 			 shrinking=True, # using shrinking heuristics
	 			 tol=0.001, # stopping criterion tolerance 
	      		 probability=True, 
	      		 cache_size=200, # 200 MB cache size
	      		 class_weight=None, # all classes are treated equally 
	      		 verbose=False, # print the logs 
	      		 max_iter=-1, # no limit, let it run
          		 decision_function_shape=None, # will use one vs rest explicitly 
          		 random_state=None)
svmс.fit(df_train, y_train)
print(metrics.classification_report(y_test, svmс.predict(df_test)))

             precision    recall  f1-score   support

          0       0.81      0.62      0.70       132
          1       0.67      0.81      0.73       411
          2       0.80      0.61      0.69        79
          3       0.91      0.92      0.91       313
          4       0.90      0.68      0.77        53
          5       0.66      0.43      0.52       110
          6       0.80      0.91      0.85       815
          7       0.88      0.93      0.91       588
          8       0.79      0.87      0.83       248
          9       0.77      0.40      0.53        85
         10       0.81      0.80      0.81       157
         11       0.75      0.58      0.66        77
         12       0.82      0.72      0.77       192
         13       0.80      0.39      0.52        41
         14       0.63      0.64      0.64       283
         15       0.89      0.73      0.80       131
         16       0.71      0.54      0.61        65
         17       0.87      0.78      0.82   

In [28]:
# Persist the fitted SVM, the logistic regression and the TF-IDF vectorizer to
# the working directory. joblib is imported in the notebook's import cell
# (sklearn.externals.joblib no longer exists in current scikit-learn).
# The last dump is the cell's final expression, so its return value (the list
# of written file names) is displayed as the cell output.
joblib.dump(svmс, 'svmс1.pkl')
joblib.dump(clf, 'clf1.pkl')
joblib.dump(vect, 'vect1.pkl')
# svmc = joblib.load('svmс1.pkl')
# clf = joblib.load('clf1.pkl')

['vect1.pkl']

In [29]:
# Soft-voting ensemble: weighted average of the class-probability matrices of
# the logistic regression and the SVM, evaluated on the hold-out split.
LOGREG_WEIGHT, SVM_WEIGHT = 0.2, 0.8
mean_proba = (LOGREG_WEIGHT*clf.predict_proba(df_test) + SVM_WEIGHT*svmс.predict_proba(df_test))

# predict_proba columns are ordered like each estimator's classes_ attribute.
# The two matrices can only be added column-wise if both models use the same
# class order, so check that before blending the result into labels.
assert np.array_equal(clf.classes_, svmс.classes_), "models disagree on class order"

# Translate the winning column index of every row back into a class label via
# classes_, rather than assuming the column index equals the label.
pred_label = clf.classes_[mean_proba.argmax(axis=1)]
print(metrics.classification_report(y_test, pred_label))

             precision    recall  f1-score   support

          0       0.77      0.69      0.73       132
          1       0.72      0.79      0.75       411
          2       0.76      0.67      0.71        79
          3       0.91      0.91      0.91       313
          4       0.82      0.75      0.78        53
          5       0.61      0.48      0.54       110
          6       0.83      0.89      0.86       815
          7       0.90      0.93      0.92       588
          8       0.78      0.85      0.81       248
          9       0.71      0.53      0.61        85
         10       0.79      0.80      0.79       157
         11       0.69      0.62      0.65        77
         12       0.81      0.75      0.78       192
         13       0.78      0.61      0.68        41
         14       0.65      0.65      0.65       283
         15       0.87      0.76      0.81       131
         16       0.64      0.54      0.58        65
         17       0.85      0.81      0.83   