# Import the required libraries 

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import unidecode
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import json
import re
import numpy as np
from collections import defaultdict
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from tqdm import tqdm
from scipy.stats import uniform
tqdm.pandas()

# Dataset Preparation

In [2]:
print ("Read Dataset ... ")
def read_dataset(path):
    return json.load(open(path)) 

train1 = read_dataset('C:/Users/Tim/Desktop/tensorflow/bouffekaggle/all/train.json')
test1 = read_dataset('C:/Users/Tim/Desktop/tensorflow/bouffekaggle/all/test.json')

train = pd.read_json('C:/Users/Tim/Desktop/tensorflow/bouffekaggle/all/train.json')
test = pd.read_json('C:/Users/Tim/Desktop/tensorflow/bouffekaggle/all/test.json')

Read Dataset ... 


# Text Data Features

In [3]:
print ("Prepare text data of Train and Test ... ")
lemmatizer = WordNetLemmatizer()
def preprocess(ingredients):
    ingredients_text = ' '.join(ingredients)
    ingredients_text = ingredients_text.lower()
    ingredients_text = ingredients_text.replace('-', ' ')
    words = []
    for word in ingredients_text.split():
        if re.findall('[0-9]', word): continue
        if len(word) <= 2: continue
        if '’' in word: continue
        word = lemmatizer.lemmatize(word)
        if len(word) > 0: words.append(word)
    return ' '.join(words)
    
    
train['x'] = train['ingredients'].progress_apply(lambda ingredients: preprocess(ingredients))
test['x'] = test['ingredients'].progress_apply(lambda ingredients: preprocess(ingredients))

def generate_text(data):
    text_data = [" ".join(doc['ingredients']).lower() for doc in data]
    return text_data 

train_text = train['x']
test_text = test['x']

target = [doc['cuisine'] for doc in train1]

Prepare text data of Train and Test ... 


100%|██████████| 39774/39774 [00:06<00:00, 6399.81it/s]
100%|██████████| 9944/9944 [00:01<00:00, 8815.76it/s]


# Feature Engineering 

In [4]:
print ("TF-IDF on text data ... ")
tfidf = TfidfVectorizer(binary=True)
def tfidf_features(txt, flag):
    if flag == "train":
        x = tfidf.fit_transform(txt)
    else:
        x = tfidf.transform(txt)
    x = x.astype('float16')
    return x 
X = tfidf_features(train_text, flag="train")
X_test = tfidf_features(test_text, flag="test")

TF-IDF on text data ... 


# Label Encoding - Target 

In [5]:
print ("Label Encode the Target Variable ... ")
lb = LabelEncoder()
y = lb.fit_transform(target)

Label Encode the Target Variable ... 


# Model Training 

In [6]:
classifier = SVC(C=10, # penalty parameter
                 kernel='rbf', # kernel type, rbf working fine here
                 degree=3, # default value
                 gamma=1, # kernel coefficient
                 coef0=1, # change to 1 from default value of 0.0
                 shrinking=True, # using shrinking heuristics
                 tol=0.001, # stopping criterion tolerance 
                 probability=False, # no need to enable probability estimates
                 cache_size=200, # 200 MB cache size
                 class_weight=None, # all classes are treated equally 
                 verbose=False, # print the logs 
                 max_iter=-1, # no limit, let it run
                 decision_function_shape=None, # will use one vs rest explicitly 
                 random_state=None)
model = OneVsRestClassifier(classifier, n_jobs=1)

# Model Tuning 

In [None]:
parameters = {"estimator__C":[1, 10, 50, 100, 200, 500 ]}
grid_search = GridSearchCV(model, param_grid=parameters)

In [None]:
%%time
grid_search.fit(X, y)

In [7]:
parameters = {"estimator__C":uniform(5,100),"estimator__gamma":uniform(.5,1.5)}
rdgrid_search = RandomizedSearchCV(model, parameters,n_iter=20, n_jobs=-1)

In [8]:
%%time
rdgrid_search.fit(X,y)

Wall time: 9h 51min 49s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=OneVsRestClassifier(estimator=SVC(C=10, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'estimator__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D403DEC780>, 'estimator__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D403DF8518>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [9]:
pd.DataFrame(rdgrid_search.cv_results_)[['mean_test_score', 'std_test_score', 'params']]



Unnamed: 0,mean_test_score,std_test_score,params
0,0.812893,0.001436,"{'estimator__C': 87.8869082415457, 'estimator_..."
1,0.812365,0.001307,"{'estimator__C': 79.63970410647498, 'estimator..."
2,0.812365,0.001671,"{'estimator__C': 14.795878848663708, 'estimato..."
3,0.810152,0.001361,"{'estimator__C': 30.229530398020948, 'estimato..."
4,0.806557,0.002161,"{'estimator__C': 68.61484395427657, 'estimator..."
5,0.812717,0.00157,"{'estimator__C': 54.20638263758478, 'estimator..."
6,0.810152,0.002236,"{'estimator__C': 65.67823192676748, 'estimator..."
7,0.813044,0.001457,"{'estimator__C': 15.380734308876926, 'estimato..."
8,0.813069,0.001762,"{'estimator__C': 32.2730405438815, 'estimator_..."
9,0.812893,0.001743,"{'estimator__C': 40.92117750287818, 'estimator..."


In [11]:
print(rdgrid_search.best_score_)
print(rdgrid_search.best_params_)

0.813068838940011
{'estimator__C': 32.2730405438815, 'estimator__gamma': 1.630826177467243}


In [None]:
df=pd.DataFrame(grid_search.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
print(grid_search.best_score_)
print(grid_search.best_params_)
df.head(10)

In [None]:
print(grid_search.cv_results_['params'][0])
print(grid_search.cv_results_['mean_test_score'][0])

In [None]:
grid_mean_scores = grid_search.cv_results_['mean_test_score']
plt.plot([1, 10, 50, 100, 200, 500 ], grid_mean_scores)
plt.xlabel('Value of C')
plt.ylabel('Cross-Validated Accuracy')

In [None]:
%%time
model.fit(X,y)

# Predictions 

In [None]:
%%time
print ("Predict on test data ... ")
y_test = model.predict(X_test)
y_pred = lb.inverse_transform(y_test)

# Submission

In [None]:
print ("Generate Submission File ... ")
test_id = [doc['id'] for doc in test]
sub = pd.DataFrame({'id': test_id, 'cuisine': y_pred}, columns=['id', 'cuisine'])
sub.to_csv('svm_output.csv', index=False)