In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the entries found in the input directory.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

In [None]:
import inspect

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing, decomposition, svm, pipeline, metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


# Read the training and test sets from the input directory.
train = pd.read_json('../input/train.json')
test = pd.read_json('../input/test.json')

# Collapse every row's collection of ingredient strings into one
# comma-separated string, first for the training frame, then for the test frame.
for frame in (train, test):
    frame['ingredients'] = frame['ingredients'].map(", ".join)

#To find logloss
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like containing the actual target classes, either a
        1-D sequence of integer class indices or an already one-hot encoded
        2-D matrix with one column per class
    :param predicted: Array-like matrix with class predictions, one
        probability per class (rows are samples, columns are classes)
    :param eps: Probabilities are clipped to [eps, 1 - eps] before the
        logarithm is taken, so log(0) is never evaluated
    :return: The negative log-likelihood averaged over the rows of `actual`
    :raises ValueError: If `actual` contains no samples
    """
    # Accept plain lists/tuples as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    rows = actual.shape[0]
    if rows == 0:
        raise ValueError("multiclass_logloss requires at least one sample")

    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((rows, predicted.shape[1]))
        # Set column `actual[i]` of row `i` to 1 for every row at once.
        one_hot[np.arange(rows), actual] = 1
        actual = one_hot

    clip = np.clip(predicted, eps, 1 - eps)
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota


#Use label encoders to convert text labels into integers

lblencdr=preprocessing.LabelEncoder()
y_cuisine=lblencdr.fit_transform(train['cuisine'].values)
# Hold out 10% of the rows for validation. The stratify option makes
# train_test_split keep the class proportions of y_cuisine in both parts,
# which is particularly helpful for unbalanced datasets.
X_train, X_val, y_train, y_val = train_test_split(train['ingredients'].values, y_cuisine,
                                                  stratify=y_cuisine,
                                                  test_size=0.1,
                                                  shuffle=True,
                                                  random_state=0)

# Fit the TF-IDF vocabulary and IDF weights on the training split only, so
# that no information from the held-out validation rows leaks into the
# features and the validation score stays an honest estimate.
vect=TfidfVectorizer().fit(X_train)
X_train_vect=vect.transform(X_train)
x_valid_vect=vect.transform(X_val)

#Grid Search Technique. Create a scorer function by using make_scorer
# Build a scorer around multiclass_logloss that is fed class probabilities.
# greater_is_better=False makes the scorer return the negated loss, so that
# GridSearchCV can maximise it.
# Newer scikit-learn releases select probability outputs through
# `response_method`, while older ones only understand `needs_proba`; use
# whichever keyword the installed make_scorer actually declares.
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    mll_scorer=metrics.make_scorer(multiclass_logloss, greater_is_better=False,
                                   response_method='predict_proba')
else:
    mll_scorer=metrics.make_scorer(multiclass_logloss, greater_is_better=False,
                                   needs_proba=True)

#Create a pipeline
# The SVD step is seeded so repeated runs give the same result.
svd=TruncatedSVD(random_state=0)

#Initialize the standard scaler
scl=preprocessing.StandardScaler()

#logistic Regression
# The grid below searches both 'l1' and 'l2' penalties. The 'saga' solver
# supports both, whereas the 'lbfgs' solver (the default in recent
# scikit-learn releases) rejects 'l1' and would make those candidates fail.
lr=LogisticRegression(solver='saga', random_state=0)

clf=pipeline.Pipeline([('svd', svd),
                      ('scl', scl),
                      ('lr', lr)]
                     )

#create a grid of parameters

param_grid={'svd__n_components': [120, 180],
            'lr__C': [0.1, 1.0, 10],
             'lr__penalty': ['l1', 'l2']}

# The former `iid` argument is not passed: it has been removed from
# GridSearchCV and passing it raises a TypeError on current releases.
model=GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                  verbose=10, n_jobs=-1, refit=True, cv=2)

model.fit(X_train_vect, y_train)

# The scorer was built with greater_is_better=False, so best_score_ is the
# negated log loss: values closer to zero are better.
print("Best score: %0.3f" % model.best_score_)
print("Best Parameters set:")
best_parameters=model.best_estimator_.get_params()

# Print only the hyper-parameters that were part of the search grid.
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Score the refitted best model on the held-out validation split, reusing the
# validation matrix that was already vectorised. f1_score takes the true
# labels first and the predictions second.
predictions=model.predict(x_valid_vect)
print(f1_score(y_val, predictions, average='micro'))
y_predict=model.predict(vect.transform(test['ingredients']))

# Map the integer predictions back to the original cuisine labels.
test['cuisine']=lblencdr.inverse_transform(y_predict)
test = test.sort_values('id' , ascending=True)

test[['id' , 'cuisine' ]].to_csv("submission.csv", index=False)