In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

import initialize

In [2]:
# Read both dataset splits from the working directory, then preview the
# training frame as the cell's rich output.
train, test = (pd.read_json(path) for path in ('./train.json', './test.json'))
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
# train_test_split lives in sklearn.model_selection in current scikit-learn; the
# old sklearn.cross_validation module no longer exists there. Try the current
# location first and fall back so very old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics



In [4]:
# Assign every cuisine label an integer id, numbered in order of first
# appearance in the training frame. setdefault leaves already-seen labels alone,
# so the id handed out is always the number of labels seen so far.
dict_cuisine = {}
for cuisine in train['cuisine']:
    dict_cuisine.setdefault(cuisine, len(dict_cuisine))
# Next unused id, i.e. the number of distinct cuisines.
ind = len(dict_cuisine)

In [355]:
%%time
data = initialize.create_subParams(train, 1000, 80, 1200)
data = data.drop(['id'], axis=1)
data['ingredients'] = data['ingredients'].apply(lambda d: len(d))
data['cuisine'] = data['cuisine'].apply(lambda cuisine: dict_cuisine[cuisine])

cuisine_train, cuisine_test, y_train, y_test = train_test_split(data.drop(['cuisine'], axis=1), data.loc[:, 'cuisine'])

clf = LogisticRegression(C=2)
clf.fit(cuisine_train, y_train)
print(metrics.accuracy_score(y_test, clf.predict(cuisine_test)))

0.587564042850489
CPU times: user 5min 17s, sys: 901 ms, total: 5min 18s
Wall time: 5min 18s


In [27]:
%%time
new_df = train.loc[initialize.create_subData(1000), :]
new_df['ingredients'] = new_df['ingredients'].apply(lambda x: ' '.join(x))
new_df['cuisine'] = new_df['cuisine'].apply(lambda cuisine: dict_cuisine[cuisine])

cuisine_train, cuisine_test, y_train, y_test = train_test_split(new_df['ingredients'], new_df['cuisine'])

vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
df_train = vect.fit_transform(cuisine_train)
df_test  = vect.transform(cuisine_test)

clf = LogisticRegression(C=2, penalty='l1')
clf.fit(df_train, y_train)
print(metrics.accuracy_score(y_test, clf.predict(df_test)))

0.7172799254774104
CPU times: user 4.75 s, sys: 53 µs, total: 4.75 s
Wall time: 4.75 s


In [28]:
# Number of fitted coefficients whose magnitude exceeds 1e-4.
print((np.abs(clf.coef_) > 1e-4).sum())

4970


In [11]:
%%time
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=400)
X_train_pca = tsvd.fit_transform(df_train)
X_test_pca = tsvd.transform(df_test)

clf = LogisticRegression(C=0.5)
clf.fit(X_train_pca, y_train)
print(metrics.accuracy_score(y_test, clf.predict(X_test_pca)))

0.6832789939450395
CPU times: user 25.9 s, sys: 1.01 s, total: 26.9 s
Wall time: 23.2 s


In [31]:
print(np.shape(df_train[:, rlg.scores_ > 0]))

(12879, 34)


In [35]:
%%time
from sklearn.linear_model import RandomizedLogisticRegression
rlg = RandomizedLogisticRegression(C=0.55)
rlg.fit(df_train, y_train)
print(np.shape(df_train[:, rlg.scores_ > 0]))



(12879, 525)
CPU times: user 1min 55s, sys: 104 ms, total: 1min 55s
Wall time: 1min 55s


In [36]:
# Keep only the columns the randomized selector scored above zero, using the
# same mask for the training and test matrices.
selected = rlg.scores_ > 0
X_train_lasso, X_test_lasso = df_train[:, selected], df_test[:, selected]

# Refit a plain logistic regression on the reduced matrices and report the
# hold-out accuracy.
clf = LogisticRegression(C=1).fit(X_train_lasso, y_train)
print(metrics.accuracy_score(y_test, clf.predict(X_test_lasso)))

0.6797857475547275
