In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (cross_val_score, GridSearchCV,
                                     train_test_split, KFold)
kf = KFold(n_splits=4, shuffle=True)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [35]:
data = pd.read_json('train.json')

xtrain, xval, ytrain, yval = train_test_split(data['ingredients'].apply(', '.join), data.cuisine, test_size=0.2)

In [36]:
from sklearn.dummy import DummyClassifier

dum = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', DummyClassifier())])

cv = cross_val_score(dum, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Mean score: 0.10273739700954909
Std Dev:    0.0031153973588285773


In [37]:
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', MultinomialNB())
])

param_grid = [
    {
        'tfidf__max_df': [.1],#np.arange(.01,.10,.01),
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
        'classify__alpha': [.01, .1, .2],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.1, max_features=None, min_df=2,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...       vocabulary=None)), ('classify', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])
Mean score: 0.7387094666108881
Std Dev:    0.002910068556502416


In [38]:
from xgboost import XGBClassifier

pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', XGBClassifier())
])

param_grid = [
    {
        'tfidf__max_df': [.1],#np.arange(.01,.10,.01),
        'tfidf__min_df': [2],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.1, max_features=None, min_df=2,
        ngram_range=(1, 2), norm='l1', preprocessor=None, smooth_idf=True,
 ...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])


  if diff:
  if diff:
  if diff:


Mean score: 0.7178729730836029
Std Dev:    0.0027498731436325485


  if diff:


In [25]:
df = pd.DataFrame()

for ix, lst in enumerate(data['ingredients']):
    df.at[ix, 'cuisine'] = data.iloc[ix]['cuisine']
    for ingred in lst:
        df.at[ix, ingred] = 1
        
df = df.fillna(0)
x = df.drop('cuisine', axis=1)
y = df['cuisine']

KeyboardInterrupt: 

In [30]:
pl = Pipeline([
    ('classify', MultinomialNB())
])

param_grid = [
    {
    },
]

grid =\
GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(x, y)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, x, y, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('classify', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
Mean score: 0.5674882629107982
Std Dev:    0.028815689430025618


In [34]:
from sklearn.linear_model import LogisticRegression
pl = Pipeline([
    ('classify', LogisticRegression())
])

param_grid = [
    {
        'classify__penalty': ['l1', 'l2'],
        'classify__C': [10, 100, 1000],
    },
]

grid =\
GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(x, y)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, x, y, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('classify', LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
Mean score: 0.5927230046948357
Std Dev:    0.01179562866328743


In [29]:
pl = Pipeline([
    ('classify', MultinomialNB())
])

param_grid = [
    {
    },
]

grid =\
GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

KeyboardInterrupt: 

In [6]:
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', MultinomialNB())
])

param_grid = [
    {
        'tfidf__max_df': np.arange(.01,.10,.01),
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
        'classify__alpha': [.01, .1, .2],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression

pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', LogisticRegression())
])

param_grid = [
    {
        'tfidf__max_df': [.04],#np.arange(.02,.8,.01),
        'tfidf__min_df': [1],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l2'],
        'classify__penalty': ['l2'],
        'classify__C': [100000]
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_lr = grid.best_estimator_
print(model_lr)
cv = cross_val_score(model_lr, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

In [None]:
from sklearn.linear_model import SGDClassifier

pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', SGDClassifier())
])

param_grid = [
    {
        'tfidf__max_df': [.04],#np.arange(.02,.8,.01),
        'tfidf__min_df': [1],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l2'],
        'classify__loss': ['hinge', 'log', 'modified_huber'],
        'classify__penalty': ['l1', 'l2', 'elasticnet'],
        'classify__l1_ratio': np.arange(.1, 1.1, .1)
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_lr = grid.best_estimator_
print(model_lr)
cv = cross_val_score(model_lr, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

In [150]:
model_lr.named_steps['classify']

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.90000000000000002,
       learning_rate='optimal', loss='modified_huber', max_iter=5,
       n_iter=None, n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [None]:
from xgboost import XGBClassifier

pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', XGBClassifier())
])

param_grid = [
    {
        'tfidf__max_df': np.arange(.01,.10,.01),
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
        'classify__max_depth': [2,3,4],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_xg = grid.best_estimator_
print(model_xg)
cv = cross_val_score(model_xg, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

# Improving data quality

In [234]:
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', MultinomialNB())
])

param_grid = [
    {
        'tfidf__max_df': np.arange(.01,.10,.01),
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
        'classify__alpha': [.1],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(xtrain, ytrain)

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.080000000000000002, max_features=None,
        min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
 ...f=True, vocabulary=None)), ('classify', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])
Mean score: 0.840643504006
Std Dev:    0.00657897083178


In [235]:
from sklearn.metrics import accuracy_score
pred = model_nb.predict(xtest)
accuracy_score(ytest, pred)

0.95684371807967317

In [241]:
preds = model_nb.predict_proba(xtrain)
ytrain = pd.DataFrame(ytrain)

df = pd.concat([xtrain, ytrain, pd.DataFrame(preds)], axis=1)
ytrain = np.ravel(ytrain)

In [247]:
df.columns = ['text', 'author', 0, 1, 2]

In [248]:
# df.mean()+df.std()
df['good'] = np.where((df[0] > .82) | (df[1] > .70) | (df[2] > .726), 1, 0)

df = df[df['good'] == 1]
df = df.dropna()
df.shape

(11101, 6)

In [249]:
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', MultinomialNB())
])

param_grid = [
    {
        'tfidf__max_df': np.arange(.01,.10,.01),
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l1', 'l2'],
        'classify__alpha': [.1, .2],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(df['text'], df['author'])

model_nb = grid.best_estimator_
print(model_nb)
cv = cross_val_score(model_nb, xtrain, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.060000000000000005, max_features=None,
        min_df=2, ngram_range=(1, 2), norm='l1', preprocessor=None,
 ...f=True, vocabulary=None)), ('classify', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])
Mean score: 0.551427992308
Std Dev:    0.00987081164312


In [250]:
from sklearn.metrics import accuracy_score
pred = model_nb.predict(xtest)
accuracy_score(ytest, pred)

0.3664453524004086

# Embeddings without NN

In [196]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# set parameters:
max_features = 20000
maxlen = 400
embedding_dims = 50

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


# set parameters:
max_features = 20000
maxlen = 400
embedding_dims = 50


# Fit tokenizer
x = train_all.text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)

# Tokenize training and validation data
sequences = tokenizer.texts_to_sequences(xtrain)
sequences_test = tokenizer.texts_to_sequences(xval)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
data_test = pad_sequences(sequences_test, maxlen=maxlen)


print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', ytrain.shape)

print('Shape of data tensor:', data_test.shape)
print('Shape of label tensor:', ytest.shape)

Found 25943 unique tokens.
Shape of data tensor: (15663, 400)
Shape of label tensor: (15663,)
Shape of data tensor: (3916, 400)
Shape of label tensor: (3916,)


In [197]:
from xgboost import XGBClassifier

pl = Pipeline([
    ('classify', XGBClassifier())
])

param_grid = [
    {
        'classify__max_depth': [2,3,4],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(data, ytrain)

model_xg = grid.best_estimator_
print(model_xg)
cv = cross_val_score(model_xg, data, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('classify', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])
Mean score: 0.484454303463
Std Dev:    0.00582836047196


In [199]:
from xgboost import XGBClassifier

pl = Pipeline([
    ('classify', LogisticRegression())
])

param_grid = [
    {
        'classify__C': [.01, .1, 1, 10, 100],
    },
]

grid =\
GridSearchCV(pl, cv=kf, n_jobs=-1, param_grid=param_grid, scoring='accuracy')\
.fit(data, ytrain)

model_xg = grid.best_estimator_
print(model_xg)
cv = cross_val_score(model_xg, data, ytrain, cv=kf, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

Pipeline(memory=None,
     steps=[('classify', LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
Mean score: 0.402923950861
Std Dev:    0.00479758507355


# Convolutional Neural Network

In [39]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# set parameters:
max_features = 20000
maxlen = 400
embedding_dims = 50

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Fit tokenizer
x = train_all.ingredients
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)

# Tokenize training and validation data
sequences = tokenizer.texts_to_sequences(xtrain)
sequences_test = tokenizer.texts_to_sequences(xval)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
data_test = pad_sequences(sequences_test, maxlen=maxlen)

labels = np.array(pd.get_dummies(ytrain))
labels_test = np.array(pd.get_dummies(yval))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

print('Shape of data tensor:', data_test.shape)
print('Shape of label tensor:', labels_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 6714 unique tokens.
Shape of data tensor: (31819, 400)
Shape of label tensor: (31819, 20)
Shape of data tensor: (7955, 400)
Shape of label tensor: (7955, 20)


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Fit tokenizer
x = train_all.text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)

# Tokenize training and validation data
sequences = tokenizer.texts_to_sequences(xtrain)
sequences_test = tokenizer.texts_to_sequences(xval)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
data_test = pad_sequences(sequences_test, maxlen=maxlen)

labels = np.array(pd.get_dummies(ytrain))
labels_test = np.array(pd.get_dummies(yval))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

print('Shape of data tensor:', data_test.shape)
print('Shape of label tensor:', labels_test.shape)

In [40]:
x_train, x_test, y_train, y_test =\
train_test_split(data, labels, test_size=.2)

In [41]:
batch_size = 64
epochs = 5
filters = 512
kernel_size = 3
hidden_dims = 250

from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_acc', patience=0)

model_cnn = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model_cnn.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model_cnn.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model_cnn.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

model_cnn.add(GlobalMaxPooling1D())

# Fully connected layer
model_cnn.add(Dense(hidden_dims))
model_cnn.add(Dropout(0.2))
model_cnn.add(Activation('relu'))

model_cnn.add(Dense(3))
model_cnn.add(Activation('softmax'))

model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_cnn.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs, callbacks=[es],
          validation_data=(x_test, y_test))

TypeError: softmax() got an unexpected keyword argument 'axis'

In [34]:
# Not necessary -- we included validation data while fitting the model

# score, acc = model_cnn.evaluate(x_test, y_test,
#                             batch_size=batch_size)
# print('Test score:', score)
# print('Test accuracy:', acc)

Test accuracy: 0.784039900428


# VDCNN

In [180]:
batch_size = 64
epochs = 5
filters = 1024
kernel_size = 3
hidden_dims = 250

from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_acc', patience=0)

model_cnn = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model_cnn.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model_cnn.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
for i in range(6):
    model_cnn.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model_cnn.add(Dropout(0.2))
    model_cnn.add(MaxPool1D(pool_size=2, strides=2))

model_cnn.add(GlobalMaxPooling1D())

# Fully connected layer
model_cnn.add(Dense(hidden_dims))
model_cnn.add(Dropout(0.2))
model_cnn.add(Activation('relu'))

model_cnn.add(Dense(3))
model_cnn.add(Activation('softmax'))

model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_cnn.summary()
# model_cnn.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs, callbacks=[es],
#           validation_data=(x_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 400, 50)           1000000   
_________________________________________________________________
dropout_125 (Dropout)        (None, 400, 50)           0         
_________________________________________________________________
conv1d_108 (Conv1D)          (None, 398, 1024)         154624    
_________________________________________________________________
dropout_126 (Dropout)        (None, 398, 1024)         0         
_________________________________________________________________
max_pooling1d_50 (MaxPooling (None, 199, 1024)         0         
_________________________________________________________________
conv1d_109 (Conv1D)          (None, 197, 1024)         3146752   
_________________________________________________________________
dropout_127 (Dropout)        (None, 197, 1024)         0         
__________

In [160]:
import tensorflow as tf

from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense, Dropout, Lambda
from keras.layers.pooling import MaxPooling1D
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers.normalization import BatchNormalization
from keras.layers import Activation


class ConvBlockLayer(object):
    """
    two layer ConvNet. Apply batch_norm and relu after each layer
    """

    def __init__(self, input_shape, num_filters):
        self.model = Sequential()
        # first conv layer
        self.model.add(Conv1D(filters=num_filters, kernel_size=3, strides=1, padding="same", input_shape=input_shape))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))

        # second conv layer
        self.model.add(Conv1D(filters=num_filters, kernel_size=3, strides=1, padding="same"))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))

    def __call__(self, inputs):
        return self.model(inputs)
import numpy as np


ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'\"/\\|_@#$%^&*~`+ =<>()[]{}"  # len: 69
FEATURE_LEN = 512


def get_char_dict():
    cdict = {}
    for i, c in enumerate(ALPHABET):
        cdict[c] = i + 2

    return cdict


def get_comment_ids(text, max_length=FEATURE_LEN):
    array = np.ones(max_length)
    count = 0
    cdict = get_char_dict()

    for ch in text:
        if ch in cdict:
            array[count] = cdict[ch]
            count += 1

        if count >= FEATURE_LEN - 1:
            return array

    return array


def to_categorical(y, nb_classes=None):
    y = np.asarray(y, dtype='int32')

    if not nb_classes:
        nb_classes = np.max(y) + 1

    Y = np.zeros((len(y), nb_classes))
    for i in range(len(y)):
        Y[i, y[i]] = 1.

    return Y


def get_conv_shape(conv):
    return conv.get_shape().as_list()[1:]   

def build_model(num_filters, num_classes, sequence_max_length=512,
                num_quantized_chars=71, embedding_size=16, learning_rate=0.001,
                top_k=3, model_path=None):

    inputs = Input(shape=(sequence_max_length, ), dtype='int32', name='inputs')

    embedded_sent = Embedding(num_quantized_chars, embedding_size, input_length=sequence_max_length)(inputs)

    # First conv layer
    conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sent)

    # Each ConvBlock with one MaxPooling Layer
    for i in range(len(num_filters)):
        conv = ConvBlockLayer(get_conv_shape(conv), num_filters[i])(conv)
        conv = MaxPooling1D(pool_size=3, strides=2, padding="same")(conv)

    # k-max pooling (Finds values and indices of the k largest entries for the last dimension)
    def _top_k(x):
        x = tf.transpose(x, [0, 2, 1])
        k_max = tf.nn.top_k(x, k=top_k)
        return tf.reshape(k_max[0], (-1, num_filters[-1] * top_k))
    k_max = Lambda(_top_k, output_shape=(num_filters[-1] * top_k,))(conv)

    # 3 fully-connected layer with dropout regularization
    fc1 = Dropout(0.2)(Dense(512, activation='relu', kernel_initializer='he_normal')(k_max))
    fc2 = Dropout(0.2)(Dense(512, activation='relu', kernel_initializer='he_normal')(fc1))
    fc3 = Dense(num_classes, activation='softmax')(fc2)

    # define optimizer
    sgd = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=False)
    model = Model(inputs=inputs, outputs=fc3)
    model.compile(optimizer=sgd, loss='mean_squared_error', metrics=['accuracy'])

    if model_path is not None:
        model.load_weights(model_path)

    return model

In [162]:
model = build_model(num_filters=256, num_classes=3)

TypeError: object of type 'int' has no len()

# Recurrent Neural Network

In [18]:
from keras.layers import LSTM

model_rnn = Sequential()
model_rnn.add(Embedding(max_features, 128))
model_rnn.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Fully connected layer
model_rnn.add(Dense(128))
model_rnn.add(Dropout(0.2))
model_rnn.add(Activation('relu'))

model_rnn.add(Dense(3, activation='softmax'))

model_rnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_rnn.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15, callbacks=[es],
          validation_data=(x_test, y_test))

Train...
Train on 12530 samples, validate on 3133 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15


<keras.callbacks.History at 0x121816b3c88>

In [None]:
# score, acc = model_rnn.evaluate(x_test, y_test,
#                             batch_size=batch_size)
# print('Test score:', score)
# print('Test accuracy:', acc)

In [19]:
# TODO:
# Ensemble the 3 models

from sklearn.metrics import accuracy_score

In [20]:
pred_nb = model_nb.predict(xval)
pred_cnn = model_cnn.predict_classes(data_test)
pred_rnn = model_rnn.predict_classes(data_test)



In [21]:
pred_cnn_oh = []
for i in pred_cnn:
    if i == 0:
        pred_cnn_oh.append([1,0,0])
    elif i == 1:
        pred_cnn_oh.append([0,1,0])
    else:
        pred_cnn_oh.append([0,0,1])
        
accuracy_score(labels_test, np.array(pred_cnn_oh))

0.30132788559754853

In [22]:
pred_rnn_oh = []
for i in pred_rnn:
    if i == 0:
        pred_rnn_oh.append([1,0,0])
    elif i == 1:
        pred_rnn_oh.append([0,1,0])
    else:
        pred_rnn_oh.append([0,0,1])
        
accuracy_score(labels_test, np.array(pred_rnn_oh))

0.81154239019407559

In [23]:
rnn = np.array(pred_rnn_oh)
cnn = np.array(pred_cnn_oh)
nb = np.array(pred_nb)

In [24]:
ensemble_pred = rnn*.3 + cnn*.3 + .4*pd.get_dummies(nb)

pred_final = pd.DataFrame()
pred_final['EAP'] = np.where(ensemble_pred['EAP'] > .5, 1, 0)
pred_final['HPL'] = np.where(ensemble_pred['HPL'] > .5, 1, 0)
pred_final['MWS'] = np.where(ensemble_pred['MWS'] > .5, 1, 0)

In [25]:
preds = []
for i in np.array(pred_final):
    if i[0] == 1:
        preds.append("EAP")
    elif i[1] == 1:
        preds.append("HPL")
    else:
        preds.append("MWS")

In [27]:
comp = pd.DataFrame()

comp['xg'] = model_nb.predict(xval)
comp['ensemble_pred'] = pd.Series(preds)

comp.head(20)

Unnamed: 0,xg,ensemble_pred
0,EAP,EAP
1,MWS,MWS
2,HPL,HPL
3,HPL,HPL
4,EAP,EAP
5,HPL,HPL
6,MWS,MWS
7,HPL,HPL
8,EAP,MWS
9,MWS,MWS


In [93]:
sequences_pred = tokenizer.texts_to_sequences(test.text)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_pred = pad_sequences(sequences_pred, maxlen=maxlen)

Found 19743 unique tokens.


In [96]:
pred_xg_2 = model_xg.predict(test.text)
pred_cnn_2 = model_cnn.predict_classes(data_pred)
pred_rnn_2 = model_rnn.predict_classes(data_pred)



In [97]:
pred_cnn_oh_2 = []
for i in pred_cnn_2:
    if i == 0:
        pred_cnn_oh_2.append([1,0,0])
    elif i == 1:
        pred_cnn_oh_2.append([0,1,0])
    else:
        pred_cnn_oh_2.append([0,0,1])

In [98]:
pred_rnn_oh_2 = []
for i in pred_rnn_2:
    if i == 0:
        pred_rnn_oh_2.append([1,0,0])
    elif i == 1:
        pred_rnn_oh_2.append([0,1,0])
    else:
        pred_rnn_oh_2.append([0,0,1])

In [121]:
rnn2 = np.array(pred_rnn_oh_2)
cnn2 = np.array(pred_cnn_oh_2)
xg2 = np.array(pred_xg_2)

ensemble_pred2 = rnn2*.0 + cnn2*.0 + 1*pd.get_dummies(xg2)

pred_final2 = pd.DataFrame()
pred_final2['EAP'] = np.where(ensemble_pred2['EAP'] > .5, 1, 0)
pred_final2['HPL'] = np.where(ensemble_pred2['HPL'] > .5, 1, 0)
pred_final2['MWS'] = np.where(ensemble_pred2['MWS'] > .5, 1, 0)

In [122]:
pred_final2['id'] = test['id']
pred_final2 = pred_final2.set_index('id')
pred_final2

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0,0,1
id24541,1,0,0
id00134,0,1,0
id27757,0,1,0
id04081,1,0,0
id27337,1,0,0
id24265,1,0,0
id25917,0,0,1
id04951,1,0,0
id14549,1,0,0


In [123]:
pred_final2.to_csv('preds_vfavilla.csv')