In [1]:
import numpy as np
import pandas as pd

# Reproducibility

In [2]:
import os
import random

# Single source of truth for every `random_state` used further down.
seed = 42

# Seed the global generators of the standard library and of NumPy.
random.seed(seed)
np.random.seed(seed)

# NOTE(review): the hash seed of the running interpreter is fixed at startup,
# so this presumably only affects processes spawned later — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Loading the data

In [3]:
import json

# Separator used to pack an ingredient list into one string (and to unpack it
# again when tokenising).
sep = ', '


def Dataset(path):
    """Load a recipe JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Path to a JSON file holding a list of records, each with an `id`,
        a list of `ingredients` and, optionally, a `cuisine` label.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns `id`, `X` (the ingredients joined
        with `sep`) and `y` (the cuisine, or `''` when the record has none).
    """
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 (the JSON interchange encoding) instead of relying on
    # the platform's locale default.
    with open(path, 'r', encoding='utf-8') as fp:
        records = json.load(fp)

    rows = [{
        'id': record['id'],
        'X': sep.join(record['ingredients']),
        'y': record.get('cuisine', '')
    } for record in records]

    # Explicit columns keep the schema stable even for an empty file.
    return pd.DataFrame(data=rows, columns=['id', 'X', 'y'])

def train_ds():
    """Return the training recipes read from `data/train.json`."""
    train_path = 'data/train.json'
    return Dataset(train_path)

def test_ds():
    """Return the test recipes read from `data/test.json`."""
    test_path = 'data/test.json'
    return Dataset(test_path)

# Model (`hybrid`)

Insofar as the model itself is concerned, we will first vectorize the ingredients into sparse features, then use a combination of SVD and "sparse" models for each cuisine class to construct a dense representation, and then train a "dense" model on this representation. I hope this doesn't qualify as "a primitive heuristic based on single ingredients" that we were supposed to avoid.

Other than that, there were a few other reasons behind this particular model:
- I principally wanted to experiment with "multi-stage" models;
- I wanted it to be trainable on my laptop without a GPU;
- I wanted to see whether such a (comparatively) simple model would yield good results.

I will also observe that I couldn't think of any _useful_ visualisation or graph for this task; the only "aesthetic" features in this notebook are the progress bars (from the `tqdm` library).

The score on Kaggle was 0.79877; I am not sure whether that is good or bad.

## Ingredient vectorization

Of course, we need to represent the text numerically; for that purpose, we will extract (lemmatized) words from each ingredient and take their 1- to 9-grams as features (the code iterates over `range(1, 10)`), together with the full ingredient string itself. This uses an "external source" (`nltk` and `wordnet` for lemmatization).

In [4]:
import nltk
# Fetch the WordNet corpus used for lemmatization below.
nltk.download('wordnet')

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

def concat(xs):
    """Flatten an iterable of iterables into a single set of their elements."""
    return set().union(*xs)

class Analyzer:
    """Callable that turns a joined ingredient string into a list of tokens.

    Intended to be passed as the `analyzer` of a `CountVectorizer`.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def analyze_one(self, ingredient):
        """Return the set of features for a single ingredient.

        The ingredient is lower-cased, split on single spaces and lemmatized
        word by word; the features are all word n-grams for n = 1..9 joined
        with a space, plus the ingredient string exactly as given.
        """
        lemmas = []
        for token in ingredient.lower().split(' '):
            lemmas.append(self.lemmatizer.lemmatize(token))

        grams = set()
        for size in range(1, 10):
            for gram in ngrams(lemmas, size):
                grams.add(' '.join(gram))

        return {*grams, ingredient}

    def __call__(self, ingredients_str):
        """Split on `sep`, analyze every ingredient and merge the results.

        Returns a list of unique tokens; its order is that of a set and is
        therefore unspecified.
        """
        per_ingredient = (self.analyze_one(item)
                          for item in ingredients_str.split(sep))
        return list(concat(per_ingredient))

[nltk_data] Downloading package wordnet to /home/talos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
def Vectorizer():
    """Build a `CountVectorizer` that tokenises with a fresh `Analyzer`."""
    analyzer = Analyzer()
    return CountVectorizer(analyzer=analyzer)

## Model fitting utils

We will use a number of helper functions to (hyper)fit the models.

In [6]:
from skopt import BayesSearchCV
from dataclasses import dataclass
from typing import Dict
from tqdm.auto import tqdm

def bayes_cv(est, search_spaces, **extras):
    """Wrap `est` in a `BayesSearchCV` with this notebook's default settings.

    Defaults are refit on, silent output, all cores, 5-fold CV and the global
    `seed`; any keyword in `extras` overrides the default of the same name or
    is forwarded as-is.
    """
    options = dict(refit=True, verbose=0, n_jobs=-1, cv=5, random_state=seed)
    options.update(extras)
    return BayesSearchCV(est, search_spaces, **options)

def nest(prefix, search_spaces):
    """Prefix every parameter name in `search_spaces` with `<prefix>__`.

    `search_spaces` is a sequence of `(grid, n_iter)` pairs where `grid` maps
    parameter names to search dimensions. The result has the same shape, with
    each name rewritten so that it addresses the pipeline step called `prefix`.
    """
    nested = []
    for grid, n_iter in search_spaces:
        prefixed = {}
        for name, value in grid.items():
            prefixed[f'{prefix}__{name}'] = value
        nested.append((prefixed, n_iter))
    return nested

def hyperfit(desc, cv, X, y=None):
    """Fit a hyperparameter search while displaying a progress bar.

    Parameters
    ----------
    desc : str
        Label shown next to the progress bar.
    cv : search object
        Must expose `total_iterations`, `fit(X, y, callback=...)` and, once
        fitted, `best_score_` (e.g. the result of `bayes_cv`).
    X, y
        Training data forwarded unchanged to `cv.fit`.

    Returns
    -------
    The fitted `cv` object. Its best score is also printed.
    """
    # The context manager closes the bar even when `cv.fit` raises.
    with tqdm(total=cv.total_iterations, desc=desc) as bar:
        def cb(_):
            # One tick per optimiser step, applied immediately.
            # Deliberately returns None: skopt treats a truthy callback
            # result as a request to stop, and `bar.update` can return True.
            bar.update(1)

        cv.fit(X, y, callback=cb)

        # If the search reported fewer steps than `total_iterations`, fill
        # the remainder so the bar always ends at 100%.
        remaining = bar.total - bar.n
        if remaining > 0:
            bar.update(remaining)

    print('Score: {}'.format(cv.best_score_))
    return cv

## Reference hyperparameters

A repository of sorts for the hyperparameter ranges and base models. Most of them won't be used, but I wanted to keep the hyperparameters around regardless.

In [7]:
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

def SparseLR(n_iter=100):
    """Bayesian search over an L2 logistic regression, scored by ROC AUC.

    Only the regularisation strength `C` is tuned, log-uniformly over
    [1e-4, 1e4], for `n_iter` iterations.
    """
    search_space = {'C': (1e-4, 1e4, 'log-uniform')}
    classifier = LogisticRegression(solver='liblinear',
                                    penalty='l2',
                                    random_state=seed)

    return bayes_cv(classifier, [(search_space, n_iter)], scoring='roc_auc')

def SparseLSVC(n_iter=75):
    """Bayesian search over an L1-penalised linear SVM, scored by ROC AUC.

    Only the regularisation strength `C` is tuned, log-uniformly over
    [1e-4, 1e4], for `n_iter` iterations.
    """
    # `LinearSVC` has no `solver` argument (passing one raises TypeError); the
    # loss is selected with `loss`. With `penalty='l1'` the only supported
    # combination is `loss='squared_hinge'` together with `dual=False`.
    est = LinearSVC(random_state=seed, penalty='l1', loss='squared_hinge',
                    dual=False)

    return bayes_cv(est, [({
        'C': (1e-4, 1e4, 'log-uniform'),
    }, n_iter)], scoring='roc_auc')

def SparseLGBM(n_iter=50):
    """Bayesian search over a LightGBM classifier, scored by ROC AUC.

    Tunes the boosting type, tree size, learning rate, number of trees (100 to
    1000) and the sampling / leaf-size regularisers for `n_iter` iterations.
    """
    # NOTE(review): LightGBM reportedly ignores `subsample` unless
    # `subsample_freq` > 0, which is not set here — confirm that this
    # dimension actually influences the fit.
    search_space = {
        'boosting_type': ['dart', 'gbdt'],
        'num_leaves': np.geomspace(15, 128, 10, dtype=int),
        'learning_rate': (0.005, 0.75, 'log-uniform'),
        'n_estimators': np.geomspace(100, 1000, 25, dtype=int),
        'colsample_bytree': (0.5, 1, 'uniform'),
        'min_child_weight': (1e-4, 1e-1, 'log-uniform'),
        'min_child_samples': np.geomspace(20, 250, 10, dtype=int),
        'subsample': (0.5, 1, 'uniform'),
    }
    booster = LGBMClassifier(random_state=seed)

    return bayes_cv(booster, [(search_space, n_iter)], scoring='roc_auc')

def DenseLGBM(n_iter=10):
    """Bayesian search over a LightGBM classifier, scored by accuracy.

    Tunes the boosting type, tree size, learning rate, number of trees (100 to
    300) and the sampling / leaf-size regularisers for `n_iter` iterations.
    """
    # NOTE(review): LightGBM reportedly ignores `subsample` unless
    # `subsample_freq` > 0, which is not set here — confirm that this
    # dimension actually influences the fit.
    search_space = {
        'boosting_type': ['dart', 'gbdt'],
        'num_leaves': np.geomspace(15, 128, 10, dtype=int),
        'learning_rate': (0.005, 0.75, 'log-uniform'),
        'n_estimators': np.geomspace(100, 300, 25, dtype=int),
        'colsample_bytree': (0.5, 1, 'uniform'),
        'min_child_weight': (1e-4, 1e-1, 'log-uniform'),
        'min_child_samples': np.geomspace(20, 250, 10, dtype=int),
        'subsample': (0.5, 1, 'uniform'),
    }
    booster = LGBMClassifier(random_state=seed)

    return bayes_cv(booster, [(search_space, n_iter)], scoring='accuracy')

def DenseXGB(n_iter=10):
    """Bayesian search over a multi-class DART XGBoost model, scored by accuracy.

    Tunes the number of trees, learning rate, row/column subsampling, tree
    depth and minimum child weight for `n_iter` iterations.
    """
    # The objective is multi-class (`multi:softprob`), so the matching
    # evaluation metric is `mlogloss`; `logloss` is the binary metric.
    # NOTE(review): `nthread` and `use_label_encoder` are deprecated in recent
    # xgboost releases — confirm against the installed version.
    est = XGBClassifier(random_state=seed, nthread=-1, objective='multi:softprob',
                        eval_metric='mlogloss', use_label_encoder=False, booster='dart')

    return bayes_cv(est, [({
        'n_estimators': np.geomspace(100, 300, 25, dtype=int),
        'learning_rate': (0.05, 1, 'log-uniform'),
        'subsample': (0.5, 1, 'uniform'),
        'max_depth': np.geomspace(2, 8, 4, dtype=int),
        'colsample_bytree': (0.5, 1, 'uniform'),
        'min_child_weight': (1, 5)
    }, n_iter)], scoring='accuracy')

def DenseRT(n_iter=10):
    """Bayesian search over a random forest classifier, scored by accuracy.

    Tunes the number of trees, the feature-subset rule and the minimum split
    and leaf sizes for `n_iter` iterations.
    """
    est = RandomForestClassifier(random_state=seed)

    return bayes_cv(est, [({
        'n_estimators': np.geomspace(100, 300, 25, dtype=int),
        # 'auto' is omitted: for classifiers it is an alias of 'sqrt' and is
        # rejected by recent scikit-learn releases.
        'max_features': ['sqrt', 'log2'],
        # An integer `min_samples_split` must be at least 2, so the grid
        # starts at 2 rather than 1.
        'min_samples_split': np.geomspace(2, 50, 8, dtype=int),
        # Truncation to int yields 1 twice; keep each candidate once.
        'min_samples_leaf': np.unique(np.geomspace(1, 50, 8, dtype=int))
    }, n_iter)], scoring='accuracy')

def DenseET(n_iter=10):
    """Bayesian search over an extra-trees classifier, scored by accuracy.

    Tunes the number of trees, the feature-subset rule and the minimum split
    and leaf sizes for `n_iter` iterations.
    """
    est = ExtraTreesClassifier(random_state=seed)

    return bayes_cv(est, [({
        'n_estimators': np.geomspace(100, 300, 25, dtype=int),
        # 'auto' is omitted: for classifiers it is an alias of 'sqrt' and is
        # rejected by recent scikit-learn releases.
        'max_features': ['sqrt', 'log2'],
        # An integer `min_samples_split` must be at least 2, so the grid
        # starts at 2 rather than 1.
        'min_samples_split': np.geomspace(2, 50, 8, dtype=int),
        # Truncation to int yields 1 twice; keep each candidate once.
        'min_samples_leaf': np.unique(np.geomspace(1, 50, 8, dtype=int))
    }, n_iter)], scoring='accuracy')

## Sparse models

A note: I decided to back up the models and CV results for further analysis, but they are quite large, so I don't attach them. Also, 100 iterations for the hyperparameter search may be overkill, especially since there's essentially only the penalty parameter `C`.

In [8]:
def sparse_model(shelf, label, prep_X, y=None):
    """Return the tuned one-vs-rest sparse model for the cuisine `label`.

    The fitted search is cached in `shelf` under `sparse/<label>/lr`; when
    that entry already exists it is reused and no fitting takes place.

    Parameters
    ----------
    shelf : mapping
        Cache for fitted search objects.
    label : str
        Cuisine treated as the positive class.
    prep_X
        Vectorised features, forwarded to the search on a cache miss.
    y : pandas.Series
        Cuisine labels; converted into a 0/1 target for `label`.

    Returns
    -------
    The `best_estimator_` of the (cached or freshly fitted) search.
    """
    entry = 'sparse/{}/lr'.format(label)
    binary_target = y.apply(lambda cuisine: int(cuisine == label))

    if entry in shelf:
        return shelf[entry].best_estimator_

    search = hyperfit(entry, SparseLR(), prep_X, binary_target)
    shelf[entry] = search
    return search.best_estimator_

## Dense model

In [9]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin

class SparseReducer(BaseEstimator, TransformerMixin):
    """Transformer exposing a binary classifier's scores as dense features.

    `code` is the encoded class treated as positive; `model` is the wrapped
    classifier, fitted on the one-vs-rest target for that class.
    """

    def __init__(self, code, model):
        self.code = code
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model on the target `1 if y == code else 0`."""
        is_positive = (y == self.code)
        self.model.fit(X, np.where(is_positive, 1, 0))
        return self

    def transform(self, X, y=None):
        """Return the model's output for `X` as a 2-D array.

        Uses the first of `predict_proba`, `decision_function` and `predict`
        that the model provides; 1-D outputs become a single column. Returns
        None when the model offers none of these methods.
        """
        candidates = ('predict_proba', 'decision_function', 'predict')
        method = next((name for name in candidates
                       if hasattr(self.model, name)), None)
        if method is None:
            return None

        scores = getattr(self.model, method)(X)
        if len(scores.shape) == 1:
            scores = scores.reshape(-1, 1)
        return scores

def DenseHead(reducers):
    """Build the search for the final model on top of the dense features.

    `reducers` is a list of `(name, transformer)` pairs combined with a
    `FeatureUnion`; its output feeds the estimator of `DenseLGBM()`, whose
    search spaces are re-addressed to the `dense` pipeline step.
    """
    template = DenseLGBM()
    union = FeatureUnion(reducers, n_jobs=-1)
    pipeline = Pipeline([
        ('reducer', union),
        ('dense', template.estimator),
    ])
    spaces = nest('dense', template.search_spaces)

    return bayes_cv(pipeline, spaces, scoring='accuracy')
            
class DenseModel:
    """Two-stage cuisine classifier.

    Ingredients are vectorised and TF-IDF weighted; an SVD projection and one
    sparse one-vs-rest model per cuisine turn those features into a dense
    representation, on which the final model is tuned. Fitted searches are
    cached in `shelf`.
    """

    def __init__(self, shelf):
        """Create the preprocessing pipeline and label encoder.

        `shelf` is a mapping used to cache the fitted searches.
        """
        self.prep = Pipeline([
            ('vect', Vectorizer()),
            ('tfidf', TfidfTransformer())
        ])
        self.encoder = LabelEncoder()
        self.shelf = shelf

    def fit(self, X, y=None):
        """Fit the preprocessing, the per-cuisine models and the final model.

        `X` holds the joined ingredient strings and `y` the cuisine labels.
        Returns `self`.
        """
        prep_X = self.prep.fit_transform(X, y)
        prep_y = self.encoder.fit_transform(y)

        # Seed the SVD like every other estimator in this notebook; without
        # `random_state` it draws from the global NumPy generator.
        reducers = [('svd', TruncatedSVD(n_components=60, random_state=seed))]
        for code, label in tqdm(list(enumerate(self.encoder.classes_)), desc='Sparse models'):
            # The sparse models take the raw labels `y`; the reducer takes
            # the encoded class `code`.
            model = sparse_model(self.shelf, label, prep_X, y)
            reducer = SparseReducer(code, model)
            reducers.append((label, reducer))

        # Reuse the cached final search when one exists.
        if 'dense' not in self.shelf:
            final_cv = hyperfit('dense', DenseHead(reducers), prep_X, prep_y)
            self.shelf['dense'] = final_cv
        else:
            final_cv = self.shelf['dense']

        self.final = final_cv.best_estimator_
        return self

    def predict(self, X):
        """Return the predicted cuisine label for each ingredient string in `X`."""
        prep_X = self.prep.transform(X)
        predictions = self.final.predict(prep_X)
        return self.encoder.inverse_transform(predictions)

## Main

In [10]:
import uuid
import shelve

def main(uid=None, clear=False):
    """Train the model and write test-set predictions to `results/<uid>.csv`.

    Parameters
    ----------
    uid : str, optional
        Name of the run; selects the shelf `results/<uid>.shelf` that caches
        the datasets and fitted searches. A random UUID is used when omitted.
    clear : bool
        When true, empty the shelf first so that everything is recomputed.
    """
    if uid is None:
        uid = str(uuid.uuid4())
    print('UID: {}'.format(uid))

    # Create the output directory if it does not exist yet.
    os.makedirs('results', exist_ok=True)

    with shelve.open('results/{}.shelf'.format(uid), writeback=True) as shelf:
        if clear:
            shelf.clear()

        if 'train' not in shelf:
            # Seed the shuffle explicitly so it does not depend on how much of
            # the global NumPy random state earlier cells have consumed.
            train = train_ds().sample(frac=1, random_state=seed)
            shelf['train'] = train
        else:
            train = shelf['train']

        if 'test' not in shelf:
            test = test_ds()
            shelf['test'] = test
        else:
            test = shelf['test']

        model = DenseModel(shelf)
        model.fit(train.X, train.y)
        test['cuisine'] = model.predict(test.X)
        test[['id', 'cuisine']].to_csv('results/{}.csv'.format(uid),
                                       index=False)

In [11]:
# Run the pipeline under the 'hybrid' shelf, reusing anything already cached there.
main(uid='hybrid', clear=False)

UID: hybrid


Sparse models:   0%|          | 0/20 [00:00<?, ?it/s]

dense:   0%|          | 0/10 [00:00<?, ?it/s]

Score: 0.7974053401719716
