# Exploration
Author contact: [adam_lewandowski_1998@outlook.com](mailto:adam_lewandowski_1998@outlook.com)

In [None]:
from itables import init_notebook_mode

# Enable itables rendering for this notebook; `all_interactive=True` presumably
# applies it to every displayed DataFrame — confirm against the itables docs.
init_notebook_mode(all_interactive=True)


Load table

In [None]:
import pandas as pd


def load_table(path="../Webscrappers/out/results_classification.csv"):
    """Read the classification results CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the scraper output used by this
        notebook, so calling ``load_table()`` with no argument behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as returned by ``pandas.read_csv``.
    """
    return pd.read_csv(path)


# Notebook-level frame; the cells below read and reassign `df`.
df = load_table()


Show table

In [None]:
# Bare expression: the notebook shows the frame as this cell's output.
df


Remove unimportant labels, then drop the recipes that are left without any label

In [None]:
# Drop the label columns this notebook treats as unimportant.
# `errors="ignore"` keeps the cell re-runnable: `df` is reassigned here, so on a
# second execution the columns are already gone and a strict drop would raise
# KeyError. Trade-off: a misspelled column name is skipped silently.
df = df.drop(columns=["Favorites ", "*Member Recipes"], errors="ignore")
# Every column after the first is treated as a label; flag rows whose labels
# are all zero and keep the rest.
to_remove_empty = (df[df.columns[1:]] == 0).all(axis=1)
df = df[~to_remove_empty]
df


Show information about non-null counts with assigned types

In [None]:
# Prints per-column dtypes and non-null counts (the call itself returns None).
df.info()


Load the spaCy model and parse the `object` column

In [None]:
import spacy

# Load the large English pipeline once for the exploration cells.
nlp = spacy.load("en_core_web_lg")
# Parse every value of the `object` column; materialised as a list because the
# exploration helpers below iterate over `docs` more than once.
docs = [parsed for parsed in nlp.pipe(df["object"])]

Show lemmas

In [None]:
def show_lemmas(documents=None):
    """Tabulate the texts whose lemmatized form differs from the original.

    Parameters
    ----------
    documents : iterable, optional
        Parsed documents; each must support ``doc[:]`` returning an object with
        a ``lemma_`` attribute and a string form. Defaults to the notebook-level
        ``docs`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``lemma`` and ``original``, one row per document whose lemma
        text is not identical to its original text.
    """
    if documents is None:
        documents = docs

    changed = []
    for doc in documents:
        whole_span = doc[:]
        lemma_text = str(whole_span.lemma_)
        original_text = str(whole_span)
        if lemma_text != original_text:
            changed.append({"lemma": lemma_text, "original": original_text})

    # Explicit columns keep the schema stable even when no text changed.
    return pd.DataFrame(changed, columns=["lemma", "original"])


# Bare call so the returned frame is rendered as the cell output.
show_lemmas()


Show entities

In [None]:
def show_entities(documents=None):
    """List every named entity found in the parsed documents.

    Parameters
    ----------
    documents : iterable, optional
        Parsed documents exposing an ``ents`` iterable whose items have a
        ``label_`` attribute and a string form. Defaults to the notebook-level
        ``docs`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_id`` (position of the document in ``documents``),
        ``entity`` (entity text) and ``type`` (entity label), one row per
        entity occurrence.
    """
    if documents is None:
        documents = docs

    rows = [
        {"doc_id": doc_id, "entity": str(ent), "type": str(ent.label_)}
        for doc_id, doc in enumerate(documents)
        for ent in doc.ents
    ]

    # Explicit columns: without them an entity-free corpus yields a frame with
    # no `type`/`entity` columns, which breaks column access downstream.
    return pd.DataFrame(rows, columns=["doc_id", "entity", "type"])


# Keep the frame in a notebook-level name: the frequency cells below read `entities`.
entities = show_entities()
entities


Relative frequency of each entity type

In [None]:
# Share of each entity label among all detected entities, rounded to 4 decimals.
entities["type"].value_counts(normalize=True).round(4)



Relative frequency of each (entity, type) pair

In [None]:
# Share of each distinct (entity text, label) combination.
entities.value_counts(subset=["entity", "type"], normalize=True)



Show most similar docs

In [None]:
import numpy as np


def show_similar_docs(documents=None, threshold=0.9):
    """List pairs of documents whose vectors have cosine similarity above a threshold.

    Parameters
    ----------
    documents : iterable, optional
        Objects exposing ``vector`` (1-D array), ``vector_norm`` (float) and a
        string form. Defaults to the notebook-level ``docs`` list.
    threshold : float, optional
        A pair is reported when its cosine similarity is strictly greater than
        this value. Defaults to 0.9.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_1`` and ``doc_2`` holding the texts of each matching
        pair. Both orderings of a pair are listed, and pairs whose two texts
        are identical are filtered out.
    """
    documents = docs if documents is None else list(documents)
    columns = ["doc_1", "doc_2"]
    if len(documents) == 0:
        # No vectors to compare; return the same schema with no rows.
        return pd.DataFrame(columns=columns)

    # Unit-normalise each vector. A zero norm is left undivided so the row
    # stays a zero vector instead of turning into NaN.
    unit_vectors = np.array(
        [d.vector / d.vector_norm if d.vector_norm else d.vector for d in documents]
    )

    similarities = unit_vectors @ unit_vectors.T
    # A document is never reported as similar to itself.
    np.fill_diagonal(similarities, 0)
    pair_indices = np.argwhere(similarities > threshold)

    texts = np.array([str(d) for d in documents])
    return pd.DataFrame(texts[pair_indices], columns=columns).query(
        "doc_1 != doc_2"
    )


# Bare call so the returned frame is rendered as the cell output.
show_similar_docs()


# Models

## BOW model

Encode `X` and `y`

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Features: the raw values of the `object` column.
X = df["object"].values

# Targets: for every row, collect the names of the label columns (everything
# after the first column) holding a positive value, then binarise those name
# lists into a multi-label indicator matrix.
y_encoder = MultiLabelBinarizer()
y = y_encoder.fit_transform(
    [
        [label for label, value in record.items() if value > 0]
        for record in df[df.columns[1:]].to_dict("records")
    ]
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer


def lemmatize(x, nlp=None):
    """Reduce each text to a space-joined string of lower-cased content lemmas.

    Only alphabetic tokens tagged NOUN, VERB, PROPN or ADJ are kept.

    Parameters
    ----------
    x : iterable of str
        Texts to process.
    nlp : optional
        Object exposing ``pipe(texts)`` that yields iterables of tokens with
        ``lemma_``, ``is_alpha`` and ``pos_`` attributes. When omitted, the
        ``en_core_web_lg`` spaCy model is loaded on every call; pass an
        already-loaded pipeline to avoid that cost.

    Returns
    -------
    list of str
        Exactly one string per input text, in input order. A text with no
        qualifying token maps to the empty string.
    """
    required_pos = ("NOUN", "VERB", "PROPN", "ADJ")

    if nlp is None:
        # Imported and loaded inside the function so it stays self-contained
        # when it is shipped to worker processes or pickled with the pipeline.
        import spacy

        nlp = spacy.load("en_core_web_lg")

    # One output per input is mandatory for a pipeline step: dropping texts
    # that end up empty would misalign the transformed samples with `y` during
    # fit and return fewer predictions than inputs during predict.
    return [
        " ".join(
            tok.lemma_.lower()
            for tok in doc
            if tok.is_alpha and tok.pos_ in required_pos
        )
        for doc in nlp.pipe(x)
    ]


# Text -> features: lemmatise each text, then weight the lemmas with TF-IDF.
# `lowercase=False` because `lemmatize` already lower-cases its output.
preprocessing = Pipeline(
    steps=[
        ("preprocessor", FunctionTransformer(func=lemmatize)),
        (
            "vectorizer",
            TfidfVectorizer(norm="l1", binary=False, lowercase=False),
        ),
    ],
    verbose=True,
)

# Two base classifiers whose predicted probabilities are averaged (soft voting).
model_1 = LogisticRegression(
    solver="saga",
    penalty="elasticnet",
    l1_ratio=0.2,
    class_weight="balanced",
    max_iter=10_000,
    n_jobs=-1,
)
# `probability=True` is required so the SVC can take part in soft voting.
model_2 = SVC(kernel="sigmoid", probability=True)
voting = VotingClassifier(
    estimators=[("model_1", model_1), ("model_2", model_2)],
    voting="soft",
    n_jobs=-1,
)

# Full model: preprocessing followed by one voting ensemble per label column.
model_ensemble = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        (
            "multi-class mult-label",
            MultiOutputClassifier(estimator=voting, n_jobs=-1),
        ),
    ],
    verbose=True,
)
model_ensemble


In [None]:
# Fit on the full data set; `m` holds whatever `fit` returns (by scikit-learn
# convention the fitted estimator itself).
m = model_ensemble.fit(X, y)
# Scored on the same data that was used for fitting, so this is a training
# score, not a held-out estimate.
print(m.score(X, y))
# Smoke test on a few hand-written inputs, decoded back to label names.
y_hat = m.predict(["christmas cake chicken salad", "lite pancake", "salad"])
y_encoder.inverse_transform(y_hat)

Based on the mean results of `cross validation` we will first get the best architecture, then extract the best model.

In [None]:
import warnings
from sklearn.model_selection import learning_curve

# All warnings raised inside this block are silenced.
# NOTE(review): this may also hide the reason behind NaN scores, which the
# following cells replace with 0 — confirm no fold is failing silently.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    train_sizes, train_scores, valid_scores = learning_curve(
        estimator=model_ensemble,
        X=X,
        y=y,
        shuffle=True,
        verbose=True,
        n_jobs=-1,
    )

Architecture selection based on training score.

In [None]:
# Pass `nan=` by keyword: the second positional parameter of np.nan_to_num is
# `copy`, so a positional 0 requests an in-place overwrite of `train_scores`
# instead of setting the replacement value.
# NaN scores count as 0; the mean is taken along axis 1 (one value per row).
np.nan_to_num(train_scores, nan=0.0).mean(axis=1)


Architecture selection based on validation score.

In [None]:
# Pass `nan=` by keyword: the second positional parameter of np.nan_to_num is
# `copy`, so a positional 0 requests an in-place overwrite of `valid_scores`
# instead of setting the replacement value.
# NaN scores count as 0; the mean is taken along axis 1 (one value per row).
np.nan_to_num(valid_scores, nan=0.0).mean(axis=1)


In [None]:
from sklearn.model_selection import KFold

def get_best_cv_model(model, n_splits=5, random_state=None):
    """Fit a copy of ``model`` on each K-fold split and keep the best-scoring one.

    Reads the notebook-level ``X`` and ``y`` arrays.

    Parameters
    ----------
    model : estimator
        Estimator supporting ``fit(X, y)`` and ``score(X, y)``. It is cloned
        for every fold, so the object passed in is not refitted.
    n_splits : int, optional
        Number of shuffled folds. Defaults to 5.
    random_state : int or None, optional
        Seed for the fold shuffling; ``None`` (default) gives a different
        split on every call.

    Returns
    -------
    dict
        ``{'score': <held-out score>, 'model': <estimator fitted on that fold>}``
        for the fold with the highest held-out score.
    """
    from sklearn.base import clone

    # Start below any reachable score so a fold scoring exactly 0 still yields
    # a fitted model instead of leaving 'model' as None.
    best = {'score': float('-inf'), 'model': None}
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fresh clone per fold: refitting one shared object would leave every
        # stored reference pointing at the estimator fitted on the last fold,
        # whatever fold actually scored best.
        candidate = clone(model)
        candidate.fit(X_train, y_train)
        score = candidate.score(X_test, y_test)

        if score > best['score']:
            best = {'score': score, 'model': candidate}

    return best

# Result is a dict with 'score' and 'model' keys; the next cell persists best_model['model'].
best_model = get_best_cv_model(model_ensemble)

In [None]:
import joblib
# Attach the label encoder to the estimator so the single dumped file carries
# both the model and the means to decode its predictions.
best_model["model"].y_encoder = y_encoder
joblib.dump(best_model["model"], "../out/recipy_category_model.joblib")

Test that the saved model can be loaded

In [None]:
import joblib

def lemmatize(x, nlp=None):
    """Reduce each text to a space-joined string of lower-cased content lemmas.

    Only alphabetic tokens tagged NOUN, VERB, PROPN or ADJ are kept.

    NOTE(review): presumably redefined in this cell because the saved pipeline
    refers to ``lemmatize`` by name, so the loading session must define it —
    confirm before removing.

    Parameters
    ----------
    x : iterable of str
        Texts to process.
    nlp : optional
        Object exposing ``pipe(texts)`` that yields iterables of tokens with
        ``lemma_``, ``is_alpha`` and ``pos_`` attributes. When omitted, the
        ``en_core_web_lg`` spaCy model is loaded on every call; pass an
        already-loaded pipeline to avoid that cost.

    Returns
    -------
    list of str
        Exactly one string per input text, in input order. A text with no
        qualifying token maps to the empty string.
    """
    required_pos = ("NOUN", "VERB", "PROPN", "ADJ")

    if nlp is None:
        # Imported and loaded inside the function so it stays self-contained.
        import spacy

        nlp = spacy.load("en_core_web_lg")

    # One output per input is mandatory for a pipeline step: dropping texts
    # that end up empty would return fewer predictions than inputs.
    return [
        " ".join(
            tok.lemma_.lower()
            for tok in doc
            if tok.is_alpha and tok.pos_ in required_pos
        )
        for doc in nlp.pipe(x)
    ]

# Reload the pipeline written by the dump cell. joblib files are pickle-based,
# so only load files produced by this notebook or another trusted source.
clf = joblib.load("../out/recipy_category_model.joblib")

Prediction test

In [None]:
# Predict with the reloaded pipeline, then decode the indicator rows into label
# names using the encoder that was attached to the model before saving.
out = clf.predict(["chicken salad", "christmas lite salad"])
clf.y_encoder.inverse_transform(out)