In [1]:
import pandas as pd
import numpy as np
import re

# preprocess
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import GenericUnivariateSelect, chi2


from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn.model_selection import train_test_split

# model
from sklearn.neural_network import MLPClassifier

import tensorflow as tf

from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

In [2]:
# load raw data csv files
"""
attributes: name, n_steps, n_ingredients, steps, ingredients
"""
df_train = pd.read_csv("resources/datasets/recipe_train.csv")
df_test = pd.read_csv("resources/datasets/recipe_test.csv")

In [3]:
def preprocess(df):
    arr_ingr = df['ingredients'].copy().to_numpy()
    arr_steps = df['steps'].copy().to_numpy()
    arr_name = df['name'].copy().to_numpy()
    _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
    ## remove all puntuation
    for i in range(len(arr_steps)):
        arr_steps[i] = re.sub(r'[^\w\s]', '', str(arr_steps[i]))
        arr_steps[i] = _RE_COMBINE_WHITESPACE.sub(" ", arr_steps[i]).strip() + ' '
        arr_ingr[i] = re.sub(r'[^\w\s]', '', str(arr_ingr[i]))
        arr_ingr[i] = _RE_COMBINE_WHITESPACE.sub(" ", arr_ingr[i]).strip() + ' '
        arr_name[i] = re.sub(r'[^\w\s]', '', str(arr_name[i]))
        arr_name[i] = _RE_COMBINE_WHITESPACE.sub(" ", arr_name[i]).strip()

    # combined all three features
    X = arr_steps + arr_ingr + arr_name
    return X

In [4]:
X = preprocess(df_train)
y = df_train['duration_label']

# include both uni-grams and bi-grams
# exclude stop words
vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), analyzer='word', stop_words= 'english')
X = vectorizer.fit_transform(X)

print("Shape of X (nrow, ncol):", X.shape)

Shape of X (nrow, ncol): (40000, 580978)


In [5]:
fselect = GenericUnivariateSelect(chi2, mode='percentile', param=20)
X_new = fselect.fit_transform(X, y)
X_new.shape

(40000, 116196)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=20)
mlp = MLPClassifier(random_state=1, max_iter=200, verbose=True)
mlp.fit(X_train, y_train)

Iteration 1, loss = 0.65022812
Iteration 2, loss = 0.31249984
Iteration 3, loss = 0.16037653
Iteration 4, loss = 0.08124064
Iteration 5, loss = 0.04354741
Iteration 6, loss = 0.02616517
Iteration 7, loss = 0.01811297
Iteration 8, loss = 0.01424194
Iteration 9, loss = 0.01205843
Iteration 10, loss = 0.01066542
Iteration 11, loss = 0.00976182
Iteration 12, loss = 0.00912615
Iteration 13, loss = 0.00860043
Iteration 14, loss = 0.00805948
Iteration 15, loss = 0.00767774
Iteration 16, loss = 0.00732637
Iteration 17, loss = 0.00700503
Iteration 18, loss = 0.00670655
Iteration 19, loss = 0.00642512
Iteration 20, loss = 0.00616396




MLPClassifier(random_state=1, verbose=True)

In [9]:
mlp.score(X_test, y_test)

0.95

In [14]:
X_train = preprocess(df_train)
X_test = preprocess(df_test)
y_train = df_train['duration_label']
X = np.concatenate((X_train, X_test), axis=0)

# transform into sparse
vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), analyzer='word', stop_words= 'english')
vectorizer.fit(X)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# feature selection
fselect = GenericUnivariateSelect(chi2, mode='percentile', param=20)
fselect.fit(X_train, y_train)
X_train_new = fselect.transform(X_train)
X_test_new = fselect.transform(X_test)

In [20]:
mlp = MLPClassifier(max_iter=200, verbose=True, early_stopping=True)
mlp.fit(X_train_new, y_train)

Iteration 1, loss = 0.64523725
Validation score: 0.825250
Iteration 2, loss = 0.31237514
Validation score: 0.853500
Iteration 3, loss = 0.15228023
Validation score: 0.863250
Iteration 4, loss = 0.07156563
Validation score: 0.864250
Iteration 5, loss = 0.03664685
Validation score: 0.864000
Iteration 6, loss = 0.02227228
Validation score: 0.862750
Iteration 7, loss = 0.01598737
Validation score: 0.860750
Iteration 8, loss = 0.01281167
Validation score: 0.861000
Iteration 9, loss = 0.01097917
Validation score: 0.859000
Iteration 10, loss = 0.00982452
Validation score: 0.858000
Iteration 11, loss = 0.00904910
Validation score: 0.855250
Iteration 12, loss = 0.00837114
Validation score: 0.854500
Iteration 13, loss = 0.00792431
Validation score: 0.851000
Iteration 14, loss = 0.00749891
Validation score: 0.850250
Iteration 15, loss = 0.00715423
Validation score: 0.847500
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(early_stopping=True, verbose=True)

In [21]:
predicts = mlp.predict(X_test_new)

In [22]:
ids = np.array(range(len(predicts))) + 1
output = pd.DataFrame({'id': ids, 'duration_label': predicts})
output.to_csv('output_test.csv', index=False)