In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [None]:
# Read both JSON files, then stack them row-wise (train rows first) into one frame.
train = pd.read_json("/data/cuisine_data/train.json")
test = pd.read_json("/data/cuisine_data/test.json")
df = pd.concat([train, test])

In [None]:
df.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [None]:
import nltk
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing


# Extra words filtered out during preprocessing, on top of NLTK's English
# stop words (process_data appends this list to the NLTK one).
additional_stop_words = [
    "advertisement",
    "advertisements",
    "cup",
    "cups",
    "tablespoon",
    "tablespoons",
    "teaspoon",
    "teaspoons",
    "ounce",
    "ounces",
    "salt",
    "pepper",
    "pound",
    "pounds",
]

# Fetch the NLTK resources used below: WordNet (for the lemmatizer) and the
# stop-word corpus.
nltk.download('wordnet')
nltk.download("stopwords")

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise free text into a cleaned, single-space-separated string.

    Steps, in order: lowercase and strip; remove punctuation; remove digits;
    tokenize on whitespace; drop stop words; optionally stem; optionally
    lemmatize; join the tokens with single spaces.

    Digits are removed *before* tokenization so that tokens such as "2cups"
    are seen as "cups" by the stop-word filter and the lemmatizer, and so
    that digit-only tokens disappear without leaving stray spaces behind.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        flg_stemm: If truthy, apply NLTK's Porter stemmer to every token.
        flg_lemm: If truthy, apply NLTK's WordNet lemmatizer to every token.
        lst_stopwords: Optional collection of words to discard.

    Returns:
        The cleaned text, without leading, trailing or repeated spaces.
    """
    ## clean (convert to lowercase, strip, then remove punctuation and symbols)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Remove digits up front (see docstring for why this precedes tokenizing)
    text = ''.join(ch for ch in text if not ch.isdigit())

    ## Tokenize (convert from string to list); split() also collapses whitespace
    lst_text = text.split()

    ## remove Stopwords (a set keeps the membership test cheap per token)
    if lst_stopwords is not None:
        stopwords = frozenset(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list; tokens are non-empty, so spacing is already clean
    return " ".join(lst_text)

def process_data(df):
    """Turn the raw recipe frame into a cleaned frame ready for modelling.

    The ``ingredients`` lists are joined into space-separated strings, rows
    containing missing values are dropped, the ``id`` column is removed, the
    index is reset, and an ``ingredients_query`` column is added holding the
    lemmatized, stop-word-filtered version of ``ingredients``.

    Args:
        df: DataFrame with an ``ingredients`` column (lists of strings, or
            strings that were already joined) and optionally an ``id`` column.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is not mutated; this keeps the
    # function safe to call again on the same input.
    dataset = df.copy()

    def processing(ls):
        # Only list-like values are joined. Already-joined strings pass
        # through unchanged (joining a str would space out its characters),
        # and missing values are left for dropna() below.
        if isinstance(ls, (list, tuple)):
            return ' '.join(ls)
        return ls

    dataset['ingredients'] = dataset['ingredients'].apply(processing)
    dataset = dataset.dropna()
    # errors='ignore': tolerate frames whose id column was already removed.
    dataset = dataset.drop(columns=['id'], errors='ignore').reset_index(drop=True)

    # NLTK's English stop words plus the notebook's extra words.
    stop_word_list = nltk.corpus.stopwords.words("english") + list(additional_stop_words)

    dataset["ingredients_query"] = dataset["ingredients"].apply(
        lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True,
                                        lst_stopwords=stop_word_list))
    return dataset

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Join ingredient lists into strings, drop rows with missing values and the
# id column, and add the cleaned `ingredients_query` column.
df = process_data(df)

In [None]:
df

Unnamed: 0,cuisine,ingredients
0,greek,romaine lettuce black olives grape tomatoes ga...
1,southern_us,plain flour ground pepper salt tomatoes ground...
2,filipino,eggs pepper salt mayonaise cooking oil green c...
3,indian,water vegetable oil wheat salt
4,indian,black pepper shallots cornflour cayenne pepper...
...,...,...
39769,irish,light brown sugar granulated sugar butter warm...
39770,italian,KRAFT Zesty Italian Dressing purple onion broc...
39771,irish,eggs citrus fruit raisins sourdough starter fl...
39772,chinese,boneless chicken skinless thigh minced garlic ...


In [None]:
# Integer-encode the cuisine label using pandas' categorical codes, then
# preview the first rows to check the new column.
df['encoded_text'] = df['cuisine'].astype('category').cat.codes
df.head(10)

Unnamed: 0,cuisine,ingredients,encoded_text
0,greek,romaine lettuce black olives grape tomatoes ga...,6
1,southern_us,plain flour ground pepper salt tomatoes ground...,16
2,filipino,eggs pepper salt mayonaise cooking oil green c...,4
3,indian,water vegetable oil wheat salt,7
4,indian,black pepper shallots cornflour cayenne pepper...,7
5,jamaican,plain flour sugar butter eggs fresh ginger roo...,10
6,spanish,olive oil salt medium shrimp pepper garlic cho...,17
7,italian,sugar pistachio nuts white almond bark flour v...,9
8,mexican,olive oil purple onion fresh pineapple pork po...,13
9,italian,chopped tomatoes fresh basil garlic extra-virg...,9


In [None]:
# Plain Python lists of the ingredient strings and their integer labels.
data_texts = df['ingredients'].tolist()
data_labels = df['encoded_text'].tolist()

In [None]:
# [cuisine, code] lookup table. It is derived from the same categorical
# encoding that defines `encoded_text`, so entry i is by construction the
# cuisine whose code is i.
label_names = [
    [cuisine, code]
    for code, cuisine in enumerate(df['cuisine'].astype('category').cat.categories)
]

label_names

[['brazilian', 0],
 ['british', 1],
 ['cajun_creole', 2],
 ['chinese', 3],
 ['filipino', 4],
 ['french', 5],
 ['greek', 6],
 ['indian', 7],
 ['irish', 8],
 ['italian', 9],
 ['jamaican', 10],
 ['japanese', 11],
 ['korean', 12],
 ['mexican', 13],
 ['moroccan', 14],
 ['russian', 15],
 ['southern_us', 16],
 ['spanish', 17],
 ['thai', 18],
 ['vietnamese', 19]]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cuisine       39774 non-null  object
 1   ingredients   39774 non-null  object
 2   encoded_text  39774 non-null  int8  
dtypes: int8(1), object(2)
memory usage: 660.4+ KB


### **Data Preparation**

In [None]:
# Widen the compact category codes to the default integer dtype, then
# re-check the column dtypes.
df['encoded_text'] = df['encoded_text'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cuisine       39774 non-null  object
 1   ingredients   39774 non-null  object
 2   encoded_text  39774 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [None]:
df['encoded_text'].value_counts()

9     7838
13    6438
16    4320
7     3003
3     2673
5     2646
2     1546
18    1539
11    1423
6     1175
17     989
12     830
19     825
14     821
1      804
4      755
8      667
10     526
15     489
0      467
Name: encoded_text, dtype: int64

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize a single row to inspect the encoding: the text is padded or
# truncated to 256 tokens, special tokens are added, and TensorFlow tensors
# are returned.
token = tokenizer.encode_plus(
    df['ingredients'].iloc[1],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [None]:
df

Unnamed: 0,cuisine,ingredients,encoded_text
0,greek,romaine lettuce black olives grape tomatoes ga...,6
1,southern_us,plain flour ground pepper salt tomatoes ground...,16
2,filipino,eggs pepper salt mayonaise cooking oil green c...,4
3,indian,water vegetable oil wheat salt,7
4,indian,black pepper shallots cornflour cayenne pepper...,7
...,...,...,...
39769,irish,light brown sugar granulated sugar butter warm...,8
39770,italian,KRAFT Zesty Italian Dressing purple onion broc...,9
39771,irish,eggs citrus fruit raisins sourdough starter fl...,8
39772,chinese,boneless chicken skinless thigh minced garlic ...,3


In [None]:
token

{'input_ids': <tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  6188, 15068,  1747, 18700,  6870, 26422,  1279,  1747,
         1602, 18700, 21153,  3263,  6471,  2448, 26422,  1279,  3431,
        11184,  7696,  6831, 17690,  2949,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [None]:
# Pre-allocate one 256-wide row per sample for token ids and attention masks.
# int32 matches the dtype declared on the model's Input layers and uses half
# the memory of numpy's default float64.
X_input_ids = np.zeros((len(df), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(df), 256), dtype=np.int32)

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every recipe and fill the pre-allocated arrays in place.

    Args:
        df: Frame with an ``ingredients`` column of strings.
        ids: 2-D array with one row per sample; receives the token ids.
        masks: Array shaped like ``ids``; receives the attention masks.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).

    Returns:
        The same ``ids`` and ``masks`` arrays, now populated.
    """
    texts = df['ingredients']
    # Pad/truncate to the width of the destination buffer so the encoded row
    # always fits (256 for the buffers this notebook allocates).
    max_length = ids.shape[1]

    # Wrap the sized iterable (not the enumerate object) so tqdm knows the
    # total and can display real progress.
    for i, text in enumerate(tqdm(texts, total=len(texts))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            # numpy output: the values are copied straight into numpy buffers,
            # so building a TensorFlow tensor per row would be wasted work.
            return_tensors='np'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [None]:
# Target matrix: one row per sample, one column per cuisine class (20 classes).
labels = np.zeros((len(df), 20))
labels.shape

(39774, 20)

In [None]:
# For each row, set the column given by that row's class code to 1.
labels[np.arange(len(df)), df['encoded_text'].values] = 1 # one-hot encoded target tensor

In [None]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Build a tf.data pipeline over (input ids, attention masks, one-hot labels);
# slicing along the first axis makes each dataset element one sample.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # displays the element spec of a one-element dataset, not the values themselves

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(20,), dtype=tf.float64, name=None))>

In [None]:
def CuisineDatasetMapFunction(input_ids, attn_masks, labels):
    """Repack an (ids, mask, label) triple as ``(features, label)``.

    ``features`` is a dict keyed by ``'input_ids'`` and ``'attention_mask'``;
    ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
dataset = dataset.map(CuisineDatasetMapFunction) # converting to required format for tensorflow dataset

In [None]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(20,), dtype=tf.float64, name=None))>

In [None]:
# Shuffle, then group into batches of 16 (dropping the final partial batch).
# reshuffle_each_iteration=False is essential here: the train/validation
# split is carved out of this dataset with take()/skip(), and a fresh shuffle
# on every pass would let samples move between the two subsets (validation
# data leaking into training). The seed makes the split reproducible.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False).batch(16, drop_remainder=True)

In [None]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 20), dtype=tf.float64, name=None))>

In [None]:
# Fraction of the batches used for training.
p = 0.8
# The batched dataset holds len(df)//16 batches (batch size 16, remainder
# dropped); train_size is the number of those batches used for training.
train_size = int((len(df)//16)*p) # counted in batches, not in samples

In [None]:
train_size

1988

In [None]:
# Split by batch count: the first `train_size` batches are used for training,
# the remaining batches for validation.
# NOTE(review): both subsets re-iterate the shuffled `dataset`, so they are
# only disjoint if the shuffle yields the same order on every pass; confirm
# the shuffle's reshuffle_each_iteration setting.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

### **Model**

In [None]:
from transformers import TFBertModel

In [None]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Two 256-wide int32 input layers; their names match the keys of the feature
# dict produced by the dataset map function.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# Run the inputs through the BERT main layer and keep element 1 of its output.
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
# Classification head: a 512-unit ReLU layer followed by a 20-way softmax.
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(20, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> one probability per class

# Wire the inputs and the head into a single Keras model and print its layers.
cuisine_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
cuisine_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [None]:
# Training configuration: Adam with learning rate 1e-5, categorical
# cross-entropy (the targets are one-hot rows), and categorical accuracy
# reported under the name 'accuracy'.
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
cuisine_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
# Train for 2 epochs on the training batches, evaluating on val_dataset;
# the returned History object is kept in `hist`.
hist = cuisine_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [None]:
# Persist the trained model in HDF5 format. Note that the target path has no
# ".h5" extension; the format is selected by save_format alone.
cuisine_model.save('cuisine_model',save_format="h5")
# tf.saved_model.save(cuisine_model, 'cuisine_classification_model')

### **Prediction**

In [None]:
# Reload the model from the path it was saved to, and recreate the same
# 'bert-base-cased' tokenizer that was used to encode the training data.
cuisine_model = tf.keras.models.load_model('cuisine_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    """Encode one piece of text into the feature dict passed to the model.

    The text is tokenized with special tokens and padded or truncated to 256
    tokens. The resulting id and mask tensors are cast to float64 and returned
    under the keys ``'input_ids'`` and ``'attention_mask'``.
    """
    encoding = tokenizer.encode_plus(
        input_text,
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )
    ids_as_float = tf.cast(encoding.input_ids, tf.float64)
    mask_as_float = tf.cast(encoding.attention_mask, tf.float64)
    return {'input_ids': ids_as_float, 'attention_mask': mask_as_float}


def make_prediction(model, processed_data, classes=('Brazilian', 'British', 'Cajun Creole', 'Chinese', 'Filipino', 'French', 'Greek', 'Indian', 'Irish', 'Italian', 'Jamaican', 'Japanese', 'Korean', 'Mexican', 'Moroccan', 'Russian', 'Southern US', 'Spanish', 'Thai', 'Vietnamese'), top_k=5):
    """Return the ``top_k`` most probable classes for one prepared input.

    Args:
        model: Object whose ``predict(processed_data)`` returns a batch of
            class-probability vectors; only the first row is used.
        processed_data: Model input, e.g. the dict built by ``prepare_data``.
        classes: Class names, indexed by position in the model output.
        top_k: Maximum number of predictions to return. Values <= 0 yield an
            empty list; values above the class count return every class.

    Returns:
        A list of ``(class_name, probability)`` tuples, most probable first.
    """
    if top_k <= 0:
        # Guard: slicing with [-0:] would select every class instead of none,
        # and negative values would produce arbitrary slices.
        return []
    probs = model.predict(processed_data)[0]
    # Indices sorted by descending probability, truncated to the k best.
    top_indices = np.argsort(probs)[::-1][:top_k]
    return [(classes[i], probs[i]) for i in top_indices]



In [None]:
# Interactive demo: read an ingredient list typed by the user, encode it, and
# display the top predicted cuisines with their probabilities.
input_text = input('Enter ingredients here: ')
processed_data = prepare_data(input_text, tokenizer)
results = make_prediction(cuisine_model, processed_data=processed_data)
results
# print("Top 5 Predicted Cuisine:")
# for i, (cuisine, probability) in enumerate(results, 1):
#     print(f"Cuisine: {cuisine}, Probability: {probability}")





Enter ingredients here: ramen seaweed charsiu


[('Japanese', 0.96810406),
 ('Korean', 0.012458365),
 ('Chinese', 0.0063414807),
 ('Vietnamese', 0.0021053778),
 ('Indian', 0.001913298)]