In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats
import random
import os
import tensorflow as tf
import math
import cv2

In [3]:
SIZE = 500

seed = 505
img_size = 256
img_path = 'kaggle_dataset/dataset_1_images/dataset_1_images/Food Images/'


In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's legacy
            global generator and TensorFlow's global generator. It is also
            exported as the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    

In [5]:
def image_mapping_check(dataset):
    """Return a copy of ``dataset`` without rows whose image cannot be loaded.

    Each row's ``Image_Name`` is resolved to ``img_path + name + '.jpg'`` and
    opened with ``cv2.imread``. Rows for which OpenCV returns ``None`` (file
    missing or undecodable) are dropped.

    Args:
        dataset: DataFrame with an ``Image_Name`` column of file stems.

    Returns:
        A new DataFrame containing only rows with a loadable image, with a
        fresh 0..n-1 index.
    """
    # Iterate positionally over every row (including the last one) so the
    # check does not depend on the frame having a contiguous integer index.
    readable_positions = [
        position
        for position, img_name in enumerate(dataset['Image_Name'])
        # cv2.imread reports failure by returning None rather than raising.
        if cv2.imread(img_path + img_name + '.jpg') is not None
    ]
    return dataset.iloc[readable_positions].reset_index(drop=True)

In [6]:
# Source CSVs: [0] the cleaned recipe table, [1] the table carrying the
# modified ingredient strings.
path = ['kaggle_dataset/cleaned_dataset_1.csv', 'kaggle_dataset/ingredients_modified.csv']
recipes_raw, ingredients_raw = (pd.read_csv(csv_path) for csv_path in path)

# Take these columns from the second file; assignment aligns on the row index.
for column in ('cleaned', 'Instructions', 'Title'):
    recipes_raw[column] = ingredients_raw[column]

# Drop rows without a title and re-index so later positional lookups
# (e.g. `.loc[counter]` with a running counter) see a gap-free 0..n-1 index.
df = recipes_raw[recipes_raw.Title.notna()].reset_index(drop=True)

In [7]:
# Keep only the rows whose image file can be loaded; the result is re-indexed.
df = image_mapping_check(df)

In [8]:
# Number of missing values per column.
df.isnull().sum()

Title                  0
Instructions           0
Image_Name             0
Cleaned_Ingredients    0
dataset                0
id                     0
cleaned                0
dtype: int64

In [9]:
# Show any rows whose 'cleaned' value is missing.
df[df['cleaned'].isna()]

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,dataset,id,cleaned


In [10]:
# Show any rows whose 'cleaned' string is exactly one character long.
df[df.cleaned.str.len() == 1]

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,dataset,id,cleaned


In [None]:
def get_masked_tokens(ings):
    # NOTE(review): unfinished stub. `ings.split` only references the method
    # (it is never called), nothing is returned, and this cell was never
    # executed (execution count is None). The intended behaviour is not
    # visible here -- complete or remove this cell.
    all_ings = ings.split

In [9]:
def get_dataset(df):
    """Split ``df`` into shuffled train and validation frames (75% / 25%).

    The ``cleaned`` column is split off as the target, passed through
    ``train_test_split`` together with the remaining columns, and then
    concatenated back so that each returned frame holds every column.
    The split uses the module-level ``seed`` as ``random_state``.

    Args:
        df: DataFrame containing a ``cleaned`` column.

    Returns:
        Tuple ``(df, train_df, val_df)``: the unchanged input plus the two
        re-indexed splits.
    """
    def _rejoin(features, target):
        # Put the target column back next to its features and renumber rows.
        return pd.concat([features, target], axis=1).reset_index(drop=True)

    other_columns = df.loc[:, df.columns != 'cleaned']
    cleaned_column = df['cleaned']
    x_train, x_val, y_train, y_val = train_test_split(
        other_columns,
        cleaned_column,
        test_size=0.25,
        random_state=seed,
        shuffle=True,
    )
    return df, _rejoin(x_train, y_train), _rejoin(x_val, y_val)

In [10]:
def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element bytes list.

    Args:
        value: An eager tensor (unpacked via ``.numpy()``), a ``str``
            (encoded to bytes) or a value passed to ``BytesList`` as-is.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the single value.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()

    # Exact-type check: only plain `str` objects are encoded here.
    payload = str.encode(value) if type(value) == str else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))



def _int64_feature(value):
    """Wrap integer data in a ``tf.train.Feature`` holding an int64 list.

    Args:
        value: Either a single integer or a list/tuple of integers. A single
            integer is stored as a one-element list; a list/tuple is stored
            element by element instead of being nested in another list.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` contains the value(s).
    """
    if isinstance(value, (list, tuple)):
        values = list(value)
    else:
        values = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def serialize_example(img_name, img, cleaned, orig_ingredients, title, instructions):
    """Serialize one recipe record into a ``tf.train.Example`` byte string.

    Every argument is stored as a bytes feature via ``_bytes_feature``.

    Args:
        img_name: Image file stem.
        img: Encoded image bytes.
        cleaned: Value stored under ``cleaned_ingredients``.
        orig_ingredients: Value stored under ``orig_ingredients``.
        title: Value stored under ``title``.
        instructions: Value stored under ``instructions``.

    Returns:
        The serialized ``tf.train.Example`` as bytes.
    """
    raw_fields = (
        ('image_name', img_name),
        ('image', img),
        ('cleaned_ingredients', cleaned),
        ('orig_ingredients', orig_ingredients),
        ('title', title),
        ('instructions', instructions),
    )
    feature_map = {key: _bytes_feature(field) for key, field in raw_fields}
    features = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=features).SerializeToString()

In [11]:
def _load_encoded_jpeg(img_name):
    """Load ``img_name`` from ``img_path``, resize it and return JPEG bytes."""
    full_path = img_path + img_name + '.jpg'
    img = cv2.imread(full_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError('Could not read image: %s' % full_path)
    # NOTE(review): cv2.imread returns BGR and cv2.imencode also expects BGR,
    # so this channel swap presumably stores red/blue exchanged in the JPEG.
    # Kept unchanged so newly written records match existing ones -- confirm.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    img = cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1]
    # tobytes() is the supported replacement for the deprecated tostring().
    return encoded.tobytes()


def create_recored(dataset, title):
    """Write ``dataset`` to TFRecord shards of at most ``SIZE`` rows each.

    Shards are written to the working directory and named
    ``<prefix><seed>-<shard index>-<rows in shard>.tfrec`` where the prefix is
    ``train`` when ``title == "train"`` and ``val`` for any other title.
    ``SIZE``, ``seed``, ``img_path`` and ``img_size`` are module-level values.

    Args:
        dataset: DataFrame with ``Image_Name``, ``cleaned``,
            ``Cleaned_Ingredients``, ``Title`` and ``Instructions`` columns.
        title: ``"train"`` selects the train prefix; anything else uses ``val``.

    Raises:
        OSError: If an image referenced by a row cannot be read.
    """
    prefix = 'train' if title == "train" else 'val'
    n_rows = dataset.shape[0]
    # Ceiling division: one extra shard for a partial remainder.
    CT = n_rows//SIZE + int(n_rows%SIZE!=0)
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        CT2 = min(SIZE, n_rows-j*SIZE)
        shard_name = '%s%.4i-%.2i-%i.tfrec'%(prefix,seed,j,CT2)
        with tf.io.TFRecordWriter(shard_name) as writer:
            for k in range(CT2):
                # Positional lookup, so a non-contiguous index cannot break it.
                row = dataset.iloc[j*SIZE+k]
                img_name = row['Image_Name']
                example = serialize_example(
                    str.encode(img_name),
                    _load_encoded_jpeg(img_name),
                    row['cleaned'],
                    row['Cleaned_Ingredients'],
                    row['Title'],
                    row['Instructions']
                )
                writer.write(example)
                if k%100==0: print(k,', ',end='')

In [12]:
def get_record(df, seed):
    """Seed the RNGs, split ``df``, save it as CSV and write both TFRecord sets.

    Args:
        df: Full recipe DataFrame to split and export.
        seed: Seed passed to ``seed_everything``. The split itself and the
            shard file names are produced by ``get_dataset`` and
            ``create_recored``, which are not handed this argument.
    """
    seed_everything(seed)
    full_df, train_split, val_split = get_dataset(df)
    # Persist the full (unsplit) table next to the shards.
    full_df.to_csv('food_ingredients_dataset.csv', index=False)
    for split_name, split_df in (("train", train_split), ("val", val_split)):
        create_recored(split_df, title=split_name)

In [13]:
# Run the export: seed, split, save the CSV and write train/val TFRecords.
get_record(df, seed)


Writing TFRecord 0 of 21...
0 , 

  from ipykernel import kernelapp as app


100 , 200 , 300 , 400 , 
Writing TFRecord 1 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 2 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 3 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 4 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 5 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 6 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 7 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 8 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 9 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 10 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 11 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 12 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 13 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 14 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 15 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 16 of 21...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 17 of 21...
0 , 100 , 200 , 300 , 400 ,



100 , 200 , 300 , 400 , 
Writing TFRecord 1 of 7...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 2 of 7...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 3 of 7...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 4 of 7...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 5 of 7...
0 , 100 , 200 , 300 , 400 , 
Writing TFRecord 6 of 7...
0 , 100 , 200 , 300 , 

In [14]:
# Number of missing values per column, checked again after the export.
df.isna().sum()

Title                  0
Instructions           0
Image_Name             0
Cleaned_Ingredients    0
dataset                0
id                     0
cleaned                0
dtype: int64