In [None]:
#pip install reportlab
import json
import glob
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import itertools
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
import random

In [None]:
# Load the restaurant records. JSON files are UTF-8, so the encoding is stated
# explicitly instead of relying on the platform's default locale encoding.
with open('dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview only the first entry: printing the whole dataset floods the notebook output.
print(data[:1])

In [None]:
len(data)  # number of top-level entries in the loaded JSON

In [None]:
# Inspect the JSON structure: label of the first cuisine listed for the first entry
data[0]['cuisines'][0]['label']

In [None]:
# Collect the image ids attached to the first restaurant entry
one_restaurant_info = data[0]['more_details']['full_images']
one_restaurant_images = []
for image_entry in one_restaurant_info:
    one_restaurant_images.append(image_entry['image_id'])
one_restaurant_images

In [None]:
# Do the same for all restaurants. For every restaurant that has at least one image, build:
#   res_image_dic: restaurant identifier -> one image id
#   cuisine_dic:   restaurant identifier -> label of its first listed cuisine

res_image_dic={}
cuisine_dic={}
for res in data:
    res_id = res['identifier']
    one_restaurant_info = res['more_details']['full_images']
    one_restaurant_images = [i['image_id'] for i in one_restaurant_info]
    if not one_restaurant_images:
        # A restaurant without images must not inherit `selected` from the previous
        # iteration (that would attach another restaurant's picture to this cuisine),
        # so it is left out of both dictionaries.
        continue

    # Keep a single image per restaurant, otherwise there are too many images.
    # The last listed image is used, which keeps the selection deterministic.
    selected = one_restaurant_images[-1]
    res_image_dic[res_id] = selected
    cuisine_dic[res_id] = res['cuisines'][0]['label']

In [None]:
# One row per restaurant: its identifier and its cuisine label
df_cuisine = pd.DataFrame.from_dict(cuisine_dic, orient='index').reset_index()
df_cuisine.columns = ['res_id', 'cuisine']

# Restaurants per cuisine, the 22 most frequent first;
# the cuisines to work with are picked from this table
cuisine_counts = df_cuisine.groupby('cuisine').count()
cuisine_counts.sort_values(by='res_id', ascending=False).head(22)

In [None]:
# Names of the 22 most frequent cuisines. The counts are per restaurant, so the
# pictures behind them can show food as well as interiors.
# 22 rather than 20 so that Indian food is included.
cuisine_sizes = df_cuisine.groupby('cuisine').count()
top_22 = list(cuisine_sizes.sort_values(by='res_id', ascending=False).iloc[:22].index)
top_22


In [None]:
# The 8 cuisines kept for the classifier.
# Modern French, Cantonese and Classic French are deliberately left out.
country = [
    'Japanese',
    'Italian',
    'French',
    'Mediterranean Cuisine',
    'Modern British',
    'Thai',
    'Chinese',
    'Indian',
]

In [None]:
# Identifiers of the restaurants whose cuisine is one of the selected ones
is_selected_cuisine = df_cuisine['cuisine'].isin(country)
extract_relevant_id = list(df_cuisine.loc[is_selected_cuisine, 'res_id'])

In [None]:
# Build the filtered frame before displaying it: on a fresh kernel the name would
# otherwise be undefined at this point and the cell would raise NameError.
df_cuisine_selected = df_cuisine[df_cuisine['res_id'].isin(extract_relevant_id)]
df_cuisine_selected

In [None]:
# Rows of df_cuisine that belong to the selected cuisines
keep_row = df_cuisine['res_id'].isin(extract_relevant_id)
df_cuisine_selected = df_cuisine[keep_row]

In [None]:
# Keep at most the first 20 restaurants of each selected cuisine, in the order of `country`.
# The per-cuisine slices are collected first and concatenated once: growing a DataFrame
# with concat inside the loop is quadratic, and seeding it with an empty DataFrame
# triggers a deprecation warning in recent pandas versions.
per_cuisine_top20 = [
    df_cuisine_selected[df_cuisine_selected['cuisine'] == cuisine].iloc[:20, :]
    for cuisine in country
]
df_cuisine_selected_20 = pd.concat(per_cuisine_top20, axis=0)

# Restaurant identifiers of the sample, grouped by cuisine
extract_relevant_id_filter = list(df_cuisine_selected_20.res_id)

In [None]:
# Restrict the image lookup to the sampled restaurants, keeping the sample's order
res_image_dic = {
    res_id: res_image_dic[res_id]
    for res_id in extract_relevant_id_filter
    if res_id in res_image_dic
}

In [None]:
# Image ids of all sampled restaurants (image_names is an independent copy)
nested_list = [*res_image_dic.values()]
image_names = nested_list[:]

In [None]:
len(image_names) # an acceptable number of images to work with

In [None]:
# Sanity check: open the first sampled image from disk and display it
folder_path = 'images'
image_path = '/'.join([folder_path, image_names[0] + '.jpg'])
img = Image.open(image_path)
img

In [None]:
# Read every sampled image (restaurants of the relevant cuisines) into
# all_images: image id -> pixel array. Files that cannot be read are skipped,
# so all_images may hold fewer entries than image_names.
all_images = {}
folder_path = 'images'
target_size = (256, 256)  # images do not all have the same shape, so resize them

for name in image_names:
    file_name = name + '.jpg'
    image_path = os.path.join(folder_path, file_name)
    try:
        with Image.open(image_path) as img:
            all_images[name] = np.array(img.resize(target_size))
    except Exception as err:
        # Best effort: a missing or corrupt file should not stop the run. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and the reason for the failure is reported instead of being hidden.
        print(f"Error opening file '{file_name}', skipping... ({err})")

In [None]:
# Keep only the images that were loaded and have the expected 256x256x3 shape,
# together with their ids (same order in both lists).
test_image_list = []
test_image_names = []
for name in image_names:
    # Images that failed to open were skipped when all_images was filled, so they
    # have no entry there; indexing all_images[name] would raise KeyError for them.
    image_array = all_images.get(name)
    if image_array is not None and image_array.shape == (256, 256, 3):
        test_image_list.append(image_array)
        test_image_names.append(name)

In [None]:
# Stack the kept images into one array; the first axis indexes the images
test = np.asarray(test_image_list)
test.shape

# Filter only food images using our model

In [None]:
import keras

In [None]:
# Load the saved food/interior classifier from its .h5 file
food_interior_classifier = keras.models.load_model("food_interior_classifier.h5")
food_interior_classifier

In [None]:
# Score the images with the classifier
model_predictions = food_interior_classifier.predict(test)

# Decision threshold; tune it so that what ends up labelled as food really is food
threshold = 0.1

# Scores above the threshold become label 1, all others label 0
predicted_labels = np.greater(model_predictions, threshold).astype(int)

In [None]:
# need to find a better threshold, many misclassifications:
# https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/

In [None]:
# address class imbalance: we might want to query more images from less prevalent cuisines

In [None]:
# Meaning of the two predicted labels
label_dict = {0: 'food', 1: 'exterior'}

# Predicted label of every image, in the order of `test`
predicted = [label_row[0] for label_row in predicted_labels]

# Images predicted as food (label 0) and their ids, in the same order
food_positions = [pos for pos, label in enumerate(predicted) if label == 0]
food_only = [test[pos] for pos in food_positions]
food_only_id = [test_image_names[pos] for pos in food_positions]

In [None]:
# Show the last image that was predicted as food
plt.imshow(food_only[-1], cmap='gray')
plt.show()

In [None]:
round(1- sum(predicted)/len(predicted),2)
# fraction (between 0 and 1) of the images that were predicted as food, i.e. label 0

In [None]:
len(predicted)  # number of images that were classified

In [None]:
len(food_only)  # number of images predicted as food

In [None]:
def save_image(filename, images=None):
    """Write one titled page per image to a multi-page PDF for manual inspection.

    Parameters
    ----------
    filename : str
        Path of the PDF file to create.
    images : sequence of image arrays, optional
        Images to render, one per page. Defaults to the notebook-level
        ``food_only`` list.

    Each page is titled with the image's position in ``images``, so a page can be
    traced back to its index in the list.
    """
    if images is None:
        images = food_only
    # The context manager closes the PDF even if rendering fails part-way through.
    with PdfPages(filename) as pdf:
        for ind, item in enumerate(images):
            fig = plt.figure()
            plt.imshow(item, cmap='gray')
            # The title has to be set before the page is saved, otherwise it never
            # reaches the PDF.
            plt.title(ind)
            pdf.savefig(fig)
            # Close the figure so open figures do not pile up in memory.
            plt.close(fig)

In [None]:
# Export the images predicted as food to a PDF so they can be checked by eye
filename = "cuisine_food_predicted.pdf"  
save_image(filename) 

In [None]:
# Manual check of the wrong classifications: positions in food_only that are
# excluded below because they are not food (exteriors predicted as food).
# These positions depend on the exact content and order of food_only.
exterior_manual = [6, 18, 22, 27, 29, 32, 35, 38, 42, 49, 50, 57, 60, 61, 66, 71, 79, 80, 82, 83, 89, 92, 99, 118]
misclassified = set(exterior_manual)
real_food_index = [pos for pos in range(len(food_only)) if pos not in misclassified]
len(real_food_index)

In [None]:
# Keep the manually confirmed food images and their ids (same order in both lists)
real_food_image = []
real_food_id = []
for pos in real_food_index:
    real_food_image.append(food_only[pos])
    real_food_id.append(food_only_id[pos])

# Display the remaining images for a final visual check
for picture in real_food_image:
    plt.imshow(picture, cmap='gray')
    plt.show()

In [None]:
# Restaurants whose sampled image survived the food filter: restaurant id -> image id.
# Ids without an entry in res_image_dic are skipped (the same guard that is used when
# res_image_dic is restricted to the sample) instead of raising KeyError, and a set
# makes the membership test constant-time.
real_food_id_set = set(real_food_id)
final_res = {
    res_id: res_image_dic[res_id]
    for res_id in extract_relevant_id_filter
    if res_id in res_image_dic and res_image_dic[res_id] in real_food_id_set
}
final_res_list = list(final_res.keys())

# Number of remaining restaurants per cuisine
df_cuisine_selected[df_cuisine_selected['res_id'].isin(final_res_list)].groupby('cuisine').count()

In [None]:
# Cuisine label of every image in real_food_image, in exactly the same order.
# real_food_id follows the order of the sampled images (grouped by cuisine), whereas
# filtering df_cuisine_selected returns rows in the original dataset order, which would
# pair images with the wrong labels. The labels are therefore looked up image by image.
image_to_res = {image_id: res_id for res_id, image_id in res_image_dic.items()}
final_labels = [cuisine_dic[image_to_res[image_id]] for image_id in real_food_id]
len(final_labels)

In [None]:
len(real_food_image)  # number of confirmed food images; compare with len(final_labels)

In [None]:
# One-hot encode the cuisine labels: one column per distinct cuisine
class_cat = pd.DataFrame({'label': final_labels})
dummies = pd.get_dummies(class_cat['label'])
dummies.shape

# Modelling

In [None]:
# Model inputs (stacked images) and one-hot targets
X = np.asarray(real_food_image)
y = dummies.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the images for evaluation; the split is stratified on the
# targets and seeded so that it is reproducible
split = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.applications import VGG16

# Load VGG16 with ImageNet weights and without its classification head
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

# Freeze pre-trained layers (compare with no freezing)
for layer in base_model.layers:
    layer.trainable = False

# One output unit per column of the one-hot target, so the head always matches the
# number of cuisines that are actually present in the data
num_classes = y.shape[1]

# Define your own classification layers
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(128, activation='relu'))
# Every image has exactly one cuisine, so the outputs must form a probability
# distribution over the classes: softmax, which is what categorical cross-entropy expects
# (sigmoid scores each class independently and is meant for multi-label targets)
model.add(Dense(num_classes, activation='softmax'))

# NOTE(review): X holds the pixel arrays as loaded; the ImageNet weights presumably
# expect inputs passed through keras.applications.vgg16.preprocess_input - confirm.

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()  # print an overview of the model's layers

In [None]:
# Fit the model for 10 epochs; the held-out split is passed as validation data
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# Accuracy on the held-out split (second entry of the evaluation result)
score = model.evaluate(X_test, y_test)
test_accuracy = score[1]
print('Test accuracy:', test_accuracy)

In [None]:
# unlike the previous model, just getting 20 images per cuisine is certainly not enough
# next step: get more images per cuisine