In [16]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import json
import re
import nltk
import zipfile

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

from deep_translator import GoogleTranslator

In [4]:
from openai import OpenAI
from dotenv import load_dotenv
from collections import defaultdict

import os
import re
import csv

In [5]:
clf = pickle.load(open("SVC-whats-cooking-trial-final.pickle.dat", "rb"))

In [6]:
# Load API key from .env file
load_dotenv()

key = os.getenv('OPENAI_API_KEY')
if key is None:
    raise ValueError("The OPENAI_API_KEY environment variable is not set \
                     or .env file is missing.")

client = OpenAI(
    api_key=key
)

In [50]:
def call_openai_api(user_prompt, system_prompt, n_runs=1, model="gpt-4-turbo-2024-04-09"):
    responses = []
    for run_number in range(1, n_runs + 1):
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        response_content = completion.choices[0].message.content
        print(response_content)
        print("========================================next call")
        responses.append(response_content)
    return responses

In [8]:
def load_prompt_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def generate_user_prompt(file_path):
    return load_prompt_from_file(file_path)

def generate_system_prompt(file_path):
    return load_prompt_from_file(file_path)


In [7]:
user_prompt_path = 'user_prompt.txt'
system_prompt_path = 'system_prompt.txt'

In [8]:
user_prompt = generate_user_prompt(user_prompt_path)
system_prompt = generate_system_prompt(system_prompt_path)

In [9]:
responses = call_openai_api(user_prompt, system_prompt)
responses

["[{'name': 'quinoa salad',\n  'ingredients': ['quinoa', 'cucumbers', 'tomatoes', 'red onion', 'parsley', 'lemon juice', 'olive oil', 'feta cheese']},\n {'name': 'kale smoothie',\n  'ingredients': ['kale', 'banana', 'apple', 'chia seeds', 'almond milk', 'honey']},\n {'name': 'chickpea curry',\n  'ingredients': ['chickpeas', 'coconut milk', 'onion', 'tomatoes', 'garlic', 'ginger', 'curry powder', 'spinach']},\n {'name': 'vegetable stir-fry',\n  'ingredients': ['broccoli', 'bell peppers', 'carrots', 'snap peas', 'soy sauce', 'tofu', 'sesame oil', 'garlic']},\n {'name': 'avocado toast',\n  'ingredients': ['whole grain bread', 'avocado', 'lemon juice', 'salt', 'pepper', 'radishes']},\n {'name': 'lentil soup',\n  'ingredients': ['lentils', 'carrots', 'celery', 'onion', 'tomato paste', 'garlic', 'vegetable broth', 'spinach']},\n {'name': 'sweet potato tacos',\n  'ingredients': ['sweet potatoes', 'black beans', 'corn tortillas', 'avocado', 'lime', 'cilantro']},\n {'name': 'overnight oats',\n 

In [33]:
def process_gpt_responses (gpt_responses):
    # Removing enclosing list and replacing single quotes with double quotes for valid JSON
    cleaned_data = gpt_responses[0].replace("'", '"')
    
    # Replacing newline and tabs if any (for cleaner JSON parsing)
    cleaned_data = cleaned_data.replace("\n", "").replace("\t", "")
    
    # Load the string as JSON
    try:
        recipes = json.loads(cleaned_data)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        recipes = []
    
    # Transforming to the required format
    processed_data = [{'name': recipe['name'], 'ingredients': recipe['ingredients']} for recipe in recipes]
    
    # Printing or saving the processed data
    # print(json.dumps(processed_data, indent=2))
    return processed_data

In [10]:
def preprocess_df(df):
    
    def process_string(x):
        x = [" ".join([WordNetLemmatizer().lemmatize(q) for q in p.split()]) for p in x] #Lemmatization
        x = list(map(lambda x: re.sub(r'\(.*oz.\)|crushed|crumbles|ground|minced|powder|chopped|sliced','', x), x))
        x = list(map(lambda x: re.sub("[^a-zA-Z]", " ", x), x))   # To remove everything except a-z and A-Z
        x = " ".join(x)                                 # To make list element a string element 
        x = x.lower()
        return x
    
    #df = df.drop('id',axis=1)
    df['ingredients'] = df['ingredients'].apply(process_string)
    
    return df

In [34]:
def process_pipeline(gpt_responses, vectorizer, model):
    processed_responses = process_gpt_responses(gpt_responses)
    df = pd.DataFrame(processed_responses)
    test_df = preprocess_df(df)
    test = test_df['ingredients']
    test_transformed = vectorizer.transform(test)
    prediction = model.predict(test_transformed)
    return prediction

In [12]:
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

In [15]:
y_pred = process_pipeline(responses, vectorizer, clf)
y_pred

[
  {
    "name": "quinoa salad",
    "ingredients": [
      "quinoa",
      "cucumbers",
      "tomatoes",
      "red onion",
      "parsley",
      "lemon juice",
      "olive oil",
      "feta cheese"
    ]
  },
  {
    "name": "kale smoothie",
    "ingredients": [
      "kale",
      "banana",
      "apple",
      "chia seeds",
      "almond milk",
      "honey"
    ]
  },
  {
    "name": "chickpea curry",
    "ingredients": [
      "chickpeas",
      "coconut milk",
      "onion",
      "tomatoes",
      "garlic",
      "ginger",
      "curry powder",
      "spinach"
    ]
  },
  {
    "name": "vegetable stir-fry",
    "ingredients": [
      "broccoli",
      "bell peppers",
      "carrots",
      "snap peas",
      "soy sauce",
      "tofu",
      "sesame oil",
      "garlic"
    ]
  },
  {
    "name": "avocado toast",
    "ingredients": [
      "whole grain bread",
      "avocado",
      "lemon juice",
      "salt",
      "pepper",
      "radishes"
    ]
  },
  {
    "name": "le

array(['greek', 'brazilian', 'indian', 'chinese', 'mexican', 'italian',
       'mexican', 'british', 'chinese', 'russian'], dtype=object)

### Chinese

In [13]:
user_prompt_chn = generate_user_prompt('user_prompt_chn.txt')
system_prompt_chn = generate_system_prompt('system_prompt_chn.txt')

In [22]:
responses_chn = call_openai_api(user_prompt_chn, system_prompt_chn)
responses_chn

["[{'名字': '藜麦色拉',\n  '成分': ['藜麦', '樱桃番茄', '黄瓜', '红洋葱', '橄榄油', '柠檬汁', '牛至', '盐']},\n {'名字': '鳄梨玉米汤',\n  '成分': ['鳄梨', '玉米', '鸡肉汤', '青椒', '洋葱', '蒜', '香菜', '辣椒粉', '莳萝']},\n {'名字': '素食豆腐炒饭',\n  '成分': ['豆腐', '米饭', '胡萝卜', '青豆', '酱油', '芝麻油', '大蒜', '生姜']},\n {'名字': '地中海沙拉',\n  '成分': ['番茄', '黄瓜', '红洋葱', '黑橄榄', '羊奶酪', '橄榄油', '醋', '牛至']},\n {'名字': '南瓜汤',\n  '成分': ['南瓜', '洋葱', '鸡肉汤', '忌廉', '蜂蜜', '肉豆蔻', '黄油', '盐', '胡椒']},\n {'名字': '鲑鱼蔬菜包',\n  '成分': ['鲑鱼', '胡萝卜', '大白菜', '生姜', '蒜', '香油', '酱油', '白酒', '糖']},\n {'名字': '红菜头蔬菜汁',\n  '成分': ['红菜头', '苹果', '胡萝卜', '柠檬', '姜']},\n {'名字': '番薯椰奶饭',\n  '成分': ['番薯', '椰奶', '茉莉香米', '椰子糖', '盐']},\n {'名字': '菠菜豆腐汤',\n  '成分': ['豆腐', '菠菜', '蘑菇', '清汤', '盐', '蒜', '香油']},\n {'名字': '烤蔬菜与鹰嘴豆',\n  '成分': ['红椒', '黄椒', '紫洋葱', '鹰嘴豆', '橄榄油', '百里香', '盐', '胡椒']}]"]

In [27]:
def translate_to_eng(responses):
    
    translator = GoogleTranslator(source='auto', target='en')
    translated_texts = []

    for response in responses:
        # Ensure the text does not exceed the 5000 character limit
        if len(response) <= 5000:
            try:
                # Translate the text and convert to lowercase
                translated = translator.translate(response).lower()
                translated_texts.append(translated)
            except Exception as e:
                print(f"Failed to translate text due to: {e}")
                translated_texts.append("Translation failed")
        else:
            print("Text too long to translate:", response)
            translated_texts.append("Text too long and was not translated")

    return translated_texts

In [28]:
translated_responses = translate_to_eng(responses_chn)
translated_responses

["[{'name': 'quinoa salad',\n  'ingredients': ['quinoa', 'cherry tomatoes', 'cucumber', 'red onion', 'olive oil', 'lemon juice', 'oregano', 'salt']},\n {'name': 'avocado corn soup',\n  'ingredients': ['avocado', 'corn', 'chicken soup', 'green pepper', 'onion', 'garlic', 'cilantro', 'chili powder', 'dill']},\n {'name': 'vegetarian tofu fried rice',\n  'ingredients': ['tofu', 'rice', 'carrot', 'green beans', 'soy sauce', 'sesame oil', 'garlic', 'ginger']},\n {'name': 'mediterranean salad',\n  'ingredients': ['tomato', 'cucumber', 'red onion', 'black olives', 'feta cheese', 'olive oil', 'vinegar', 'oregano']},\n {'name': 'pumpkin soup',\n  'ingredients': ['pumpkin', 'onion', 'chicken soup', 'cream', 'honey', 'nutmeg', 'butter', 'salt', 'pepper']},\n {'name': 'salmon vegetable bun',\n  'ingredients': ['salmon', 'carrot', 'chinese cabbage', 'ginger', 'garlic', 'sesame oil', 'soy sauce', 'liquor', 'sugar']},\n {'name': 'beetroot vegetable juice',\n  'ingredients': ['beetroot', 'apple', 'carr

In [35]:
y_pred_chn = process_pipeline(translated_responses, vectorizer, clf)
y_pred_chn

array(['greek', 'mexican', 'chinese', 'greek', 'southern_us', 'chinese',
       'french', 'filipino', 'japanese', 'southern_us'], dtype=object)

### Brazilian (Portuguese)

In [36]:
user_prompt_port = generate_user_prompt('user_prompt_port.txt')
system_prompt_port = generate_system_prompt('system_prompt_port.txt')

In [37]:
responses_port = call_openai_api(user_prompt_port, system_prompt_port)
responses_port

["[{'nome': 'salada grega',\n  'ingredientes': ['tomate',\n   'pepino',\n   'cebola roxa',\n   'queijo feta',\n   'azeitonas kalamata',\n   'azeite de oliva',\n   'vinagre de vinho tinto',\n   'orégano']},\n {'nome': 'smoothie de morango e banana',\n  'ingredientes': ['morangos',\n   'banana',\n   'iogurte grego',\n   'mel',\n   'leite de amêndoa']},\n {'nome': 'wraps de alface com frango',\n  'ingredientes': ['peito de frango',\n   'alface romana',\n   'cenoura',\n   'pepino',\n   'pimentão',\n   'molho hoisin']},\n {'nome': 'sopa de lentilha',\n  'ingredientes': ['lentilhas',\n   'cenoura',\n   'cebola',\n   'tomate',\n   'alho',\n   'caldo de legumes',\n   'cominho',\n   'páprica',\n   'azeite de oliva']},\n {'nome': 'quinoa com legumes',\n  'ingredientes': ['quinoa',\n   'pimentão',\n   'abobrinha',\n   'cebola roxa',\n   'alho',\n   'caldo de legumes',\n   'salsinha']},\n {'nome': 'salada de quinoa com abacate',\n  'ingredientes': ['quinoa',\n   'abacate',\n   'tomate',\n   'milho

In [38]:
translated_responses_port = translate_to_eng(responses_port)
translated_responses_port

["[{'name': 'greek salad',\n  'ingredients': ['tomato',\n   'cucumber',\n   'purple onion',\n   'feta cheese',\n   'kalamata olives',\n   'olive oil',\n   'red wine vinegar',\n   'oregano']},\n {'name': 'strawberry banana smoothie',\n  'ingredients': ['strawberries',\n   'banana',\n   'greek yogurt',\n   'honey',\n   'almond milk']},\n {'name': 'chicken lettuce wraps',\n  'ingredients': ['chicken breast',\n   'roman lettuce',\n   'carrot',\n   'cucumber',\n   'pepper',\n   'hoisin sauce']},\n {'name': 'lentil soup',\n  'ingredients': ['lentils',\n   'carrot',\n   'onion',\n   'tomato',\n   'garlic',\n   'vegetable broth',\n   'cumin',\n   'paprika',\n   'olive oil']},\n {'name': 'quinoa with vegetables',\n  'ingredients': ['quinoa',\n   'pepper',\n   'zucchini',\n   'purple onion',\n   'garlic',\n   'vegetable broth',\n   'parsley']},\n {'name': 'avocado quinoa salad',\n  'ingredients': ['quinoa',\n   'avocado',\n   'tomato',\n   'corn',\n   'black bean',\n   'coriander',\n   'lemon',\

In [40]:
y_pred_port = process_pipeline(translated_responses_port, vectorizer, clf)
y_pred_port

array(['greek', 'greek', 'chinese', 'moroccan', 'italian', 'mexican',
       'french', 'greek', 'southern_us', 'mexican'], dtype=object)

### Batching

In [46]:
def process_gpt_responses(gpt_responses):
    all_processed_data = []
    for response in gpt_responses:
        # Clean and parse each response
        cleaned_data = response.replace("'", '"').replace("\n", "").replace("\t", "")
        try:
            recipes = json.loads(cleaned_data)
            processed_data = [{'name': recipe['name'], 'ingredients': recipe['ingredients']} for recipe in recipes]
            all_processed_data.extend(processed_data)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
    
    return all_processed_data


In [43]:
def process_pipeline(gpt_responses, vectorizer, model):
    # Process all GPT responses and gather all recipes into a single DataFrame
    processed_responses = process_gpt_responses(gpt_responses)
    df = pd.DataFrame(processed_responses)
    test_df = preprocess_df(df)
    test = test_df['ingredients']
    test_transformed = vectorizer.transform(test)
    prediction = model.predict(test_transformed)
    return prediction


In [58]:
def save_responses_to_file(responses, filename="responses.json"):

    with open(filename, "w", encoding='utf-8') as f:
        json.dump(responses, f, ensure_ascii=False, indent=4)

In [59]:
def load_responses_from_file(filename="responses.json"):

    with open(filename, "r", encoding='utf-8') as f:
        return json.load(f)

#### Chinese

In [51]:
responses_chn_2runs = call_openai_api(user_prompt_chn, system_prompt_chn, n_runs=2)

[{'名字': '烤鲑鱼与蔬菜',
  '成分': ['鲑鱼', '橄榄油', '盐', '黑胡椒', '柠檬汁', '西蓝花', '胡萝卜', '椰菜花']},
 {'名字': '番茄基诺亚沙拉',
  '成分': ['基诺亚', '番茄', '黄瓜', '红洋葱', '橄榄油', '柠檬汁', '薄荷', '含盐']},
 {'名字': '甜菜根和羊奶酪沙拉',
  '成分': ['甜菜根', '羊奶酪', '菠菜', '核桃', '橄榄油', '苹果醋']},
 {'名字': '鸡胸肉烤胚芽',
  '成分': ['鸡胸肉', '麦胚芽', '椰菜', '橄榄油', '盐', '黑胡椒']},
 {'名字': '蒸豆腐与蘑菇',
  '成分': ['豆腐', '新鲜蘑菇', '葱', '酱油', '芝麻油', '姜']},
 {'名字': '土豆泥',
  '成分': ['土豆', '橄榄油', '蒜', '盐', '黑胡椒', '牛奶']},
 {'名字': '素食黄瓜卷',
  '成分': ['黄瓜', '胡萝卜', '甜椒', '紫甘蓝', '鳄梨', '姜汁']},
 {'名字': '西班牙番茄面',
  '成分': ['西红柿', '全麦意大利面', '大蒜', '橄榄油', '罗勒', '帕马森芝士']},
 {'名字': '奇亚籽布丁',
  '成分': ['奇亚籽', '椰奶', '枫糖浆', '香草精', '混合浆果']},
 {'名字': '绿色蔬菜汤',
  '成分': ['菠菜', '西蓝花', '洋葱', '大蒜', '鸡肉汤', '椰奶', '盐', '黑胡椒']}]

[{'名字': '鳄梨鸡肉沙拉',
  '成分': ['鳄梨', '熟鸡胸肉', '橄榄油', '柠檬汁', '罗勒叶', '松子', '樱桃番茄', '黑橄榄', '盐', '胡椒粉']},
 {'名字': '藜麦甜菜根汉堡',
  '成分': ['藜麦', '甜菜根', '全麦面包屑', '鸡蛋', '洋葱', '大蒜', '莳萝', '盐', '黑胡椒粉', '油']},
 {'名字': '土豆泥燕麦粥',
  '成分': ['土豆', '燕麦', '牛奶', '盐', '黑胡椒粉', '黄油', '香葱']},
 {'名字': '西葫芦意面',
  '成分'

In [60]:
save_responses_to_file(responses_chn_5runs, "responses_chn_2runs.json")

In [64]:
loaded_responses = load_responses_from_file("responses_chn_2runs.json")

In [65]:
translated_responses = translate_to_eng(loaded_responses)
translated_responses

["[{'name': 'grilled salmon and vegetables',\n  'ingredients': ['salmon', 'olive oil', 'lemon juice', 'garlic', 'salt', 'black pepper', 'broccoli', 'red pepper', 'purple onion']},\n {'name': 'beetroot salad',\n  'ingredients': ['beetroot', 'mixed leafy greens', 'feta cheese', 'walnuts', 'olive oil', 'red wine vinegar', 'honey', 'salt', 'black pepper']},\n {'name': 'quinoa and black bean soup',\n  'ingredients': ['quinoa', 'black beans', 'tomato', 'onion', 'garlic', 'vegetable stock', 'paprika', 'cumin', 'coriander']},\n {'name': 'chicken breast stir-fried with vegetables',\n  'ingredients': ['chicken breast', 'garlic', 'broccoli', 'carrot', 'red pepper', 'olive oil', 'soy sauce', 'ginger', 'black pepper']},\n {'name': 'homemade yogurt',\n  'ingredients': ['milk', 'active yogurt bacteria']},\n {'name': 'mung bean salad',\n  'ingredients': ['mung bean', 'cucumber', 'tomato', 'onion', 'olive oil', 'apple cider vinegar', 'honey', 'mustard', 'salt', 'black pepper']},\n {'name': 'oatmeal ban

In [66]:
y_pred_chn = process_pipeline(translated_responses, vectorizer, clf)
y_pred_chn

Error decoding JSON: Expecting ',' delimiter: line 1 column 1161 (char 1160)


array(['italian', 'greek', 'mexican', 'chinese', 'indian', 'southern_us',
       'southern_us', 'indian', 'italian', 'mexican', 'greek', 'jamaican',
       'mexican', 'italian', 'greek', 'italian', 'indian', 'chinese',
       'italian', 'greek', 'greek', 'korean', 'greek', 'italian',
       'mexican', 'chinese', 'mexican', 'italian', 'chinese', 'mexican',
       'greek', 'brazilian', 'japanese', 'mexican', 'italian', 'mexican',
       'greek', 'southern_us', 'chinese', 'japanese'], dtype=object)