In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import json
import re
import nltk
import zipfile

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

from deep_translator import GoogleTranslator

In [2]:
from openai import OpenAI
from dotenv import load_dotenv
from collections import defaultdict

import os
import re
import csv

In [4]:
# Load a pickled classifier into `clf` (file name suggests a logistic-regression
# model -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open('best_logit_model.pkl', 'rb') as file:
    clf = pickle.load(file)

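Security note: unpickling a file can execute arbitrary code, so only load model files from a trusted source. See scikit-learn's notes on the security and maintainability limitations of pickle-based persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations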


In [None]:
# NOTE(review): this rebinds `clf`, replacing the model loaded from
# 'best_logit_model.pkl' in the earlier cell -- run only the cell for the
# classifier you intend to evaluate.
# pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open("SVC-whats-cooking-trial-final.pickle.dat", "rb") as model_file:
    clf = pickle.load(model_file)

In [None]:
# Load environment variables from a .env file (if one exists), then read the API key.
load_dotenv()

key = os.getenv('OPENAI_API_KEY')
# An empty value is as unusable as a missing one, so fail here with a clear message
# instead of at the first API request.
if not key:
    # Adjacent string literals keep the message on one line; the previous
    # backslash continuation embedded the source indentation in the message.
    raise ValueError(
        "The OPENAI_API_KEY environment variable is not set "
        "or .env file is missing."
    )

# Shared client used by call_openai_api.
client = OpenAI(api_key=key)

In [None]:
def call_openai_api(user_prompt, system_prompt, n_runs=1, model="gpt-4-turbo-2024-04-09"):
    """Send the same system/user prompt pair to the chat API `n_runs` times.

    Uses the module-level `client`. The text of every reply is printed,
    followed by a separator line, and collected.

    Args:
        user_prompt: Content of the "user" message.
        system_prompt: Content of the "system" message.
        n_runs: Number of independent API calls to make.
        model: Model identifier passed to the API.

    Returns:
        A list with the message content of the first choice of each call,
        in call order (empty when `n_runs` is less than 1).
    """

    def _single_reply():
        # One request; only the first choice of the completion is used.
        result = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return result.choices[0].message.content

    collected = []
    for _ in range(n_runs):
        reply_text = _single_reply()
        print(reply_text)
        print("========================================next call")
        collected.append(reply_text)
    return collected

In [None]:
def load_prompt_from_file(file_path):
    """Return the entire contents of the UTF-8 text file at `file_path`."""
    with open(file_path, mode='r', encoding='utf-8') as prompt_file:
        contents = prompt_file.read()
    return contents

def generate_user_prompt(file_path):
    """Return the user prompt text read from `file_path` via load_prompt_from_file."""
    prompt_text = load_prompt_from_file(file_path)
    return prompt_text

def generate_system_prompt(file_path):
    """Return the system prompt text read from `file_path` via load_prompt_from_file."""
    prompt_text = load_prompt_from_file(file_path)
    return prompt_text


In [None]:
# Prompt files for the first (untranslated) run; the paths are relative, so they
# resolve against the notebook's working directory.
user_prompt_path = 'user_prompt.txt'
system_prompt_path = 'system_prompt.txt'

In [None]:
# Read both prompt files into strings.
user_prompt = generate_user_prompt(user_prompt_path)
system_prompt = generate_system_prompt(system_prompt_path)

In [None]:
# One API call (n_runs defaults to 1); `responses` is a list of reply strings.
responses = call_openai_api(user_prompt, system_prompt)
# Bare name as the last expression so the notebook displays the list.
responses

In [None]:
def process_gpt_responses(gpt_responses):
    """Parse the first GPT response into a list of recipe dicts.

    Only ``gpt_responses[0]`` is read; any further responses are ignored.
    The text is expected to be a JSON array of objects that each have a
    'name' and an 'ingredients' key.

    Args:
        gpt_responses: List of response strings returned by the API.

    Returns:
        A list of ``{'name': ..., 'ingredients': ...}`` dicts. Empty when
        `gpt_responses` is empty or the text cannot be decoded (the decode
        error is printed).

    Raises:
        KeyError: If a parsed recipe lacks 'name' or 'ingredients'.
    """
    if not gpt_responses:
        return []

    raw_text = gpt_responses[0]
    try:
        # Parse the text as-is first: replacing every apostrophe up front breaks
        # valid JSON whose strings contain one (e.g. "Chef's salad").
        recipes = json.loads(raw_text)
    except json.JSONDecodeError:
        # Fallback for Python-style output: swap single quotes for double quotes
        # and drop literal newlines/tabs, then try again.
        cleaned_data = raw_text.replace("'", '"').replace("\n", "").replace("\t", "")
        try:
            recipes = json.loads(cleaned_data)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
            recipes = []

    # Keep only the two fields the downstream pipeline uses.
    return [{'name': recipe['name'], 'ingredients': recipe['ingredients']} for recipe in recipes]

In [None]:
def preprocess_df(df):
    """Normalise the 'ingredients' column into one lowercase string per row.

    Each cell is iterated as a sequence of ingredient strings. Per ingredient:
    every whitespace-separated token is lemmatized, the listed preparation
    words and "(... oz.)" spans are removed, and every character outside
    a-z/A-Z becomes a space. The ingredients of a row are then joined with
    single spaces and lowercased.

    Note that the word removal runs before lowercasing and is case-sensitive,
    so e.g. "Chopped" is kept while "chopped" is removed.
    # NOTE(review): presumably this mirrors the preprocessing used when the
    # vectorizer/model were trained -- confirm before changing the order.

    Args:
        df: DataFrame with an 'ingredients' column. Modified in place.

    Returns:
        The same `df` object, with 'ingredients' replaced by the cleaned strings.
    """
    # Build the lemmatizer and the compiled patterns once per call rather than
    # once per token / per ingredient.
    lemmatizer = WordNetLemmatizer()
    removable = re.compile(r'\(.*oz.\)|crushed|crumbles|ground|minced|powder|chopped|sliced')
    non_letters = re.compile("[^a-zA-Z]")

    def process_string(x):
        cleaned_items = []
        for item in x:
            item = " ".join(lemmatizer.lemmatize(token) for token in item.split())  # Lemmatization
            item = removable.sub('', item)
            item = non_letters.sub(" ", item)  # keep only a-z and A-Z
            cleaned_items.append(item)
        # Collapse the list into one string for the vectorizer.
        return " ".join(cleaned_items).lower()

    #df = df.drop('id',axis=1)
    df['ingredients'] = df['ingredients'].apply(process_string)

    return df

In [None]:
def process_pipeline(gpt_responses, vectorizer, model):
    """Parse GPT responses, preprocess the ingredients and predict with `model`.

    Args:
        gpt_responses: List of response strings, passed to process_gpt_responses.
        vectorizer: Object with a ``transform`` method, applied to the cleaned
            'ingredients' column.
        model: Object with a ``predict`` method, applied to the transformed data.

    Returns:
        Whatever ``model.predict`` returns, or an empty NumPy array when no
        recipe could be parsed from the responses.
    """
    processed_responses = process_gpt_responses(gpt_responses)
    if not processed_responses:
        # An empty record list yields a DataFrame without an 'ingredients'
        # column, which used to surface as an opaque KeyError further down.
        print("No recipes could be parsed from the GPT responses; nothing to predict.")
        return np.array([])

    df = pd.DataFrame(processed_responses)
    test_df = preprocess_df(df)
    test_transformed = vectorizer.transform(test_df['ingredients'])
    return model.predict(test_transformed)

In [None]:
# Load the pickled vectorizer that process_pipeline uses to transform ingredient
# text (file name suggests a fitted TF-IDF vectorizer -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

In [None]:
# Predict for the recipes in `responses` using the loaded vectorizer and `clf`.
y_pred = process_pipeline(responses, vectorizer, clf)
# Display the predictions.
y_pred

### Chinese

In [None]:
# Prompts for the Chinese run, read from the *_chn prompt files.
user_prompt_chn = generate_user_prompt('user_prompt_chn.txt')
system_prompt_chn = generate_system_prompt('system_prompt_chn.txt')

In [None]:
# One API call with the Chinese prompts (n_runs defaults to 1).
responses_chn = call_openai_api(user_prompt_chn, system_prompt_chn)
# Display the raw responses.
responses_chn

In [None]:
def translate_to_eng(responses):
    """Translate each response to English and lowercase the result.

    One output string is produced per input, in order. Inputs longer than
    5000 characters are not sent to the translator; they, and inputs whose
    translation raises, are replaced by a fixed placeholder message (a
    diagnostic is printed in both cases).

    Args:
        responses: Iterable of text strings; the source language is auto-detected.

    Returns:
        List of lowercase translations or placeholder strings.
    """
    translator = GoogleTranslator(source='auto', target='en')

    def _translate_one(text):
        # Guard: over-long texts are skipped rather than sent to the translator.
        if len(text) > 5000:
            print("Text too long to translate:", text)
            return "Text too long and was not translated"
        try:
            return translator.translate(text).lower()
        except Exception as e:
            print(f"Failed to translate text due to: {e}")
            return "Translation failed"

    return [_translate_one(response) for response in responses]

In [None]:
# Translate the Chinese-run responses to English before they are parsed.
translated_responses = translate_to_eng(responses_chn)
# Display the translated text.
translated_responses

In [None]:
# Predict for the translated Chinese-run recipes.
y_pred_chn = process_pipeline(translated_responses, vectorizer, clf)
# Display the predictions.
y_pred_chn

### Brazilian (Portuguese)

In [None]:
# Prompts for the Portuguese run, read from the *_port prompt files.
user_prompt_port = generate_user_prompt('user_prompt_port.txt')
system_prompt_port = generate_system_prompt('system_prompt_port.txt')

In [None]:
# One API call with the Portuguese prompts (n_runs defaults to 1).
responses_port = call_openai_api(user_prompt_port, system_prompt_port)
# Display the raw responses.
responses_port

In [None]:
# Translate the Portuguese-run responses to English before they are parsed.
translated_responses_port = translate_to_eng(responses_port)
# Display the translated text.
translated_responses_port

In [None]:
# Predict for the translated Portuguese-run recipes.
y_pred_port = process_pipeline(translated_responses_port, vectorizer, clf)
# Display the predictions.
y_pred_port

### Batching

In [None]:
def process_gpt_responses(gpt_responses):
    """Parse every GPT response and pool the recipes into one list.

    Batched variant: once this cell runs it replaces the earlier
    single-response definition of the same name. Each response is expected to
    be a JSON array of objects with 'name' and 'ingredients' keys. A response
    that cannot be decoded is reported with a printed message and skipped.

    Args:
        gpt_responses: Iterable of response strings.

    Returns:
        A list of ``{'name': ..., 'ingredients': ...}`` dicts from all
        decodable responses, in input order.

    Raises:
        KeyError: If a parsed recipe lacks 'name' or 'ingredients'.
    """
    all_processed_data = []
    for response in gpt_responses:
        try:
            # Parse the text as-is first: replacing every apostrophe up front
            # breaks valid JSON whose strings contain one (e.g. "Chef's salad").
            recipes = json.loads(response)
        except json.JSONDecodeError:
            # Fallback for Python-style output: swap single quotes for double
            # quotes and drop literal newlines/tabs, then try again.
            cleaned_data = response.replace("'", '"').replace("\n", "").replace("\t", "")
            try:
                recipes = json.loads(cleaned_data)
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)
                continue

        # Keep only the two fields the downstream pipeline uses.
        processed_data = [{'name': recipe['name'], 'ingredients': recipe['ingredients']} for recipe in recipes]
        all_processed_data.extend(processed_data)

    return all_processed_data


In [None]:
def process_pipeline(gpt_responses, vectorizer, model):
    """Parse all GPT responses, preprocess the ingredients and predict with `model`.

    This repeats the earlier definition of the same name. Because
    process_gpt_responses is looked up when the function is called, either
    definition uses whichever parser was defined most recently.

    Args:
        gpt_responses: List of response strings, passed to process_gpt_responses.
        vectorizer: Object with a ``transform`` method, applied to the cleaned
            'ingredients' column.
        model: Object with a ``predict`` method, applied to the transformed data.

    Returns:
        Whatever ``model.predict`` returns, or an empty NumPy array when no
        recipe could be parsed from the responses.
    """
    # Gather the recipes of every response into a single list of records.
    processed_responses = process_gpt_responses(gpt_responses)
    if not processed_responses:
        # An empty record list yields a DataFrame without an 'ingredients'
        # column, which used to surface as an opaque KeyError further down.
        print("No recipes could be parsed from the GPT responses; nothing to predict.")
        return np.array([])

    df = pd.DataFrame(processed_responses)
    test_df = preprocess_df(df)
    test_transformed = vectorizer.transform(test_df['ingredients'])
    return model.predict(test_transformed)


In [None]:
def save_responses_to_file(responses, filename="responses.json"):
    """Write `responses` to `filename` as UTF-8 JSON.

    The output is indented by 4 spaces and non-ASCII characters are written
    verbatim rather than as \\uXXXX escapes. An existing file is overwritten.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=4)
    with open(filename, "w", encoding='utf-8') as out_file:
        # Stream the encoded chunks to the file, as json.dump does.
        for chunk in encoder.iterencode(responses):
            out_file.write(chunk)

In [None]:
def load_responses_from_file(filename="responses.json"):
    """Read the UTF-8 JSON file at `filename` and return the decoded object."""
    with open(filename, "r", encoding='utf-8') as in_file:
        raw_json = in_file.read()
        return json.loads(raw_json)

#### Chinese

In [None]:
# Two independent API calls with the Chinese prompts; one response string per run.
responses_chn_2runs = call_openai_api(user_prompt_chn, system_prompt_chn, n_runs=2)

In [None]:
# Persist the two-run responses so later cells can reload them without calling
# the API again. (The previous name, responses_chn_5runs, is never defined in
# this notebook and raised NameError on a fresh kernel.)
save_responses_to_file(responses_chn_2runs, "responses_chn_2runs.json")

In [None]:
# Reload the saved two-run responses from disk.
loaded_responses = load_responses_from_file("responses_chn_2runs.json")

In [None]:
# Translate the reloaded responses to English.
# NOTE: this rebinds `translated_responses` from the earlier single-run Chinese section.
translated_responses = translate_to_eng(loaded_responses)
# Display the translated text.
translated_responses

In [None]:
# Predict for the recipes pooled from both runs.
# NOTE: this rebinds `y_pred_chn` from the earlier single-run Chinese section.
y_pred_chn = process_pipeline(translated_responses, vectorizer, clf)
# Display the predictions.
y_pred_chn