# Lib

In [1]:
import pandas as pd
import csv
import os
import re
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
import openai

# Function

In [2]:
def prepare_file(file_path, file_header):
    """Make sure the output file exists and starts with a header row.

    When ``file_path`` is missing or zero bytes long it is (re)created and
    ``file_header`` is written to it verbatim. An existing non-empty file is
    left untouched.

    Args:
        file_path: Path of the CSV file to prepare.
        file_header: Header text to write, including its trailing newline.

    Returns:
        None. Progress and failures are reported on stdout; errors are not
        raised to the caller (best-effort behaviour).
    """
    try:
        if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
            # UTF-8 keeps the header consistent with writeMainInfo(), which
            # appends rows to the same file using encoding='utf-8'.
            with open(file_path, 'w', newline='', encoding='utf-8') as file:
                file.write(file_header)
        print(f"Crawler - {file_path} - File path checking completed.")
    except Exception as e:
        # Report the offending path (the header text is not useful here).
        print(f"Crawler - File path error..!! - {file_path} - {e}")

In [3]:
# Guards appends to the results file so rows are written one at a time.
write_lock = multiprocessing.Lock()
def writeMainInfo(result_entry: dict, file_name: str) -> None:
    """Append a single record to ``file_name`` as one CSV row.

    The row's column order follows the key order of ``result_entry``; no
    header is written here. The file is opened in append mode as UTF-8 while
    ``write_lock`` is held. Any failure is printed and swallowed.

    Args:
        result_entry: Mapping of column name to value for the row.
        file_name: Path of the CSV file to append to.
    """
    try:
        with write_lock, open(file_name, mode='a', newline='', encoding='utf-8') as out_file:
            row_writer = csv.DictWriter(
                out_file,
                fieldnames=result_entry.keys(),
                quoting=csv.QUOTE_MINIMAL,
            )
            row_writer.writerow(result_entry)
    except Exception as e:
        print(f'Crawler - FAILED at "writeMainInfo()" - {e}.')

In [4]:
def collectRecordsTobeChecked(success_path: str, df_source: pd.DataFrame, header: str) -> pd.DataFrame:
    """Return the rows of ``df_source`` that still need to be processed.

    The results file at ``success_path`` is created (with ``header``) if it is
    missing or empty, then read back. Every ``code`` already present there with
    type ``'MENU.S34.P1.C1'`` is treated as done and removed from the batch.

    Args:
        success_path: Path of the CSV file holding already-generated results.
        df_source: Candidate records; must contain a ``code`` column.
        header: Header line used when the results file has to be created.

    Returns:
        An independent copy of the not-yet-processed rows of ``df_source``.
        If anything fails, the error is printed and an empty ``DataFrame``
        (with no columns) is returned.
    """
    try:
        prepare_file(success_path, header)
        # df_source = pd.read_csv(source_path,on_bad_lines="skip",engine='python', quotechar='"')
        if os.path.exists(success_path):
            df_success = pd.read_csv(success_path, on_bad_lines="skip", engine='python', quotechar='"')
        else:
            # The fallback needs a 'type' column too, otherwise the filter
            # below raises KeyError.
            df_success = pd.DataFrame(columns=['code', 'type'])

        # Only results of this content type count as already done.
        done_codes = set(df_success.loc[df_success['type'] == 'MENU.S34.P1.C1', 'code'])

        # .copy() hands the caller a standalone frame, so adding columns to
        # it later does not trigger SettingWithCopyWarning.
        processing_df = df_source[~df_source['code'].isin(done_codes)].copy()

        print(f"Total batch records: {len(df_source)} records, "
                        f"Skipped: {len(df_source)-len(processing_df)} records, "
                        f"Ordered to be processed: {len(processing_df)}")

        return processing_df

    except Exception as e:
        print(f"Crawler - FAILED at 'collectRecordsTobeChecked()' - {e}.")
        return pd.DataFrame()

In [5]:
def promptGPT(prompt, api_key, base_url, model="gpt-4o-mini", max_tokens=300):
    """Send a single-turn user prompt to a chat-completions endpoint.

    A new ``openai.OpenAI`` client is created on every call.

    Args:
        prompt: Text sent as the only ``user`` message.
        api_key: API key passed to the client.
        base_url: Base URL of the OpenAI-compatible endpoint.
        model: Model name to request. Defaults to ``"gpt-4o-mini"``.
        max_tokens: Upper bound on generated tokens. Defaults to ``300``.

    Returns:
        The response object returned by ``client.chat.completions.create``
        (one completion is requested, ``n=1``).

    Raises:
        Whatever the ``openai`` client raises; nothing is caught here.
    """
    client = openai.OpenAI(api_key=api_key, base_url=base_url)
    return client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
        n=1,
    )


# Content

In [6]:
def prompt_menu(review, name, address):
    """Build the LLM prompt for a brand's 'menu photo' page description.

    Args:
        review: Customer review text, interpolated after ``Reviews:``.
        name: Brand name, interpolated after ``BRAND:``.
        address: Brand address, interpolated after ``BRAND_ADDRESS:``.

    Returns:
        The fully formatted prompt string.
    """
    # The wording below is sent to the model verbatim; edit with care.
    return f"""
    You are an expert in SEO content writing with a focus on creating engaging and informative content for websites. 
    Your task is to write a concise and compelling content about 'menu photo' page description for the given BRAND, using customer reviews as your primary source of information.

    Instructions:

    Content Overview: Introduce about 180 words to introdruce people about the menu and food, drink of the brand. Do not include any negative information. Do not write about anything else that the brand dont have. 
    Avoiding Negative side of reviews and focus only on the menu, Food, drink. Dont write about anything else
    SEO Compliance: Follow SEO best practices to ensure the content is optimized for search engines. Follow the best practices for writing page content.
    Length: Write between 180 to 200 words.
    Language Use: Avoid using the words "we", "our", "us", "they", "their", and "them" to maintain a neutral tone. Write as third-person narrative, who is an expert in SEO content writing.
    Must follow the instructions and write the content as per the guidelines provided. Dont return the intro like "**Menu Page Description for El Colomaniano**".

    The BRAND name and customer reviews will be provided below:

    BRAND: {name}
    BRAND_ADDRESS: {address}
    Reviews: {review}
    Generate content that captures the essence of the BRAND as described in the reviews, emphasizing key highlights and unique aspects. If BRAND has another language format follwoing another language (english name)"""

In [7]:
def process_row(index_row):
    """Generate menu content for one record and append it to the results CSV.

    Reads the module-level settings ``max_word``, ``api_key``, ``base_url``
    and ``success_path``.

    Args:
        index_row: An ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``. The row must provide ``code``, ``name``,
            ``address`` and one of ``content`` / ``reviews`` / ``review``.

    Returns:
        ``True`` when a row was handed to ``writeMainInfo``; ``False`` when
        the record has no usable review text or the model returned no content.

    Raises:
        Errors from the API call are not caught here and propagate to the
        caller.
    """
    _, row = index_row

    # Use the first review column that exists on this row.
    review = row['content'] if 'content' in row else (row['reviews'] if 'reviews' in row else row['review'])

    # Missing values come through as NaN (a float); there is nothing to
    # prompt with in that case.
    if not isinstance(review, str) or not review.strip():
        return False

    # Keep at most `max_word` space-separated words of the review.
    review = " ".join(review.split(' ')[:max_word])

    prompt = prompt_menu(review, row['name'], row['address'])
    content_type = 'MENU.S34.P1.C1'
    response = promptGPT(prompt, api_key, base_url)

    choices = response.choices
    content = choices[0].message.content if choices else None
    if content is None:
        return False

    # Flatten to a single line for the CSV. Line breaks become one space so
    # the words on either side are not glued together.
    content = re.sub(r'\s*\n\s*', ' ', content).strip()

    usage = response.usage  # token accounting; tolerated if absent
    result = {
        'code': row['code'],
        'content': content,
        'type': content_type,
        'input_token': getattr(usage, 'prompt_tokens', None),
        'output_token': getattr(usage, 'completion_tokens', None),
    }
    writeMainInfo(result, success_path)
    return True

In [8]:
# Take the key from the OPENAI_API_KEY environment variable when it is set, so
# the real secret never has to be saved in the notebook. The placeholder is
# kept as the fallback.
api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY')
# Base URL of the OpenAI-compatible endpoint the client is pointed at.
base_url="https://open.keyai.shop/v1"

In [14]:
# --- Run configuration -------------------------------------------------------
source_path = r"D:\DATA\2024\Sep\Combine\10_10_2024\grouped_reviews.csv"
success_path = r"D:\DATA\2024\Sep\Combine\10_10_2024\menu_content.csv"
header = 'code,content,type,input_token,output_token\n'
# Maximum number of space-separated review words sent per prompt
# (read by process_row).
max_word = 500

# --- Lookup tables: code -> brand name / address -----------------------------
# NOTE(review): the saved output shows pandas emitting a warning for this
# read; if it is a mixed-dtype warning, consider an explicit `dtype=` — confirm.
main_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main.csv")
map_name = dict(zip(main_df['code'], main_df['orignalname']))
map_address = dict(zip(main_df['code'], main_df['fulladdress']))

# --- Codes that have at least one MENU image ---------------------------------
img_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\img.csv")
# Reassign instead of mutating a filtered slice in place.
img_df = img_df[img_df['type'] == 'MENU'].drop_duplicates(subset=['code'], keep='first')

# --- Batch to process: one review row per code, MENU codes only, not yet done -
df = pd.read_csv(source_path).drop_duplicates(subset=['code'], keep='first')
df = df[df['code'].isin(img_df['code'])]
df = collectRecordsTobeChecked(success_path, df, header)

# collectRecordsTobeChecked returns a column-less empty frame on failure;
# in that case there is nothing to enrich (and nothing to process).
if 'code' in df.columns:
    # assign() builds a new frame, so no column is set on a slice.
    df = df.assign(
        name=df['code'].map(map_name),
        address=df['code'].map(map_address),
    )

  main_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main.csv")


Crawler - D:\DATA\2024\Sep\Combine\10_10_2024\menu_content.csv - File path checking completed.
Total batch records: 34647 records, Skipped: 34647 records, Ordered to be processed: 0


In [13]:
# Submit one task per pending record and report every failure instead of
# silently discarding it.
# NOTE(review): process_row reads module-level settings (max_word, api_key,
# base_url, success_path); verify the worker process actually sees them on
# your platform's multiprocessing start method.
with ProcessPoolExecutor(max_workers=1) as executor:
    # Map each future to its record code so failures can be attributed.
    futures = {
        executor.submit(process_row, index_row): index_row[1].get('code')
        for index_row in df.iterrows()
    }

    failed = 0
    # Use tqdm with as_completed for progress tracking
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Reviews"):
        code = futures[future]
        try:
            # process_row returns False when it could not produce content.
            if future.result() is False:
                failed += 1
                tqdm.write(f"Crawler - No content produced for code {code}.")
        except Exception as e:
            failed += 1
            # tqdm.write keeps the progress bar intact while logging.
            tqdm.write(f"Crawler - FAILED at 'process_row()' for code {code} - {type(e).__name__}: {e}.")

print(f"Crawler - Finished: {len(futures) - failed} succeeded, {failed} failed.")

Processing Reviews: 100%|██████████| 9/9 [00:25<00:00,  2.85s/it]
