# Libraries

In [1]:
import pandas as pd
import csv
import os
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
import openai
import re

# Functions

In [2]:
def prepare_file(file_path, file_header):
    """Make sure ``file_path`` exists and starts with ``file_header``.

    The header is written only when the file is missing or has a size of zero
    bytes; an existing non-empty file is left untouched. Any exception is caught
    and reported on stdout rather than raised, so the function always returns
    ``None``.

    Args:
        file_path: Path of the file to initialise.
        file_header: Text written verbatim as the first content of the file
            (the function does not append a newline).
    """
    try:
        needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
        if needs_header:
            # Explicit UTF-8 so the header uses the same encoding as the rows
            # that writeMainInfo() appends later.
            with open(file_path, 'w', newline='', encoding='utf-8') as file:
                file.write(file_header)
        print(f"Crawler - {file_path} - File path checking completed.")
    except Exception as e:
        # Report the path that failed; the header text is of no use when diagnosing a path error.
        print(f"Crawler - File path error..!! - {file_path} - {e}")

In [3]:
# Held for the duration of each append in writeMainInfo().
# NOTE(review): whether worker processes actually share this lock depends on the
# multiprocessing start method in use - confirm for the target platform.
write_lock = multiprocessing.Lock()


def writeMainInfo(result_entry: dict, file_name: str) -> None:
    """Append ``result_entry`` to ``file_name`` as one CSV row.

    Values are written in the dict's own key order; no header row is emitted.
    The file is opened in append mode as UTF-8 while ``write_lock`` is held.
    Any exception is caught and printed, so the function never raises.

    Args:
        result_entry: Mapping of column name to value for the row.
        file_name: Path of the CSV file to append to.
    """
    try:
        with write_lock, open(file_name, mode='a', newline='', encoding='utf-8') as out_file:
            row_writer = csv.DictWriter(
                f=out_file,
                fieldnames=result_entry.keys(),
                quoting=csv.QUOTE_MINIMAL,
            )
            row_writer.writerow(result_entry)
    except Exception as err:
        print(f'Crawler - FAILED at "writeMainInfo()" - {err}.')

In [4]:
def collectRecordsTobeChecked(success_path: str, df_source: pd.DataFrame, header: str) -> pd.DataFrame:
    """Return the rows of ``df_source`` whose ``code`` is not yet in the success file.

    The success file is first initialised through ``prepare_file`` (header
    written if the file is missing or empty), then read back, and every source
    row whose ``code`` already appears there is dropped. A summary of total,
    skipped and remaining rows is printed.

    Args:
        success_path: Path of the CSV holding already-processed records; it
            must expose a ``code`` column.
        df_source: Frame of candidate records with a ``code`` column.
        header: Header line used to initialise ``success_path``.

    Returns:
        An independent copy of the remaining rows. If anything fails, the error
        is printed and an empty ``pd.DataFrame()`` (no columns) is returned.
    """
    try:
        prepare_file(success_path, header)

        # prepare_file() swallows its own errors, so the file may still be missing here.
        if os.path.exists(success_path):
            df_success = pd.read_csv(success_path, on_bad_lines="skip", engine='python', quotechar='"')
        else:
            df_success = pd.DataFrame(columns=['code'])

        done_codes = set(df_success['code'])
        # .copy() detaches the result from df_source so that callers can add
        # columns to it without triggering pandas' SettingWithCopyWarning.
        processing_df = df_source[~df_source['code'].isin(done_codes)].copy()

        print(f"Total batch records: {len(df_source)} records, "
              f"Skipped: {len(df_source)-len(processing_df)} records, "
              f"Ordered to be processed: {len(processing_df)}")
        return processing_df

    except Exception as e:
        print(f"Crawler - FAILED at 'collectRecordsTobeChecked()' - {e}.")
        return pd.DataFrame()

In [5]:
def promptGPT(prompt, api_key, base_url, model="gpt-4o-mini", max_tokens=500,
              temperature=0.7, top_p=0.9):
    """Send ``prompt`` as a single user message to a chat-completions endpoint.

    A new ``openai.OpenAI`` client is created on every call. The model and
    sampling settings default to the values this notebook has always used, so
    existing three-argument calls behave exactly as before.

    Args:
        prompt: Text sent as the content of the only (user) message.
        api_key: Key passed to the client.
        base_url: Base URL passed to the client.
        model: Model name requested from the endpoint.
        max_tokens: Upper bound on generated tokens.
            NOTE(review): the prompt built by create_prompt_main() asks for
            roughly 330 tokens of overview plus two 60-80 token summaries, which
            is close to the default of 500 - confirm replies are not truncated.
        temperature: Sampling temperature (0.7: more creative, so more detail is generated).
        top_p: Nucleus-sampling cutoff.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    client = openai.OpenAI(api_key=api_key, base_url=base_url)

    return client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        messages=[{"role": "user", "content": prompt}],
        n=1,
    )

# Content

In [6]:
def create_prompt_main(review, name, address, cate_lst):
    """Build the two-task prompt (overview text + classification) for one brand.

    The four arguments are interpolated into the "Inputs" section of a fixed
    instruction template; ``cate_lst`` is rendered with its default string
    representation.

    Args:
        review: Customer review text for the brand.
        name: Brand name.
        address: Brand address.
        cate_lst: Categories the model may choose from.

    Returns:
        The complete prompt string.
    """
    return f"""
    You are tasked with creating content and classifying data for the specified BRAND using the provided customer reviews. Follow these instructions to complete both tasks:
    First task: Write an engaging 'Overview' page introduction for the BRAND based on the reviews.
    Overview Page Content:

    Objective: Write a captivating 'Overview' page introduction that showcases the BRAND’s strengths, features, and overall appeal.
    Content Focus: Highlight the BRAND's visual appeal, key features, and unique attributes based on customer reviews. Exclude any references to guests or their experiences.
    SEO Optimization: Naturally integrate relevant keywords to enhance search engine visibility while ensuring the content is engaging and easy to read.
    Tone and Style: Use a friendly, professional, and descriptive tone. Write in a neutral, third-person perspective, avoiding pronouns like "we," "our," "us," "they," "their," or "them."
    Special Note: If the BRAND name is not in English, use its original form.
    Format: Provide only the text of the introduction without a title.

    Content paragraph must be about 240 words. WRITE IT LONGER AND MORE DETAILED
    STRICTLY FOLLOW Length of content : *** The CONTENT MUST BE ABOUT 240 WORDS. ABOUT 330 TOKEN. *** .
    
    Second task: Classify the BRAND into the most suitable category and summarize its advantages and disadvantages based on the reviews.
    Brand Classification and Summary:

    Objective: Assign a suitable category to the BRAND and summarize its advantages and disadvantages based on customer reviews.
    Brand Categorization: Select the most fitting category from the provided list of categories.
    Summary Requirements: Write two separate summaries:
    Advantages: Provide a two-sentence summary, with each sentence up to 30 words, detailing the BRAND’s positive aspects.
    Disadvantages: Provide a two-sentence summary, with each sentence up to 30 words, detailing the BRAND’s negative aspects.
    SEO Optimization: Ensure summaries adhere to SEO best practices for clarity and relevance.
    Tone and Style: Maintain a neutral, informative, and objective tone in a third-person narrative, avoiding personal pronouns.
    Format:
    Brand Category: [Appropriate category]
    Brand Advantage: [First advantage sentence]####[Second advantage sentence]
    Brand Disadvantage: [First disadvantage sentence]####[Second disadvantage sentence]
    Length: Keep each summary approximately 60-80 tokens.
    Inputs:

    BRAND: {name}
    BRAND_ADDRESS: {address}
    Reviews: {review}
    List of Categories: {cate_lst}
    Ideal Output Format:

    Overview Page Content: [Overview Page Content Output]
    Brand Category: [Appropriate category]
    Brand Advantage: [First advantage sentence]####[Second advantage sentence]
    Brand Disadvantage: [First disadvantage sentence]####[Second disadvantage sentence]
    Ensure the content and classifications accurately reflect the BRAND as described in the reviews, delivering a detailed overview and precise categorization."""

In [8]:
def _clean_fragment(fragment):
    """Trim surrounding whitespace and Markdown emphasis asterisks from ``fragment``."""
    return fragment.strip().strip('*').strip()


def _text_after(text, label):
    """Return the text between the first ``label`` and its next occurrence (or the end), or None if absent."""
    pieces = text.split(label)
    return pieces[1] if len(pieces) > 1 else None


def _split_points(section):
    """Split a '####'-separated section into cleaned, non-empty sentences."""
    if section is None:
        return []
    cleaned = (_clean_fragment(part) for part in section.split('####'))
    return [part for part in cleaned if part]


def extract_response(response):
    """Parse the model reply into overview text, category and pros/cons lists.

    The reply is expected to contain the labels ``Overview Page Content:``,
    ``Brand Category:``, ``Brand Advantage:`` and ``Brand Disadvantage:``.
    Markdown emphasis around a label (e.g. ``**Brand Category:** Cafe``) is
    tolerated: stray asterisks are stripped from every extracted value. A
    missing label yields an empty result for that part instead of an
    ``IndexError``.

    Args:
        response: Mapping whose ``['choices'][0].message.content`` holds the reply text.

    Returns:
        Tuple ``(content, category, advantage, disadvantage)``:
        ``content`` is the overview as a single line (newlines become spaces),
        ``category`` is the text on the ``Brand Category:`` line or ``None`` if
        the label is absent, ``advantage`` / ``disadvantage`` are lists of the
        ``####``-separated sentences (empty when the label is absent).
    """
    text = response['choices'][0].message.content or ''

    # Overview: everything before the category label, minus its own label.
    content = text.split('Brand Category:')[0].replace('Overview Page Content:', '')
    content = _clean_fragment(content).replace('\n', ' ')

    category = None
    category_match = re.search(r'Brand Category:\s*(.*)\s*', text)
    if category_match:
        # Guard against a reply that puts the advantage label on the same line.
        category = _clean_fragment(category_match.group(1).split('Brand Advantage:')[0])

    advantage_section = _text_after(text, 'Brand Advantage:')
    if advantage_section is not None:
        advantage_section = advantage_section.split('Brand Disadvantage:')[0]
    advantage = _split_points(advantage_section)

    disadvantage = _split_points(_text_after(text, 'Brand Disadvantage:'))

    return content, category, advantage, disadvantage

In [9]:
# Category vocabulary: create_prompt_main() lists these labels in the prompt and
# process_row() only keeps a reply whose category is one of them.
catefile = r"D:\Wheree_Kiet\Input\category.csv"
cate_df = pd.read_csv(filepath_or_buffer=catefile)
cate_lst = cate_df.loc[:, 'category'].tolist()

In [10]:
# Credentials and endpoint handed to openai.OpenAI by promptGPT().
# The key is read from the GPT_API_KEY environment variable so that a real key
# never has to be saved inside the notebook; the original placeholder remains the
# fallback when the variable is unset or empty.
api_key = os.environ.get('GPT_API_KEY') or 'YOUR_API_KEY'
base_url = "https://open.keyai.shop/v1"

In [25]:
# Input/output locations for this batch; every file lives under base_dir, so the
# directory is spelled out once and the individual paths are derived from it.
base_dir = r"D:\DATA\2024\Sep\Combine\10_10_2024"
source_path = os.path.join(base_dir, 'grouped_reviews.csv')
success_path = os.path.join(base_dir, 'main_content.csv')
header = 'code,content,type,category,input_token,output_token\n'

pros_cons = os.path.join(base_dir, 'pros_cons.csv')
header_pros_cons = 'code,advantages,disadvantages\n'
prepare_file(pros_cons, header_pros_cons)

# Brand name/address lookups keyed by code ('orignalname' is the column's
# spelling as read from main.csv).
main_df = pd.read_csv(os.path.join(base_dir, 'main.csv'))
map_name = dict(zip(main_df['code'], main_df['orignalname']))
map_address = dict(zip(main_df['code'], main_df['fulladdress']))

# Keep only the source rows whose code is not already present in success_path.
df = collectRecordsTobeChecked(success_path, pd.read_csv(source_path), header)

# assign() returns a new frame, so adding columns to the filtered result cannot
# trigger pandas' SettingWithCopyWarning.
df = df.assign(
    name=df['code'].map(map_name),
    address=df['code'].map(map_address),
)

# Only referenced by the commented-out review truncation in process_row().
max_word  = 500

Crawler - D:\DATA\2024\Sep\Combine\10_10_2024\pros_cons.csv - File path checking completed.


  main_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main.csv")


Crawler - D:\DATA\2024\Sep\Combine\10_10_2024\main_content.csv - File path checking completed.
Total batch records: 221103 records, Skipped: 221103 records, Ordered to be processed: 0


In [13]:
def process_row(row):
    """Generate, validate and persist the content for one brand record.

    Builds the prompt from the row's review text, name and address, sends it
    through ``promptGPT`` and parses the reply with ``extract_response``. The
    result is written only when the overview has more than 100
    space-separated words, the category is one of ``cate_lst`` and exactly two
    advantages and two disadvantages were returned. In that case one row goes
    to ``success_path`` and one row per advantage/disadvantage pair goes to
    ``pros_cons``. Otherwise the record is skipped silently.

    Args:
        row: Record exposing ``code``, ``name``, ``address`` and the review
            text under ``content``, ``reviews`` or ``review`` (first one present wins).

    Returns:
        None.
    """
    if 'content' in row:
        review = row['content']
    elif 'reviews' in row:
        review = row['reviews']
    else:
        review = row['review']

    prompt = create_prompt_main(review, row['name'], row['address'], cate_lst)

    # dict() exposes the response's fields by key ('choices', 'usage'), which is
    # how extract_response() and the token bookkeeping below read them.
    response = dict(promptGPT(prompt, api_key, base_url))

    if not response['choices'][0].message.content:
        return

    content, category, advantage, disadvantage = extract_response(response)

    is_valid = (
        len(content.split(' ')) > 100
        and category in cate_lst
        and len(advantage) == len(disadvantage) == 2
    )
    if not is_valid:
        return

    # Write main info
    result_entry_main = {
        'code': row['code'],
        'content': content,
        'type': 'HOME.S1.P1.C1',
        'category': category,
        'input_token': response['usage'].prompt_tokens,
        'output_token': response['usage'].completion_tokens
    }
    writeMainInfo(result_entry_main, success_path)

    for pro, con in zip(advantage, disadvantage):
        # Keys follow the pros_cons header 'code,advantages,disadvantages'. The
        # previous code used 'advantages' twice, so the disadvantage overwrote
        # the advantage and only two columns were written.
        result_entry_pros_cons = {
            'code': row['code'],
            'advantages': pro,
            'disadvantages': con
        }
        writeMainInfo(result_entry_pros_cons, pros_cons)

In [21]:
# Fan the rows out to worker processes. Failures are collected and summarised
# instead of being discarded, so a run that processed nothing is visible.
# NOTE(review): with the "spawn" start method, workers may be unable to import
# functions defined in notebook cells - if every row fails here, check that first.
failed_rows = []
with ProcessPoolExecutor(max_workers=30) as executor:
    futures = []
    code_by_future = {}
    for _, row in df.iterrows():
        future = executor.submit(process_row, row)
        futures.append(future)
        code_by_future[future] = row.get('code')

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing rows"):
        try:
            future.result()
        except Exception as e:
            failed_rows.append((code_by_future[future], repr(e)))

if failed_rows:
    print(f"Crawler - {len(failed_rows)} of {len(futures)} rows FAILED in process_row(); first errors:")
    # Only a handful are shown so a systematic failure does not flood the output.
    for failed_code, error_text in failed_rows[:5]:
        print(f"  code={failed_code}: {error_text}")

Processing rows: 100%|██████████| 1/1 [00:04<00:00,  4.62s/it]
