# Lib

In [33]:
import pandas as pd
import csv
import os
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
import openai
import re

# Function

In [34]:
def prepare_file(file_path, file_header):
    try:
        if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
            with open(file_path, 'w', newline='') as file:
                file.write(file_header)
        print(f"Crawler - {file_path} - File path checking completed.")
    except Exception as e:
        print(f"Crawler - File path error..!! - {file_header} - {e}")

In [35]:
write_lock = multiprocessing.Lock()
def writeMainInfo(result_entry: dict, file_name: str) -> None:
    try:
        with write_lock:
            with open(file_name, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(f=file, quoting=csv.QUOTE_MINIMAL, fieldnames=result_entry.keys())
                writer.writerow(result_entry)
    except Exception as e:
        print(f'Crawler - FAILED at "writeMainInfo()" - {e}.')

In [36]:
def collectRecordsTobeChecked(success_path: str,df_source: str, header: str) -> pd.DataFrame:
    try:
        prepare_file(success_path, header)
        # df_source = pd.read_csv(source_path,on_bad_lines="skip",engine='python', quotechar='"')
        df_success = pd.read_csv(success_path,on_bad_lines="skip",engine='python', quotechar='"') if os.path.exists(success_path) else pd.DataFrame(columns=['code'])

        # processing_df = df_source[len(df_success)+1::]
        processing_df = df_source[~df_source['code'].isin(set(df_success['code']))]

        print(f"Total batch records: {len(df_source)} records, "
                        f"Skipped: {len(df_source)-len(processing_df)} records, "
                        f"Ordered to be processed: {len(processing_df)}")
        return processing_df

    except Exception as e:
        print(f"Crawler - FAILED at 'collectRecordsTobeChecked()' - {e}.")
        return pd.DataFrame()

In [37]:
def promptGPT(prompt, api_key, base_url):
    client = openai.OpenAI(api_key= api_key, base_url = base_url)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        max_tokens=350,
        temperature=0.7,  
        top_p=0.9,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        n=1
    )

    return response


# Content

In [38]:
def create_prompt_photo(review, name, address, subcate_lst, criteria_lst): 
    promt = f"""
    You are tasked with creating content and classifying data for the specified BRAND using the provided customer reviews. Follow these instructions to complete THREE tasks:
    

    1.  First task: Write an engaging 'Photo' page introduction for the BRAND based on the reviews.

        You are an expert in SEO content writing with a focus on creating engaging and informative content for websites. 
        First task is to write a concise and compelling content about 'photo' page description for the given BRAND, using customer reviews as your primary source of information.

        Instructions:

        Content Photo: Introduce the BRAND photo and views, the landscape around the BRAND, focusing on the positive aspects. Do not include any negative information. Do not write about anything else that the brand dont have.Focus only on positive side of photos and Views, Landscape around, Dont write about anything else.
        SEO Compliance: Follow SEO best practices to ensure the content is optimized for search engines. Follow the best practices for writing page content.
        Length: Write between 180 to 230 words.
        Language Use: Avoid using the words "we", "our", "us", "they", "their", and "them" to maintain a neutral tone. Write as third-person narrative, who is an expert in SEO content writing.
        Must follow the instructions and write the content as per the guidelines provided. Dont return the intro or paragraph title like "**Photo Page Description for El Colomaniano**". Write only the content.

        The BRAND name and customer reviews will be provided below:
        STRICTLY FOLLOW Length of content : *** The CONTENT MUST BE ABOUT 200 WORDS. ABOUT 220 TOKEN. *** .
    ------    
    2. Second task: Classify the BRAND into the most suitable category and summarize its advantages and disadvantages based on the reviews.

        Brand Classification:
        Objective: Assign/Label 3 suitable categories approriate to the BRAND.
        Brand Categorization: Select the most fitting category from the provided list of categories.
        Format:
        Brand Sub-Category: [Appropriate category 1]###[Appropriate category 2]###[Appropriate category 3]
    ------
    3. Third task:
        You are an expert critic responsible for evaluating a brand based on customer reviews. Your goal is to assign scores for each criterion listed below.

        Score Range: Each score must be between 6.0 and 9.5, inclusive.
        Precision: Scores must be expressed to one decimal place (e.g., 6.1, 7.8, 9.4) and should not be limited to increments of 0.5.
        Evaluation: Base your scores strictly on the analysis of the reviews, reflecting the sentiments and insights gathered from them.
        Format: Present your findings in the following structured format:
        Brand Criteria Score: [Criteria 1]### [Score of Criteria 1]### [Criteria 2]### [Score of Criteria 2]### [Criteria 3]### [Score of Criteria 3],...
        Please ensure that your scores accurately reflect the reviews and demonstrate a strict adherence to the specified guidelines
    ------
    Inputs:
    BRAND: {name}
    BRAND_ADDRESS: {address}
    Reviews: {review}
    List of Categories: {subcate_lst}
    List of Criteria: {criteria_lst}

    Ideal Output Format:
    Photo Page Content: [Photo Page Content Output]
    Brand Sub-Category: [Appropriate category 1]###[Appropriate category 2]###[Appropriate category 3]
    Brand Criteria Score: [Criteria 1]###[Score 1]###[Criteria 2]###[Score 2]###[Criteria 3]###[Score 3],...
    Ensure the content and classifications accurately reflect the BRAND as described in the reviews, delivering a detailed overview and precise categorization."""
    return promt

In [39]:
def extract_response(response):
    content, subcategory = None, None

    text = response['choices'][0].message.content

    content = text.split('Brand Sub-Category:')[0].strip()
    content = content.replace('Photo Page Content:', '').strip()
    content = content.replace('\n', ' ')

    subcategory = text.split('Brand Sub-Category:')[1].strip()
    subcategory = subcategory.split('Brand Criteria Score:')[0].strip()
    subcategory = [x.strip() for x in subcategory.split('###')]

    criteria = text.split('Brand Criteria Score:')[1].strip()

    criteria_data = criteria.split("###")

    # criteria_dict = {criteria_data[i]: float(criteria_data[i + 1]) for i in range(0, len(criteria_data), 2)}
    criteria_dict = {}
    for i in range(0, len(criteria_data), 2):
        try:
            criteria_dict[criteria_data[i]] = float(criteria_data[i + 1])
        except ValueError:
            continue  # Skip this loop iteration if conversion to float fails

    return content, subcategory, criteria_dict

In [40]:
criteriafile = r"D:\Wheree_Kiet\Input\danhmucdanhgia.csv"
criteria = pd.read_csv(criteriafile)
criteria_lst = criteria.groupby('cat')['sub_cat'].apply(list).to_dict()

In [41]:
catefile = r"D:\Wheree_Kiet\Input\category.csv"
cate_df = pd.read_csv(catefile)
cate_lst = cate_df['category'].tolist()

In [42]:
subfile = r'D:\Wheree_Kiet\Input\catesub.csv'
sub_df = pd.read_csv(subfile)
sub_lst = sub_df.groupby('category')['sub_category'].apply(list).to_dict()

In [43]:
api_key = 'YOUR_API_KEY'
base_url="https://open.keyai.shop/v1"

In [52]:
source_path = r"D:\DATA\2024\Sep\Combine\10_10_2024\grouped_reviews.csv"
base_dir = r"D:\DATA\2024\Sep\Combine\10_10_2024"
success_path = os.path.join(base_dir, 'photo_content.csv')
header = 'code,content,type,subcate1,subcate2,subcate3,sub_score,input_token,output_token\n'


main_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main.csv")
main_content = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main_content.csv")
map_name = dict(zip(main_df['code'], main_df['orignalname']))
map_address = dict(zip(main_df['code'], main_df['fulladdress']))
map_category = dict(zip(main_content['code'], main_content['category']))
df = pd.read_csv(source_path)


df = df[df['code'].isin(main_df['code'])]
df =  collectRecordsTobeChecked(success_path, df, header)
df['name'] = df['code'].map(map_name)
df['address'] = df['code'].map(map_address)
df['category'] = df['code'].map(map_category)
max_word  = 500

  main_df = pd.read_csv(r"D:\DATA\2024\Sep\Combine\10_10_2024\main.csv")


Crawler - D:\DATA\2024\Sep\Combine\10_10_2024\photo_content.csv - File path checking completed.
Total batch records: 221103 records, Skipped: 221095 records, Ordered to be processed: 8


In [46]:
def process_row(row):

    review = row['content'] if 'content' in row else (row['reviews'] if 'reviews' in row else row['review'])
    
    review = " ".join(review.split(' ')[:min(max_word, len(review.split(' ')))])
    
    name = row['name']
    address = row['address']

    prompt = create_prompt_photo(review, name, address, sub_lst[row['category']], criteria_lst[row['category']])
    

    response = promptGPT(prompt, api_key, base_url)
    response = dict(response)
    
    if response['choices'][0].message.content:
        content, subcategory, criteria_dict = extract_response(response)

        if len(content.split(' ')) > 100:
            sub_1 = subcategory[0] if subcategory[0] in sub_lst[row['category']] else None
            if sub_1:
                sub_1 = sub_1.replace('*', '').strip()
                sub_2 = subcategory[1] if subcategory[1] in sub_lst[row['category']] else None
                if sub_2:
                    sub_2 = sub_2.replace('*', '').strip()
                    sub_3 = subcategory[2] if subcategory[2] in sub_lst[row['category']] else None
                    if sub_3:
                        sub_3 = sub_3.replace('*', '').strip()
                    else:
                        sub_3 = None
                else:
                    sub_3 = None
            else:
                return

            # Write main info
            result_entry = {
                'code': row['code'],
                'content': content,
                'type': 'PHOTO.S20.P1.C1',
                'subcate1': sub_1,
                'subcate2': sub_2,
                'subcate3': sub_3,
                'sub_score': criteria_dict,
                'input_token': response['usage'].prompt_tokens,
                'output_token': response['usage'].completion_tokens
            }
            writeMainInfo(result_entry, success_path)

In [31]:
with ProcessPoolExecutor(max_workers=5) as executor:  
    futures = [executor.submit(process_row, row) for _, row in df.iterrows()]
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing rows"):
        try:
            future.result()
        except Exception as e:
            # print(f"Error processing row {e}")
            pass

Processing rows: 100%|██████████| 1101/1101 [03:13<00:00,  5.70it/s]
