In [None]:
# The star import stays first so that the explicit imports below keep precedence
# over any same-named objects it may bring into the namespace.
from utils.query_model import *

import gc
import sys
import time

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Load the listings table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/guland_hanoi_listings_arcgis3.csv')

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Column names of the listings frame (plus a newline shorthand) used by create_general_info.
t, p, a, d, lat, lon, pm2, nl = "Title", "Price", "Area", "Description", "Latitude", "Longitude", "Price per m2", "\n"

def create_general_info(df):
    """Build one multi-line Vietnamese text summary per listing.

    Each summary is the title, the description, and then one labelled line each
    for price, area, latitude, longitude and price per m2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Title", "Price", "Area", "Description",
        "Latitude", "Longitude" and "Price per m2".

    Returns
    -------
    pandas.Series
        One string per row, sharing ``df``'s index. Missing titles become an
        empty line and missing descriptions a placeholder, so the result never
        contains NaN. Missing numeric values are rendered as the text "nan".
    """
    # A missing title would otherwise turn the whole concatenation into NaN.
    title = df[t].fillna("").astype(str)
    # Listings without a description get the placeholder "Không có mô tả" ("no description").
    description = df[d].fillna("Không có mô tả").astype(str)

    # Price is divided by 1,000,000 and labelled "tỷ" (billion);
    # NOTE(review): the unit of the raw Price column is not visible here — confirm.
    price_info = "Giá nhà: " + (df[p] / 1000000).astype(str) + "tỷ" + nl
    area_info = "Diện tích: " + df[a].astype(str) + "m2" + nl
    lat_info = "Vĩ độ: " + df[lat].astype(str) + nl
    lon_info = "Kinh độ: " + df[lon].astype(str) + nl
    pm2_info = "Giá/m2: " + df[pm2].astype(str) + " triệu" + nl

    return title + nl + description + nl + price_info + area_info + lat_info + lon_info + pm2_info

In [None]:
# --- Model and generation settings ---
model = "Qwen/Qwen2.5-0.5B-Instruct"
device = "auto"          # forwarded as device_map when the model is loaded
max_new_tokens = 5       # NOTE: the active classification loop passes max_new_tokens=1 explicitly
temperature = 0.1
top_p = 0.3

# --- Batching and retry settings ---
batch_inference = True   # not read anywhere in the visible notebook code
batch_size = 20          # maximum number of prompts sent per model call
max_retries = 4          # an item is attempted at most this many times before it is given up
length_df = df.shape[0]  # row count of the loaded listings frame

In [None]:
# System prompt (Vietnamese). It casts the model as a Vietnamese real-estate
# classifier that must answer with a single digit: 0 for an apartment
# ("căn hộ chung cư") or 1 for a private house ("nhà riêng"), and lists
# typical keywords for each class.
sys_prompt = """Bạn là chuyên gia phân loại bất động sản Việt Nam.
Nhiệm vụ của bạn là phân loại mô tả bất động sản thành căn hộ chung cư (0) hoặc nhà riêng (1).
Bạn CHỈ được trả lời bằng một chữ số: 0 hoặc 1.

Căn hộ chung cư (0): Nằm trong tòa nhà nhiều tầng, thường có từ khóa như "chung cư", "căn hộ", "tầng", "thang máy".
Nhà riêng (1): Công trình độc lập có đất riêng, thường có từ khóa như "nhà riêng", "nhà mặt phố", "biệt thự", "ngõ", "mặt tiền".
"""

In [None]:
# Single-column frame holding the prompt-ready text summary of every listing.
df_new = pd.DataFrame({'general_info': create_general_info(df)})
df_new.info()

In [None]:
# Preview the first generated summaries.
df_new.head()

In [None]:
# Create the query wrapper and load the configured model; `device` is passed as device_map.
# ModelQueryInterface is not defined in this notebook — presumably it comes from the
# star import of utils.query_model (confirm).
print(f"Loading model: {model}")
interface = ModelQueryInterface()
interface.load_model(model_name=model, device_map=device)

In [None]:
# Run a garbage-collection pass before the long classification loop starts.
gc.collect()

# Per-row bookkeeping for the classification loop, indexed by row position in df_new.
n_items = len(df_new)
to_process = list(range(n_items))  # queue of row positions still waiting for a valid answer
results = [None] * n_items         # 0/1 label per row; stays None until classified
retry_counts = [0] * n_items       # number of invalid responses received per row

In [None]:

# Batched classification of every row in df_new with retry on invalid answers.
#
# Reads the queue state (to_process, results, retry_counts) created in the
# preceding cell, so re-running this cell continues with whatever is still queued.
# Writes the labels to df['property_type'] (0, 1, or missing when every attempt failed).

# Three stacked progress bars: rows finalized, size of the remaining queue,
# and the success rate of the most recent batch.
main_progress = tqdm(total=len(df_new), desc="Overall Progress", position=0)
retry_progress = tqdm(total=0, desc="Pending Retries", position=1, bar_format='{desc}: {n_fmt}/{total_fmt} [{bar}] {percentage:3.0f}%')
success_progress = tqdm(total=100, desc="Success Rate", position=2, bar_format='{desc}: {n:3.0f}% [{bar}] {postfix}')

# Attempt-level statistics: a row that is retried counts once per attempt.
total_processed = 0
total_success = 0
# Defined before the loop so the name exists even if the queue is already empty.
overall_success_rate = 0

# Keep going until the queue is empty (every row is labelled or out of retries).
while to_process:
    batch_size_current = min(batch_size, len(to_process))

    # Take the next batch from the front of the queue
    current_batch_indices = to_process[:batch_size_current]
    current_batch_texts = [df_new.iloc[i]['general_info'] for i in current_batch_indices]

    batch_prompt = []
    for text in current_batch_texts:
        prompt = f"""Hãy phân loại bất động sản này 0 (căn hộ chung cư) hoặc 1 (nhà riêng):

        {text}

        0 (căn hộ chung cư) hoặc 1 (nhà riêng)
        Phân loại (CHỈ trả lời bằng 0 hoặc 1):"""
        batch_prompt.append(prompt)

    # Assumes query_model_batch returns one string per prompt, in prompt order — TODO confirm.
    responses = interface.query_model_batch(
        prompts=batch_prompt,
        max_new_tokens=1,  # only one token is requested: the expected answer is a single digit
        temperature=temperature,
        top_p=top_p,
        system_prompt=sys_prompt
    )

    retry_indices = []
    successful = 0

    for idx, response in zip(current_batch_indices, responses):
        # First '0' or '1' character anywhere in the response, or None if there is none.
        label = next((ch for ch in response if ch in '01'), None)

        if label is not None:
            results[idx] = int(label)
            successful += 1
            continue

        retry_counts[idx] += 1
        if retry_counts[idx] < max_retries:
            # Still under the attempt limit: queue the row again
            retry_indices.append(idx)
        else:
            # Out of attempts: the row stays unclassified
            results[idx] = None

    # Drop the batch just handled, then append the rows that need another attempt
    to_process = to_process[batch_size_current:]
    to_process.extend(retry_indices)

    total_processed += batch_size_current
    total_success += successful

    # Rows finalized in this batch = labelled rows + rows that ran out of retries
    main_progress.update(batch_size_current - len(retry_indices))
    # The bar total tracks the whole remaining queue (first attempts and retries alike)
    retry_progress.total = len(to_process)
    retry_progress.refresh()

    # Success rates in whole percent (0-100)
    success_rate = int((successful / batch_size_current) * 100)
    overall_success_rate = int((total_success / total_processed) * 100)
    success_progress.n = success_rate
    success_progress.set_postfix_str(f"Batch: {success_rate}% | Overall: {overall_success_rate}%")
    success_progress.refresh()

# Close progress bars
main_progress.close()
retry_progress.close()
success_progress.close()

df['property_type'] = results

# Final stats: percentages are taken over rows, so they agree with the counts
# printed next to them (the attempt-level rate is reported separately).
valid_count = int(df['property_type'].notna().sum())
failed_count = int(df['property_type'].isna().sum())
row_count = len(df)
valid_pct = 100 * valid_count / row_count if row_count else 0.0
failed_pct = 100 * failed_count / row_count if row_count else 0.0

print("\nClassification complete.")
print(f"Valid classifications: {valid_count} ({valid_pct:.1f}%)")
print(f"Failed classifications: {failed_count} ({failed_pct:.1f}%)")
print(f"Attempt-level success rate (retries included): {overall_success_rate}%")

In [None]:
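# NOTE: every line in this cell is commented out, so nothing here executes.
# It appears to be an alternative version of the classification loop (English prompt,
# per-item tqdm.write logging) kept for reference only.
# # Create a progress bar for the main processing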
# main_progress = tqdm(total=len(df_new), desc="Processing properties")
# retry_progress = tqdm(total=0, desc="Retries", position=1)

# # Continue processing until all items are done or max retries reached
# while to_process:
#     batch_size_current = min(batch_size, len(to_process))
    
#     # Take next batch
#     current_batch_indices = to_process[:batch_size_current]
#     current_batch_texts = [df_new.iloc[i]['general_info'] for i in current_batch_indices]
    
#     batch_prompt = [] # Create prompts for batch
#     for text in current_batch_texts:
#         prompt = f"""Please classify this property:

#     {text}

#     Classification (ONLY respond with 0 or 1):"""
#         batch_prompt.append(prompt)
    
#     # Process batch
#     responses = interface.query_model_batch(
#         prompts=batch_prompt, 
#         max_new_tokens=max_new_tokens, 
#         temperature=temperature, 
#         top_p=top_p,
#         system_prompt=sys_prompt
#     )
    
#     # Process results and identify items needing retry
#     retry_indices = []
#     successful = 0
    
#     for i, (idx, response) in enumerate(zip(current_batch_indices, responses)):
#         # Extract only digits from response
#         digits = ''.join(char for char in response if char in '01')
#         clean_response = digits[:1] if digits else ""
        
#         # Check if valid (0 or 1)
#         if clean_response in ["0", "1"]:
#             results[idx] = int(clean_response)
#             successful += 1
#             # Use tqdm.write to avoid breaking progress bar display
#             tqdm.write(f"Item {idx}: Successfully classified as {clean_response}")
#         else:
#             retry_counts[idx] += 1
#             # Use tqdm.write to avoid breaking progress bar display
#             tqdm.write(f"Item {idx}: Invalid response '{response.strip()}', retry {retry_counts[idx]}/{max_retries}")
            
#             # Add to retry queue if under max retries
#             if retry_counts[idx] < max_retries:
#                 retry_indices.append(idx)
#                 retry_progress.total += 1  # Increase total retries to track
#             else:
#                 results[idx] = None
    
    
#     # Remove processed items from queue
#     to_process = to_process[batch_size:]
    
#     # Add retry items to end of processing queue
#     to_process.extend(retry_indices)
    
#     # Update progress bars
#     main_progress.update(batch_size_current - len(retry_indices))
#     retry_progress.update(0)  # Refresh display
#     retry_progress.set_postfix({"remaining": len(to_process)})
#     main_progress.set_postfix({"success_rate": f"{successful}/{batch_size_current}"})

#         # Print batch summary
#     tqdm.write(f"Batch complete. {len(to_process)} items remaining.\n")
#     time.sleep(0.1)  # Small pause to ensure outputs are visible


# # Close progress bars
# main_progress.close()
# retry_progress.close()
    
# df['property_type'] = results

# # Check results
# print(f"Classification complete.")
# print(f"Valid classifications: {df['property_type'].notna().sum()}")
# print(f"Failed classifications: {df['property_type'].isna().sum()}")

In [None]:
# Write the listings frame to CSV without the index column; the path is relative to the working directory.
df.to_csv('../data/guland_hanoi_listings_classified.csv', index=False)
