In [1]:
import pandas as pd

# Loading

In [2]:
def load_txt2df(file_name: str) -> pd.DataFrame:
    """Read a '#'-delimited review file into a single-column DataFrame.

    Each line is stripped of surrounding whitespace. A line that starts with
    ``#`` acts as a separator between reviews; the non-empty lines found
    between two separators are joined with one space to form a review.
    Blank lines are ignored.

    Args:
        file_name: Path of the UTF-8 text file to read.

    Returns:
        A DataFrame with one ``review`` column and one row per review. If any
        exception occurs, the error is printed and an empty DataFrame with the
        same column is returned instead of raising.
    """
    collected = []
    pending = []

    def flush():
        # Close the review currently being accumulated, if there is one.
        if pending:
            collected.append(" ".join(pending).strip())
            pending.clear()

    try:
        with open(file_name, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if text.startswith("#"):
                    flush()
                elif text:
                    pending.append(text)
            # The last review has no separator after it.
            flush()

        frame = pd.DataFrame(collected, columns=["review"])
        print(f"Đã load thành công {len(frame)} reviews từ file '{file_name}'")
        return frame

    except Exception as e:
        print(f"Lỗi khi đọc file '{file_name}': {e}")
        return pd.DataFrame(columns=["review"])

In [None]:
# data = load_txt2df("./annotate_Tien.txt")

Đã load thành công 750 reviews từ file './Dung.txt'


In [None]:
# data = pd.concat([data1, data2], ignore_index=True)

# Saving

In [4]:
def save_txt(reviews_list, file_name):
    """Write plain reviews to a text file, one numbered block per review.

    Each review is written as a ``#<n>`` header line (numbering starts at 1),
    followed by the stripped review text and a blank line.

    NOTE: a later cell in this notebook defines another ``save_txt`` that
    takes a DataFrame; once that cell has run, this version is no longer
    reachable under this name.

    Args:
        reviews_list: Sized sequence of review strings.
        file_name: Destination path; the file is overwritten.

    Returns:
        None. A confirmation is printed on success; an I/O error is printed
        rather than raised.
    """
    try:
        with open(file_name, 'w', encoding='utf-8') as out:
            position = 0
            for text in reviews_list:
                position += 1
                out.write("#%d\n" % position)
                out.write(text.strip() + "\n\n")
        print(f"Đã lưu {len(reviews_list)} reviews vào '{file_name}'")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"Lỗi khi ghi file {file_name}: {e}")

In [None]:
# save_txt(data['review'].tolist(), "reviews_Tien.txt")

Đã lưu 800 reviews vào 'reviews_Tien.txt'


# Format txt: annotated .txt ↔ DataFrame

In [2]:
import re
import numpy as np

In [3]:
# Aspect labels have the form "<entity>#<attribute>". The first four entities
# share the same seven attributes; the remaining entities are listed explicitly.
aspects = [
    f"{entity}#{attribute}"
    for entity in ('hotel', 'rooms', 'room_amenities', 'facilities')
    for attribute in ('general', 'prices', 'design&features', 'cleanliness',
                      'comfort', 'quality', 'miscellaneous')
]
aspects += ['service#general', 'location#general']
aspects += [
    f"food&drinks#{attribute}"
    for attribute in ('prices', 'quality', 'style&options', 'miscellaneous')
]

# Sentiment label vocabulary.
sentiments = ['dne', 'positive', 'negative', 'neutral']

# Alias, not a copy: all_keys and aspects refer to the same list object.
all_keys = aspects

In [4]:
# Convert reviews from the annotated .txt format into a DataFrame
def text2df(filepath, all_keys=all_keys):
    """Parse an annotated review file into a wide DataFrame.

    The file is split into blocks at lines that begin with ``#<digits>``.
    Inside a block with more than one line, the final line is taken as the
    annotation line only when it contains at least one
    ``{aspect, sentiment}`` pair; the lines before it form the review text.
    If the final line holds no such pair (for example a multi-line review
    that was never annotated), the whole block is kept as the review so no
    text is dropped. A single-line block is always treated as review text.

    Args:
        filepath: Path of the UTF-8 annotated text file.
        all_keys: Aspect labels that become columns. Annotations whose
            aspect is not in this collection are ignored.

    Returns:
        A DataFrame with a ``review`` column followed by one column per
        aspect key. Cells hold the lower-cased sentiment string, or NaN when
        the review has no annotation for that aspect. The columns are present
        even when the file contains no reviews.

    Raises:
        OSError / UnicodeDecodeError: if the file cannot be opened or decoded.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    blocks = re.split(r'(?m)^#\d+\s*', text.strip())
    blocks = [b.strip() for b in blocks if b.strip()]

    ann_pattern = re.compile(r'\{([^,{}]+),\s*([^{}]+)\}')

    # De-duplicate while preserving order. "review" is reserved for the text
    # column, so an annotation such as {review, ...} can never overwrite it.
    aspect_keys = [k for k in dict.fromkeys(all_keys) if k != "review"]
    known_aspects = set(aspect_keys)

    rows = []
    for b in blocks:
        head, sep, last_line = b.rpartition('\n')
        # Only look for annotations when the block really has a separate last line.
        anns = ann_pattern.findall(last_line) if sep else []
        review = head.strip() if anns else b

        row = {"review": review}
        row.update(dict.fromkeys(aspect_keys, np.nan))
        for aspect, sentiment in anns:
            aspect = aspect.strip().lower()
            if aspect in known_aspects:
                row[aspect] = sentiment.strip().lower()
        rows.append(row)

    # Passing columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(rows, columns=["review", *aspect_keys])

In [15]:
# Parse each annotated text file into its own DataFrame
# (a "review" column plus one column per aspect key).
data0 = text2df("annotated_reviews_final.txt")
data1 = text2df("./Dung.txt")
data2 = text2df("./Thuc.txt")
data3 = text2df("./Tien.txt")

In [16]:
# Stack the four frames row-wise; ignore_index=True renumbers the rows
# 0..N-1 so index values from the separate files do not repeat.
data = pd.concat([data0, data1, data2, data3], ignore_index=True)

In [5]:
def save_txt(df, output_filepath, aspect_cols=aspects):
    """Write a review DataFrame to disk in the annotated text format.

    Each row becomes one block: a ``#<n>`` header (numbered from 1 in row
    order), the stripped review text and, when the row has any non-empty
    sentiment, a final line of space-separated ``{aspect, sentiment}`` pairs.
    Blocks are separated by one blank line.

    NOTE: this definition reuses the name of the list-based ``save_txt``
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        df: DataFrame with a ``review`` column and, optionally, aspect columns.
        output_filepath: Destination path; the file is overwritten.
        aspect_cols: Aspect column names to export. Names that are not
            columns of ``df`` are skipped, so a frame holding only ``review``
            is written without annotation lines instead of raising KeyError.

    Returns:
        A status string: a success message on completion, or an error message
        if writing the file failed (the exception is not re-raised).
    """
    # Resolve the exportable columns once, so a one-shot iterable is not
    # exhausted after the first row and missing columns are ignored.
    present_cols = [key for key in aspect_cols if key in df.columns]

    output_text = []

    for i, (_, row) in enumerate(df.iterrows(), start=1):
        review_content = str(row['review']).strip()

        annotations = []
        for key in present_cols:
            sentiment = row[key]

            # pd.notna already rejects None/NaN; also skip blank strings.
            if pd.notna(sentiment) and str(sentiment).strip() != '':
                aspect_formatted = key.strip().lower()
                sentiment_formatted = str(sentiment).strip().lower()
                annotations.append(f"{{{aspect_formatted}, {sentiment_formatted}}}")

        # Header and review first, then the annotation line if there is one.
        block = f"#{i}\n" + review_content
        if annotations:
            block += '\n' + ' '.join(annotations)

        output_text.append(block)

    # Blocks are separated by a single blank line.
    final_content = '\n\n'.join(output_text)

    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(final_content)
        return f"✅ Đã lưu thành công {len(df)} reviews vào file: {output_filepath}"
    except Exception as e:
        return f"❌ Lỗi khi lưu file: {e}"

In [19]:
# Write the combined frame back out in the annotated text format;
# save_txt returns a status string, which is what this cell displays.
save_txt(data, "data_full.txt")

'✅ Đã lưu thành công 2393 reviews vào file: **data_full.txt**'