In [None]:
import re
import pandas as pd
from collections import Counter
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

# Load the KoBART model and its tokenizer from the same pretrained checkpoint.
KOBART_CHECKPOINT = 'gogamza/kobart-base-v2'
model = BartForConditionalGeneration.from_pretrained(KOBART_CHECKPOINT)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    KOBART_CHECKPOINT,
    bos_token='</s>',
    eos_token='</s>',
    pad_token='<pad>',
)

# Collapse repeated characters inside words and drop duplicate words.
# Compiled once at import time instead of once per word.
_REPEATED_CHAR_RE = re.compile(r"(.)\1+")


def preprocess_text(text):
    """Normalize one piece of text.

    Every run of the same character inside a word is collapsed to a single
    character, then words that already appeared earlier in the text are
    dropped (the first occurrence is kept, order is preserved).

    Args:
        text: The raw text. Missing values (``None``/NaN) are accepted and
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned words joined by single spaces, or ``""`` for a missing
        value.
    """
    if pd.isnull(text):  # missing value (NaN / None)
        return ""

    collapsed_words = (
        _REPEATED_CHAR_RE.sub(r"\1", word) for word in str(text).split()
    )
    # dict.fromkeys keeps first-seen order and de-duplicates in linear time,
    # replacing the quadratic "word not in list" scan.
    return ' '.join(dict.fromkeys(collapsed_words))

# Review preprocessing
def preprocess_reviews(reviews):
    """Clean every review and return the distinct cleaned reviews.

    Each review goes through ``preprocess_text``, then every character that
    is not a word character, whitespace, or Hangul is removed, and runs of
    whitespace are collapsed to a single space.

    Args:
        reviews: An iterable of raw reviews.

    Returns:
        A list of the distinct cleaned reviews in first-seen order.
    """
    processed_reviews = []
    for review in reviews:
        review = preprocess_text(review)  # apply the text-level cleanup
        review = re.sub(r'[^\w\sㄱ-ㅎㅏ-ㅣ가-힣]', '', str(review))  # strip special characters
        review = re.sub(r'\s+', ' ', review.strip())  # normalize whitespace
        processed_reviews.append(review)

    # De-duplicate while keeping input order. A set would return the reviews
    # in hash order, which for strings changes between interpreter runs and
    # would make the downstream grouping and summaries non-reproducible.
    return list(dict.fromkeys(processed_reviews))

# Group reviews by token budget
def group_reviews_by_tokens(reviews, tokenizer, max_tokens=512):
    """Pack consecutive reviews into space-joined groups under a token budget.

    Reviews are added to the current group while the running sum of their
    individual token counts does not exceed ``max_tokens``; the review that
    would exceed it starts a new group. A single review that is larger than
    the budget ends up in a group of its own.

    Args:
        reviews: An iterable of review strings.
        tokenizer: An object whose ``encode(text, truncation=False)`` returns
            a sized sequence of tokens.
        max_tokens: The per-group budget for the summed token counts.

    Returns:
        A list of strings, one per group, in input order.
    """
    batches = []
    pending = []
    pending_tokens = 0

    for review in reviews:
        n_tokens = len(tokenizer.encode(review, truncation=False))

        # Still fits: keep filling the current group.
        if pending_tokens + n_tokens <= max_tokens:
            pending.append(review)
            pending_tokens += n_tokens
            continue

        # Budget exceeded: close the current group and start a new one.
        if pending:
            batches.append(" ".join(pending))
        pending = [review]
        pending_tokens = n_tokens

    # Flush whatever is left over.
    if pending:
        batches.append(" ".join(pending))

    return batches

# Remove duplicate sentences and repeated words
def clean_text(text):
    """Remove duplicated sentences and immediately repeated words.

    The text is split into sentences on ``.``, ``!`` and ``?``. A sentence
    that occurs more than once is kept only at its first position, so its
    content is not lost. The remaining sentences are re-joined with ". " and
    terminated with a period, after which any word identical to the word
    directly before it is dropped.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text, or ``""`` when the input contains no sentence
        (previously a bare "." was returned in that case).
    """
    stripped = (part.strip() for part in re.split(r'[.!?]', text))
    # Order-preserving de-duplication that keeps the first copy of each
    # sentence rather than discarding every copy of a repeated one.
    unique_sentences = list(dict.fromkeys(s for s in stripped if s))
    if not unique_sentences:
        return ""

    text = ". ".join(unique_sentences) + "."

    # Drop a word when it merely repeats the word right before it.
    filtered_words = []
    for word in text.split():
        if not filtered_words or word != filtered_words[-1]:
            filtered_words.append(word)

    # split()/join already leaves single spaces and no surrounding whitespace.
    return " ".join(filtered_words)

# Summarization
def summarize_groups(groups, model, tokenizer, max_input_length=512, max_output_length=200):
    """Produce one summary per non-blank group.

    Each group is passed through ``preprocess_text``, encoded (truncated to
    ``max_input_length``), summarized with 6-beam search limited to
    ``max_output_length``, decoded without special tokens, and finally passed
    through ``clean_text``.

    Blank groups are skipped. If anything raises while a group is being
    summarized, the error is printed and that group yields no summary.

    Returns:
        The list of summaries for the groups that were summarized.
    """
    results = []
    non_blank_groups = (g for g in groups if g.strip())

    for group in non_blank_groups:
        try:
            model_input = preprocess_text(group)
            encoded = tokenizer.encode(
                model_input,
                return_tensors="pt",
                max_length=max_input_length,
                truncation=True,
            )
            generated = model.generate(
                encoded,
                max_length=max_output_length,
                num_beams=6,
                early_stopping=True,
            )
            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
            results.append(clean_text(decoded))
        except Exception as e:
            print(f"Error summarizing group: {e}")

    return results

# Repeatedly regroup and summarize until one summary remains
def recursive_group_and_summarize(groups, model, tokenizer, max_tokens=512, max_input_length=512, max_output_length=200):
    """Reduce a list of summaries to a single summary.

    While more than one summary remains, the summaries are joined, cleaned
    with ``clean_text``, split on ". ", regrouped with
    ``group_reviews_by_tokens`` and summarized again. If the remaining summary
    is longer than ``max_output_length`` tokens it is summarized once more.

    Args:
        groups: The summaries to merge.
        model, tokenizer: Passed through to ``summarize_groups``.
        max_tokens: Token budget used when regrouping.
        max_input_length, max_output_length: Passed to ``summarize_groups``.

    Returns:
        The final summary, or ``""`` when there is nothing to summarize.
    """
    while len(groups) > 1:
        combined_text = " ".join(groups)
        combined_text = clean_text(combined_text)
        new_groups = group_reviews_by_tokens(combined_text.split(". "), tokenizer, max_tokens=max_tokens)
        summaries = summarize_groups(new_groups, model, tokenizer, max_input_length, max_output_length)

        if len(summaries) >= len(groups):
            # This round did not reduce the number of summaries, so repeating
            # it could loop forever. Summarize the whole text as one
            # (truncated) group instead, which yields at most one summary.
            summaries = summarize_groups([combined_text], model, tokenizer, max_input_length, max_output_length)

        groups = summaries

    final_summary = groups[0] if groups else ""
    if len(tokenizer.encode(final_summary, truncation=False)) > max_output_length:
        shortened = summarize_groups([final_summary], model, tokenizer, max_input_length, max_output_length)
        # summarize_groups returns [] when summarization fails; keep the long
        # summary in that case instead of raising IndexError.
        if shortened:
            final_summary = shortened[0]

    return final_summary

# Read the Excel workbook and run the pipeline
file_path = "압구정찐최종데이터.xlsx"
data = pd.read_excel(file_path)

# Summarize the reviews of every store
store_names = data['가게명'].unique()
results = []

for store_name in store_names:
    # Reviews belonging to this store only.
    store_reviews = data.loc[data['가게명'] == store_name, '리뷰']

    processed_reviews = preprocess_reviews(store_reviews)
    grouped_reviews = group_reviews_by_tokens(processed_reviews, tokenizer, max_tokens=512)
    group_summaries = summarize_groups(grouped_reviews, model, tokenizer)

    # Merge the per-group summaries into one, then clean it a final time.
    final_summary = clean_text(
        recursive_group_and_summarize(group_summaries, model, tokenizer, max_tokens=512)
    )
    results.append({'가게명': store_name, '리뷰 요약': final_summary})

# Collect the results in a DataFrame and write them to an Excel file
results_df = pd.DataFrame(results)
output_file_path = "1모든_가게_리뷰_요약.xlsx"
results_df.to_excel(output_file_path, index=False)

print(f"Processed summaries saved to: {output_file_path}")

In [None]:
import re
import pandas as pd

# Second cleanup pass: repeated wording was not fully removed the first time.
# Matches a unit of one or two characters that is immediately repeated. The
# unit is matched lazily so the shortest repeating unit wins: "ㅋㅋㅋㅋㅋ"
# collapses to "ㅋ", where a greedy unit would leave "ㅋㅋㅋ".
_REPEATED_UNIT_RE = re.compile(r"(.{1,2}?)\1+")


def preprocess_text(text):
    """Collapse repeated 1-2 character units in words and drop duplicate words.

    Steps:
        1. Split the text on whitespace.
        2. Inside each word, replace any immediately repeated unit of one or
           two characters with a single copy of that unit.
        3. Keep only the first occurrence of each resulting word.
        4. Join the words back together with single spaces.

    Args:
        text: The text to clean. Missing values (``None``/NaN, e.g. empty
            spreadsheet cells) are accepted and any other non-string value is
            converted with ``str``.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    if pd.isnull(text):  # empty cells are read back as NaN, which has no .split
        return ""

    collapsed_words = (
        _REPEATED_UNIT_RE.sub(r"\1", word) for word in str(text).split()
    )
    # Order-preserving, linear-time de-duplication of the words.
    return ' '.join(dict.fromkeys(collapsed_words))

# Read the Excel workbook that holds the store summaries
file_path = '압구정_최종_요약.xlsx'
data = pd.read_excel(file_path)

# Run the cleanup over the summary column and store the result in a new column
cleaned_summaries = data['리뷰 요약'].apply(preprocess_text)
data['전처리된 리뷰 요약'] = cleaned_summaries

# Write the result to a new Excel file and report where it went
output_path = '압구정_전처리된_가게_리뷰.xlsx'
data.to_excel(output_path, index=False)
print(f"전처리가 완료되었습니다. 결과는 {output_path}에 저장되었습니다.")