In [1]:
import os
import glob
import pandas as pd

# Folder that contains the per-source CSV exports.
folder_path = '/Users/yunho/Desktop/UP'  # <-- change this path to match your environment

# Combined files written at the end of this cell.
removed_output_path = './all_removed_rows.csv'
origin_updated_output_path = './all_origin_updated.csv'


def _collect_input_files(pattern, output_path):
    """Return the sorted CSV paths in ``folder_path`` that match ``pattern``.

    ``output_path`` is excluded from the result. The combined file names also
    match the ``*_removed_rows.csv`` / ``*_origin_updated.csv`` patterns, so when
    the notebook runs from inside ``folder_path`` a re-run would otherwise merge
    the previous combined output back into itself and duplicate every row.

    The paths are sorted so the row order of the combined file does not depend
    on the order in which ``glob.glob`` happens to list the directory.
    """
    output_real = os.path.realpath(output_path)
    candidates = glob.glob(os.path.join(folder_path, pattern))
    return sorted(path for path in candidates if os.path.realpath(path) != output_real)


def _concat_csv_files(paths, pattern):
    """Read every CSV in ``paths`` and stack them into one DataFrame.

    Raises:
        FileNotFoundError: if ``paths`` is empty, instead of the opaque
            "No objects to concatenate" error raised by ``pd.concat``.
    """
    if not paths:
        raise FileNotFoundError(
            f"No files matching '{pattern}' were found in '{folder_path}'"
        )
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# 1. Combine the files whose names end in *_removed_rows.csv
removed_files = _collect_input_files('*_removed_rows.csv', removed_output_path)
df_removed = _concat_csv_files(removed_files, '*_removed_rows.csv')
print(f"[removed] 파일 개수: {len(removed_files)}, 합친 행 수: {len(df_removed)}")

# 2. Combine the files whose names end in *_origin_updated.csv
origin_updated_files = _collect_input_files('*_origin_updated.csv', origin_updated_output_path)
df_origin_updated = _concat_csv_files(origin_updated_files, '*_origin_updated.csv')
print(f"[origin_updated] 파일 개수: {len(origin_updated_files)}, 합친 행 수: {len(df_origin_updated)}")

# (Optional) save the combined results to disk
df_removed.to_csv(removed_output_path, index=False)
df_origin_updated.to_csv(origin_updated_output_path, index=False)

[removed] 파일 개수: 13, 합친 행 수: 575
[origin_updated] 파일 개수: 14, 합친 행 수: 24323


In [3]:
import pandas as pd

# Load the two combined CSV files back from disk.
updated_df = pd.read_csv('./all_origin_updated.csv')
removed_df = pd.read_csv('./all_removed_rows.csv')

# Cell result: (row count of updated_df, row count of removed_df).
tuple(len(frame) for frame in (updated_df, removed_df))

(24323, 575)

In [5]:
# Scratch arithmetic. 24323 and 575 equal the row counts shown for the combined
# origin_updated and removed frames, and 14 equals the number of origin_updated
# files reported by the merge cell.
# NOTE(review): presumably a reconciliation against some external total
# (e.g. one extra line per source file) — confirm what 24912 is compared with.
24323+575+14

24912

In [7]:
import pandas as pd
import re

# Canonical term -> list of aliases that should be rewritten to it.
# Several canonical terms list themselves as an alias, which normalises the
# case variants generated for them later in this cell.
mapping = {
    "LG ThinQ": [
        "LG application",
        "LG mobile app",
        "LG apps",
        "LG app",
        "application",
        "app",
        "ThinkQ",
        "ThinKQ",
    ],
    "remote controller": [
        "remote",
        "remote control",
        "controller",
    ],
    "error": [
        "error code",
        "err code",
        "fault code",
        "error",
        "code error",
    ],
    # The phrase 'beep alarm' is an exception: the code below leaves it unmapped.
    "alarm": [
        "alarm",
        "beep",
    ],
    "feature": [
        "function",
        "feature",
    ],
    "dispenser": [
        "dispenser",
        "water dispenser",
    ],
    "ice dispenser": [
        "ice maker",
        "ice dispenser",
    ],
    "Objet Collection": [
        # Listed first: the longest, most specific spellings.
        "Objet Collection",
        "objet collection",
        "Objet C",
        "objet C",
        "Objet",
        "objet",
    ],
    "messenger": [
        "Kakaotalk",
        "Kakao talk",
        "messenger",
    ],
    "washer": [
        "Washing machine",
    ],
    "display": [
        "display",
        "screen",
    ],
}

def expand_keyword_variants(word):
    """Return the set of spelling variants generated for ``word``.

    Two base forms are used: ``word`` itself and ``word`` with every space
    removed. Each base form contributes itself plus its upper-case,
    lower-case and title-case versions.

    Args:
        word: The keyword to expand.

    Returns:
        A ``set`` of strings; duplicates between the forms collapse.
    """
    compact = word.replace(' ', '')
    recasings = (str.upper, str.lower, str.title)
    return {
        form
        for base in (word, compact)
        for form in (base, *(recase(base) for recase in recasings))
    }

# Flatten `mapping` into {spelling variant -> canonical term}.
# expand_keyword_variants returns a set of strings, whose iteration order can
# differ between interpreter sessions (string hash randomization). Visiting the
# variants in sorted order keeps the key order of this dict reproducible, and
# with it the order of anything derived by iterating over the dict.
# The phrase "beep alarm" is never registered as a key.
expanded_replace_dict = {
    variant: rep
    for rep, words in mapping.items()
    for word in words
    if word != "beep alarm"
    for variant in sorted(expand_keyword_variants(word))
}

def map_summary(text):
    """Rewrite every known alias in ``text`` to its canonical term.

    Aliases and their targets come from the module-level
    ``expanded_replace_dict``. Matching is case-sensitive and restricted to
    whole words (``\\b`` on both sides).

    All aliases are applied in ONE regex pass. Applying them one after another
    re-scans text that was just inserted, so a replacement containing a shorter
    alias gets rewritten again (e.g. "Objet Collection" turned into
    "Objet Collection Collection", and "remote control" ended up as
    "remote controller remote controller controller").

    Args:
        text: The summary string, or a null value (``None`` / ``NaN``).

    Returns:
        The rewritten string; null input is returned unchanged.
    """
    if pd.isnull(text):
        return text

    # Shield the phrase "beep alarm" (any casing) so neither word is remapped.
    # It is restored in lower case at the end.
    text = re.sub(r'\bbeep alarm\b', '__BEEP_ALARM__', text, flags=re.IGNORECASE)

    # Canonical terms map to themselves, so text that already uses a canonical
    # term is consumed as a whole instead of having its parts rewritten.
    # Explicit aliases take precedence over these identity entries.
    lookup = {canonical: canonical for canonical in expanded_replace_dict.values()}
    lookup.update(expanded_replace_dict)

    if lookup:
        # Longest keys first: at any position the most specific alias wins
        # (prevents partial matches).
        alternation = '|'.join(
            re.escape(key) for key in sorted(lookup, key=len, reverse=True)
        )
        pattern = r'\b(?:' + alternation + r')\b'
        # A function replacement is inserted literally (no backslash processing).
        text = re.sub(pattern, lambda match: lookup[match.group(0)], text)

    return text.replace('__BEEP_ALARM__', 'beep alarm')

def extract_matched_terms(text):
    """List every key of ``expanded_replace_dict`` that occurs in ``text``.

    The phrase "beep alarm" (any casing) is removed before searching, so it
    never counts as a hit for its individual words. Keys are matched
    case-sensitively as whole words and reported in the dict's iteration
    order; every key that matches is reported, including keys contained in a
    longer key that also matched.

    Args:
        text: The summary string, or a null value (``None`` / ``NaN``).

    Returns:
        A list of matching keys; an empty list for null input.
    """
    if pd.isnull(text):
        return []
    searchable = re.sub(r'\bbeep alarm\b', '', text, flags=re.IGNORECASE)
    return [
        term
        for term in expanded_replace_dict
        if re.search(r'\b' + re.escape(term) + r'\b', searchable)
    ]

# Usage example: load the combined file, then derive the mapped summary and the
# list of matched terms from the `generated_summary` column.
file_path = "./all_origin_updated.csv"
df = pd.read_csv(file_path).assign(
    mapped_summary=lambda frame: frame['generated_summary'].apply(map_summary),
    matched_terms=lambda frame: frame['generated_summary'].apply(extract_matched_terms),
)

# Keep only the rows in which at least one term matched.
mapped_rows = df.loc[df['matched_terms'].apply(len).gt(0)]
print(f"{len(mapped_rows)} rows with mapped terms found.")

11653 rows with mapped terms found.


In [9]:
# Save the DataFrame to CSV (without the index column).
# The 'utf-8-sig' codec writes a UTF-8 byte-order mark at the start of the file.
# NOTE(review): `matched_terms` holds Python lists, so it is presumably written
# as the text form of each list and will read back as strings — confirm that is
# what downstream consumers expect.
df.to_csv('all_origin_updated_mapped.csv', index=False, encoding='utf-8-sig')
len(df)  # cell result: number of rows in df

24323

In [4]:
# 8. Inspect a sample of the results: the original summary, the mapped summary
# and the terms that matched, for the rows kept in `mapped_rows`.
mapped_rows[['generated_summary', 'mapped_summary', 'matched_terms']]

Unnamed: 0,generated_summary,mapped_summary,matched_terms
12,The user suggests a feature to schedule lighti...,The user suggests a feature to schedule lighti...,[feature]
14,The user suggests that the app should send an ...,The user suggests that the LG ThinQ should sen...,"[app, operation]"
16,The user suggests having a feature that allows...,The user suggests having a feature that allows...,[feature]
21,The user suggests adding a feature that allows...,The user suggests adding a feature that allows...,[feature]
23,The user suggests displaying the current tempe...,The user suggests displaying the current tempe...,[app]
...,...,...,...
3193,The user suggests upgrading the refrigerator t...,The user suggests upgrading the refrigerator t...,[display]
3194,The user suggests changing the refrigerator's ...,The user suggests changing the refrigerator's ...,[function]
3196,The user suggests creating a feature that allo...,The user suggests creating a feature that allo...,[feature]
3203,"The user suggests adding the ""fermentation und...","The user suggests adding the ""fermentation und...",[feature]


In [7]:
# Tally the overall frequency (number of occurrences) of the matched terms.
from collections import Counter

# Each `matched_terms` cell is a list, so the cells are flattened into a single
# list before counting; cells that are not lists are skipped.
all_terms = []
all_terms.extend(
    term
    for cell in df['matched_terms']
    if isinstance(cell, list)
    for term in cell
)
term_counts = Counter(all_terms)

# Arrange the counts as a DataFrame, most frequent term first.
term_count_df = pd.DataFrame(
    term_counts.items(),
    columns=['matched_term', 'count'],
).sort_values('count', ascending=False)
term_count_df


Unnamed: 0,matched_term,count
0,feature,623
1,app,396
6,alarm,346
7,option,220
3,function,198
11,dispenser,156
10,ice maker,104
13,water dispenser,99
4,display,77
12,ice dispenser,33


In [None]:
# Tally the overall frequency (number of occurrences) of the matched terms.
# NOTE(review): this cell has no execution count and repeats the code of the
# preceding frequency cell line for line — consider removing one of the two.
from collections import Counter

# `matched_terms` holds lists, so they must be flattened before counting.
all_terms = [term for terms in df['matched_terms'] if isinstance(terms, list) for term in terms]
term_counts = Counter(all_terms)

# Arrange the result as a DataFrame, sorted by count in descending order.
term_count_df = pd.DataFrame(term_counts.items(), columns=['matched_term', 'count']).sort_values('count', ascending=False)
term_count_df

In [5]:
def find_rows_by_matched_terms(df, terms):
    """Select the rows whose ``matched_terms`` list contains any of ``terms``.

    Args:
        df: DataFrame with ``generated_summary``, ``mapped_summary`` and
            ``matched_terms`` columns.
        terms: Iterable of terms to look for.

    Returns:
        The matching rows, restricted to the three columns named above.
        Rows whose ``matched_terms`` cell is not a list never match.
    """
    def _contains_any(cell):
        # Membership is only tested on real lists.
        return isinstance(cell, list) and any(term in cell for term in terms)

    wanted_columns = ['generated_summary', 'mapped_summary', 'matched_terms']
    selected = df['matched_terms'].apply(_contains_any)
    return df.loc[selected, wanted_columns]

# Spot-check the rows behind each group of terms: every call selects the rows
# whose `matched_terms` contains at least one listed term and prints the first 10.

# NOTE(review): `featuer_result` looks like a typo for `feature_result`; the
# name is left as is because a later cell reads `featuer_result`.
featuer_result = find_rows_by_matched_terms(df, ['feature'])
print(featuer_result.head(10))

# NOTE(review): 'LG ThinQ' is a replacement target in `mapping`, not one of its
# aliases, so it is not expected to appear in `matched_terms` — confirm.
app_result = find_rows_by_matched_terms(df, ['app', 'LG ThinQ', 'LG application', 'LG apps', 'LG mobile app'])
print(app_result.head(10))

# Messenger-related terms.
kakao_result = find_rows_by_matched_terms(df, ['Kakaotalk', 'Kakao talk', 'messenger'])
print(kakao_result.head(10))

# Objet Collection spellings.
objet_result = find_rows_by_matched_terms(df, ['Objet', 'Objet C', 'Objet Collection', 'objet', 'objet collection'])
print(objet_result.head(10))

# Display / screen terms.
screen_result = find_rows_by_matched_terms(df, ['display', 'screen'])
print(screen_result.head(10))

# Alarm / beep terms.
alarm_result = find_rows_by_matched_terms(df, ['alarm', 'beep'])
print(alarm_result.head(10))

                                    generated_summary  \
12  The user suggests a feature to schedule lighti...   
16  The user suggests having a feature that allows...   
21  The user suggests adding a feature that allows...   
27  The user suggests that all wine cellar product...   
36  The user suggests that the wine cellar should ...   
37  The user suggests adding an on/off feature to ...   
38  The user suggests adding a feature that allows...   
40  The user suggests improving the process of rec...   
41  The user suggests having a feature that allows...   
63  The user suggests including a feature that sen...   

                                       mapped_summary            matched_terms  
12  The user suggests a feature to schedule lighti...                [feature]  
16  The user suggests having a feature that allows...                [feature]  
21  The user suggests adding a feature that allows...                [feature]  
27  The user suggests that all wine cellar produ

In [8]:
# Show the first five rows selected for the term 'feature'
# (`featuer_result` is the name assigned in the spot-check cell).
featuer_result.head(5)

Unnamed: 0,generated_summary,mapped_summary,matched_terms
12,The user suggests a feature to schedule lighti...,The user suggests a feature to schedule lighti...,[feature]
16,The user suggests having a feature that allows...,The user suggests having a feature that allows...,[feature]
21,The user suggests adding a feature that allows...,The user suggests adding a feature that allows...,[feature]
27,The user suggests that all wine cellar product...,The user suggests that all wine cellar product...,[feature]
36,The user suggests that the wine cellar should ...,The user suggests that the wine cellar should ...,[feature]
