In [None]:
#讀取資料集：如果資料集過大，可以先分割；小資料集則可以一次性讀取

import pandas as pd
import os

class DataLoader:
    """Load CSV files into named pandas DataFrames and split large CSVs.

    Loaded frames are kept in ``self.dataframes``, keyed by the name that
    was passed to :meth:`load_csv`. All methods report progress and errors
    on stdout instead of raising.
    """

    def __init__(self):
        # Maps a caller-chosen name to the DataFrame loaded under that name.
        self.dataframes = {}

    def load_csv(self, name, filepath):
        """Read ``filepath`` with ``pd.read_csv`` and store it under ``name``.

        Any exception raised while loading is caught and printed; in that
        case nothing is stored for ``name``.
        """
        try:
            df = pd.read_csv(filepath)
            self.dataframes[name] = df
            print(f"{name} 資料已載入，共 {len(df)} 筆資料。")
        except Exception as e:
            print(f"載入 {filepath} 時發生錯誤：{e}")

    def preview(self, name, n=2):
        """Print the first ``n`` and last ``n`` rows of the frame ``name``.

        Prints a hint to call ``load_csv()`` first when ``name`` has not
        been loaded.
        """
        if name not in self.dataframes:
            print(f"{name} 尚未載入，請先使用 load_csv()。")
            return

        df = self.dataframes[name]
        print(f"=== {name} 頭 {n} 筆資料 ===")
        print(df.head(n))
        print(f"=== {name} 後 {n} 筆資料 ===")
        print(df.tail(n))

    def get_dataframe(self, name):
        """Return the DataFrame stored under ``name``, or ``None`` if absent."""
        return self.dataframes.get(name)

    def split_csv(self, input_file, output_folder, parts=20, encoding='utf-8'):
        """Split a large CSV into roughly ``parts`` files of equal row count.

        The pieces are written to ``output_folder`` (created if missing) as
        ``chunk_1.csv``, ``chunk_2.csv``, ... each with its own header row.

        Args:
            input_file: Path of the CSV to split.
            output_folder: Directory that receives the chunk files.
            parts: Target number of output files. Fewer files are written
                when there are not enough rows to fill every part.
            encoding: Encoding used both for reading and for writing.

        Errors raised while counting, reading or writing are caught and
        printed; the method always returns ``None``.
        """
        os.makedirs(output_folder, exist_ok=True)

        try:
            # Count data rows as physical lines minus the header line. The
            # file is opened in a ``with`` block so the handle is closed as
            # soon as counting finishes. Fields containing embedded newlines
            # make this an over-estimate, which only yields larger chunks.
            with open(input_file, encoding=encoding) as source:
                total_rows = max(sum(1 for _ in source) - 1, 0)

            # Ceiling division: evenly divisible inputs yield exactly
            # ``parts`` files. Clamped to 1 because ``chunksize`` must be
            # a positive integer.
            chunk_size = max(1, (total_rows + parts - 1) // parts)
            print(f"總筆數: {total_rows}, 每份約 {chunk_size} 筆")

            reader = pd.read_csv(
                input_file,
                chunksize=chunk_size,
                encoding=encoding,
                low_memory=False,
            )
            for chunk_number, chunk in enumerate(reader, start=1):
                output_path = os.path.join(output_folder, f"chunk_{chunk_number}.csv")
                chunk.to_csv(output_path, index=False, encoding=encoding)
                print(f"已分割第 {chunk_number} 份：{output_path}")

            print("分割完成")
        except Exception as e:
            print(f"分割 {input_file} 時發生錯誤：{e}")


In [None]:
#合併資料，只取ShopMemberId, SalePageId, SalePageTitle三個欄位

import pandas as pd
import os
import glob

def batch_generate_order_summaries(
    ordersplit_dir,
    salepage_csv_path,
    output_dir
):
    """Join every order chunk with the sale-page titles and write summaries.

    For each ``chunk_*.csv`` in ``ordersplit_dir`` (except ``chunk_1.csv``
    and ``chunk_20.csv``, which are skipped), the ``ShopMemberId`` and
    ``SalePageId`` columns are left-joined with ``SalePageTitle`` from
    ``salepage_csv_path`` and saved as ``output_<chunk name>`` in
    ``output_dir``. Problems are printed and the affected file is skipped;
    the function returns ``None``.
    """
    # Load the sale-page table; every column is read as text.
    try:
        pages = pd.read_csv(salepage_csv_path, dtype=str)
    except Exception as e:
        print(f"讀取 {salepage_csv_path} 時發生錯誤：{e}")
        return

    if not {'SalePageId', 'SalePageTitle'}.issubset(pages.columns):
        print("SalePage 資料缺少必要欄位")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Gather the chunk files in sorted order, leaving out the two excluded ones.
    excluded_names = ("chunk_1.csv", "chunk_20.csv")
    pattern = os.path.join(ordersplit_dir, "chunk_*.csv")
    chunk_paths = [
        path for path in sorted(glob.glob(pattern))
        if not path.endswith(excluded_names)
    ]

    if not chunk_paths:
        print("找不到有效的 chunk_*.csv 檔案")
        return

    # Only these two columns of the sale-page table take part in the join.
    title_lookup = pages[['SalePageId', 'SalePageTitle']]

    for chunk_path in chunk_paths:
        filename = os.path.basename(chunk_path)
        output_path = os.path.join(output_dir, f"output_{filename}")

        try:
            orders = pd.read_csv(chunk_path, dtype=str)
        except Exception as e:
            print(f"讀取 {filename} 發生錯誤：{e}")
            continue

        if not {'ShopMemberId', 'SalePageId'}.issubset(orders.columns):
            print(f"{filename} 缺少必要欄位，跳過")
            continue

        # Left join keeps every order row, even without a matching title.
        summary = pd.merge(
            orders[['ShopMemberId', 'SalePageId']],
            title_lookup,
            on='SalePageId',
            how='left',
        )

        try:
            summary.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"已輸出：{output_path} ({len(summary)} 筆)")
        except Exception as e:
            print(f"儲存 {output_path} 時錯誤：{e}")


In [None]:
#篩選資料：刪除'退貨'的資料、留下 SalePageId 有值的資料、補上SalePageTitle的值、排除不需要的資料

import pandas as pd
import os
import glob

def filter_return_status(df):
    """Return the rows of ``df`` whose ``StatusDef`` is not ``'Return'``.

    When ``df`` has a ``StatusDef`` column, a filtered copy is returned.
    When it does not, the same ``df`` object is handed back unchanged.
    """
    if 'StatusDef' not in df.columns:
        return df

    not_returned = df['StatusDef'] != 'Return'
    return df[not_returned].copy()

def batch_generate_order_summaries(
    ordersplit_dir,
    salepage_csv_path,
    output_dir
):
    """Join every order chunk with sale-page titles, skipping returned orders.

    Each ``chunk_*.csv`` in ``ordersplit_dir`` is read as text, passed
    through ``filter_return_status`` (which drops rows whose ``StatusDef``
    is ``'Return'``), reduced to ``ShopMemberId`` / ``SalePageId`` and
    left-joined with ``SalePageTitle`` from ``salepage_csv_path``. The
    result is written to ``output_dir`` as ``output_<chunk name>``.
    Problems are printed and the affected file is skipped; the function
    returns ``None``.
    """
    # Load the sale-page table; every column is read as text.
    try:
        pages = pd.read_csv(salepage_csv_path, dtype=str)
    except Exception as e:
        print(f"讀取 {salepage_csv_path} 時發生錯誤：{e}")
        return

    if not {'SalePageId', 'SalePageTitle'}.issubset(pages.columns):
        print("SalePage 資料缺少必要欄位")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Every chunk file is processed here; none are excluded.
    pattern = os.path.join(ordersplit_dir, "chunk_*.csv")
    chunk_paths = sorted(glob.glob(pattern))

    if not chunk_paths:
        print("找不到有效的 chunk_*.csv 檔案")
        return

    # Only these two columns of the sale-page table take part in the join.
    title_lookup = pages[['SalePageId', 'SalePageTitle']]

    for chunk_path in chunk_paths:
        filename = os.path.basename(chunk_path)
        output_path = os.path.join(output_dir, f"output_{filename}")

        try:
            orders = pd.read_csv(chunk_path, dtype=str)
        except Exception as e:
            print(f"讀取 {filename} 發生錯誤：{e}")
            continue

        # Drop returned orders before checking for the required columns.
        orders = filter_return_status(orders)

        if not {'ShopMemberId', 'SalePageId'}.issubset(orders.columns):
            print(f"{filename} 缺少必要欄位，跳過")
            continue

        # Left join keeps every remaining order row, even without a title.
        summary = pd.merge(
            orders[['ShopMemberId', 'SalePageId']],
            title_lookup,
            on='SalePageId',
            how='left',
        )

        try:
            summary.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"已輸出：{output_path} ({len(summary)} 筆)")
        except Exception as e:
            print(f"儲存 {output_path} 時錯誤：{e}")


In [None]:
#統計分析

import pandas as pd

class DataAnalyzer:
    """Load one CSV file and print simple descriptive statistics for it."""

    def __init__(self, filepath):
        """Read ``filepath`` as an all-text DataFrame and print a summary.

        On any failure the error is printed, ``self.df`` is set to ``None``
        and ``self.total_rows`` to ``0``.
        """
        self.filepath = filepath
        try:
            self.df = pd.read_csv(filepath, dtype=str)
            self.total_rows = len(self.df)
            for line in (
                f"資料已載入：{filepath}",
                f"資料筆數: {self.total_rows}",
                f"欄位名稱: {self.df.columns.tolist()}",
            ):
                print(line)
        except Exception as e:
            print(f"讀取資料時發生錯誤：{e}")
            self.df = None
            self.total_rows = 0

    def describe(self):
        """Print ``DataFrame.describe(include='all')`` for the loaded data."""
        if self.df is None:
            print("尚未載入資料，無法執行描述統計。")
            return

        print("\n基本統計描述：")
        print(self.df.describe(include='all'))

    def show_top10_cumulative(self, column_name):
        """Print the 10 most frequent values of a column with running share.

        The percentage is each value's count divided by the total row
        count, accumulated down the table and rounded to 4 decimals.
        Missing values are counted as a value of their own.
        """
        if self.df is None:
            print("尚未載入資料，無法顯示統計。")
            return

        if column_name not in self.df.columns:
            print(f"欄位 '{column_name}' 不存在。")
            return

        leaders = self.df[column_name].value_counts(dropna=False).head(10)
        running_share = (leaders / self.total_rows * 100).cumsum().round(4)
        table = pd.DataFrame({
            'count': leaders,
            'cumulative_percentage (%)': running_share,
        })
        print(f"\n{column_name} Top 10 出現頻率與累積百分比：")
        print(table)


In [None]:
#分群、降維、可視化

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(filepath, top_n_items=20):
    """Read the order data and build a 0/1 member-by-item matrix.

    Only the ``top_n_items`` most frequent ``SalePageId`` values are kept.
    The result has one row per ``ShopMemberId`` and one column per kept
    ``SalePageId``; a cell is 1 when that pair occurs at least once in
    the file and 0 otherwise.
    """
    orders = pd.read_csv(filepath)

    # value_counts() sorts by frequency, so head() yields the top sellers.
    popular_ids = orders['SalePageId'].value_counts().head(top_n_items).index
    is_popular = orders['SalePageId'].isin(popular_ids)
    popular_orders = orders[is_popular]

    purchase_counts = pd.crosstab(
        popular_orders['ShopMemberId'],
        popular_orders['SalePageId'],
    )
    # Cap counts at 1 to turn purchase counts into a bought / not-bought flag.
    return purchase_counts.clip(upper=1)

def cluster_users(user_item, n_clusters=10, pca_components=2, cluster_on_pca=True):
    """Standardize the member-item matrix, optionally apply PCA, and cluster.

    Args:
        user_item: DataFrame of numeric features, one row per member. A
            ``'Cluster'`` column is added to (or overwritten in) this
            frame in place.
        n_clusters: Number of clusters for ``MiniBatchKMeans``.
        pca_components: Number of PCA components; only used when
            ``cluster_on_pca`` is true.
        cluster_on_pca: When true, clustering runs on the PCA projection;
            otherwise on the standardized features.

    Returns:
        A tuple ``(user_item, X, clusters)`` where ``X`` is the array that
        was clustered (PCA projection or standardized features) and
        ``clusters`` holds one label per row.
    """
    # A previous call leaves a 'Cluster' column behind in the caller's frame.
    # Leave it out so old labels are never fed back in as a feature.
    features = user_item.drop(columns=['Cluster'], errors='ignore')

    X = StandardScaler().fit_transform(features)
    if cluster_on_pca:
        X = PCA(n_components=pca_components).fit_transform(X)

    # Fixed random_state, as in the original, for the KMeans initialisation.
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=10000)
    clusters = kmeans.fit_predict(X)

    user_item['Cluster'] = clusters
    return user_item, X, clusters

def plot_clusters(X_2d, clusters, title='K-Means Cluster Result (After PCA)'):
    """Show a 2-D scatter plot of the points, colored by cluster label.

    ``X_2d`` is loaded into a frame with exactly two columns (``PCA1`` and
    ``PCA2``) and ``clusters`` supplies the hue for each point.
    """
    points = pd.DataFrame(X_2d, columns=['PCA1', 'PCA2']).assign(Cluster=clusters)

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', palette='tab10', data=points)
    # Replace the default axis labels (the column names) with readable ones.
    plt.ylabel("PCA Component 2")
    plt.xlabel("PCA Component 1")
    plt.title(title)
    plt.show()


In [None]:
if __name__ == "__main__":
    # The commented-out steps below are the earlier pipeline stages. They were
    # already disabled in the original and are kept for reference; each one
    # writes the files that the next stage reads.

    # loader = DataLoader()
    #
    # # Load a whole dataset in one go
    # loader.load_csv("salepage", "C:/Users/user/Desktop/91APPDataset/SalePage.csv")
    # loader.preview("salepage")
    # salepage_df = loader.get_dataframe("salepage")
    #
    # loader.load_csv("order2", "C:/Users/user/Desktop/Ordersplit/chunk_2.csv")
    # loader.preview("order2")
    # # NOTE(review): the original looked up "order1" here, a name that is never
    # # loaded above, and stored the result in salepage_df.
    # order2_df = loader.get_dataframe("order2")
    #
    # # Split the large order file
    # loader.split_csv("C:/Users/user/Desktop/91APPDataset/Order_TS.csv",
    #                  "C:/Users/user/Desktop/Ordersplit",
    #                  parts=20, encoding="utf-8"
    # )

    # # Merge the data and drop returned orders at the same time
    # batch_generate_order_summaries(
    #     ordersplit_dir="C:/Users/user/Desktop/Ordersplit",
    #     salepage_csv_path="C:/Users/user/Desktop/91APPDataset/SalePage.csv",
    #     output_dir="C:/Users/user/Desktop/Ordersplit_Output"
    # )

    # # Filter the data
    # # NOTE(review): DataFilter is not defined in the visible part of this
    # # file; confirm where it lives before re-enabling these steps.
    # # 1. Keep only the rows that have a valid SalePageId
    # f = DataFilter()
    # f.merge_valid_salepage_rows(
    #     input_dir=r"C:/Users/user/Desktop/Ordersplit_Output",
    #     output_path=r"C:/Users/user/Desktop/Ordersplit_Output2/merged_salepage_valid.csv"
    # )
    # # 2. Fill in SalePageTitle
    # f.fill_salepage_title(
    #     salepage_csv_path=r"C:/Users/user/Desktop/91APPDataset/SalePage.csv",
    #     output_path=r"C:/Users/user/Desktop/Ordersplit_Output2/filled_salepage_data.csv"
    # )
    # # Items that show up many times: id=7855864 "(贈)vivo數位印花" and
    # # id=8995530 "(贈)印花貼紙"
    # # 3. Exclude specific SalePageIds
    # f.exclude_salepage_ids(
    #     exclude_ids=['7855864', '8995530'],
    #     output_path=r"C:/Users/user/Desktop/Ordersplit_Output2/filtered_salepage_data.csv"
    # )

    # Both steps below read the filtered file that step 3 above writes into
    # Ordersplit_Output2. The clustering step used to point at
    # Ordersplit_Output instead, a folder no visible step writes this file to.
    filtered_csv = r"C:/Users/user/Desktop/Ordersplit_Output2/filtered_salepage_data.csv"

    # Summary statistics of the dataset
    analyzer = DataAnalyzer(filtered_csv)
    analyzer.describe()
    analyzer.show_top10_cumulative("SalePageId")

    # Take the 20 most frequent items, reduce dimensionality and cluster,
    # plot the clusters, then print the mean purchase flags per cluster.
    user_item = load_and_preprocess_data(filtered_csv, top_n_items=20)
    user_item, X_pca, clusters = cluster_users(user_item, n_clusters=10, pca_components=2, cluster_on_pca=True)
    plot_clusters(X_pca, clusters)
    cluster_profiles = user_item.groupby('Cluster').mean()
    print(cluster_profiles)

    