## Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/gdrive')
from pathlib import Path
Path("/gdrive/MyDrive/Colab Notebooks/").mkdir(parents=True, exist_ok=True)
%ls /gdrive/MyDrive/Colab\ Notebooks/

Mounted at /gdrive
correlation_heatmap.png                    [0m[01;34moutlier[0m/
[01;34mdistribution[0m/                              outlier_flagging.png
low_variance_features.png                  processed_train_X.csv
ObesityDataSet_raw_and_data_sinthetic.csv  processed_train_y.csv
obesity_test.csv                           [01;34msummary[0m/
obesity_test_y.csv                         target_balance.png
obesity_train.csv                          「Term_Project_Preprocessing.ipynb」的副本
obesity_train_X.csv                        Untitled0.ipynb
obesity_train_y.csv


# Import data and combine features and obesity types

In [None]:
import pandas as pd

# Locations of the preprocessed training features and their labels.
features_path = "/gdrive/MyDrive/Colab Notebooks/processed_train_X.csv"
labels_path = "/gdrive/MyDrive/Colab Notebooks/processed_train_y.csv"

# Load both tables.
features = pd.read_csv(features_path)
labels = pd.read_csv(labels_path)

# Place features and labels side by side, then discard the outlier flag column.
data = pd.concat([features, labels], axis=1).drop(columns='outlier_flag')

# Preview the first rows of the combined table.
print(data.head())

   CH2O  NCP  TUE  Age  FAF  FCVC  Gender_Male  \
0     2    3    1   21    0     2        False   
1     3    3    0   21    3     3        False   
2     2    3    1   23    2     2         True   
3     2    3    0   27    2     3         True   
4     2    1    0   22    0     2         True   

   family_history_with_overweight_yes  CALC_Frequently  CALC_Sometimes  \
0                                True            False           False   
1                                True            False            True   
2                                True             True           False   
3                               False             True           False   
4                               False            False            True   

   CALC_no  FAVC_yes  CAEC_Frequently  CAEC_Sometimes  CAEC_no  \
0     True     False            False            True    False   
1    False     False            False            True    False   
2    False     False            False            True   

# Bin numerical features and one-hot encode

In [None]:
# Discretise the numeric columns into labelled ranges, then one-hot encode.

# Age
age_bins = [0, 18, 30, 45, 60, 100]
age_labels = ['Teen', 'Young_Adult', 'Middle_Age', 'Senior', 'Elderly']

# FCVC (vegetable intake frequency)
fcvc_bins = [0, 1, 2, 3]
fcvc_labels = ['Low', 'Medium', 'High']

# NCP (meals per day)
ncp_bins = [0, 2, 3, 10]
ncp_labels = ['Low', 'Medium', 'High']

# CH2O (water intake)
ch2o_bins = [0, 1, 2, 10]
ch2o_labels = ['Low', 'Medium', 'High']

# FAF (exercise frequency)
faf_bins = [0, 1, 3, 10]
faf_labels = ['Low', 'Medium', 'High']

# TUE (time spent on technology devices)
tue_bins = [0, 1, 2, 10]
tue_labels = ['Low', 'Medium', 'High']

# Source column -> (bin edges, labels). Insertion order fixes the order of the
# new "<column>_Group" columns.
binning_specs = {
    'Age': (age_bins, age_labels),
    'FCVC': (fcvc_bins, fcvc_labels),
    'NCP': (ncp_bins, ncp_labels),
    'CH2O': (ch2o_bins, ch2o_labels),
    'FAF': (faf_bins, faf_labels),
    'TUE': (tue_bins, tue_labels),
}

for column, (bins, labels_for_bins) in binning_specs.items():
    # include_lowest=True closes the first interval on the left. Without it a
    # value equal to the first edge (e.g. FAF == 0 or TUE == 0) matches no bin,
    # becomes NaN and ends up with no dummy column set at all.
    data[f'{column}_Group'] = pd.cut(
        data[column], bins=bins, labels=labels_for_bins, include_lowest=True
    )

# The raw numeric columns are replaced by their binned versions.
data = data.drop(columns=list(binning_specs))

# One-hot encode the binned columns together with the remaining categorical
# columns and the target.
data_encoded = pd.get_dummies(data, columns=[
    'Age_Group', 'FCVC_Group', 'NCP_Group', 'CH2O_Group', 'FAF_Group', 'TUE_Group',
    'Gender_Male', 'family_history_with_overweight_yes', 'FAVC_yes',
    'CALC_Frequently', 'CALC_Sometimes', 'CALC_no',
    'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
    'NObeyesdad',
])

# Preview the encoded table and persist it for the mining step.
print(data_encoded.head())
data_encoded.to_csv('/gdrive/MyDrive/Colab Notebooks/data_encoded.csv', index=False)

   Age_Group_Teen  Age_Group_Young_Adult  Age_Group_Middle_Age  \
0           False                   True                 False   
1           False                   True                 False   
2           False                   True                 False   
3           False                   True                 False   
4           False                   True                 False   

   Age_Group_Senior  Age_Group_Elderly  FCVC_Group_Low  FCVC_Group_Medium  \
0             False              False           False               True   
1             False              False           False              False   
2             False              False           False               True   
3             False              False           False              False   
4             False              False           False               True   

   FCVC_Group_High  NCP_Group_Low  NCP_Group_Medium  ...  CAEC_Sometimes_True  \
0            False          False              True  ...   

# Install pyECLAT

In [2]:
!pip install pyeclat

Collecting pyeclat
  Downloading pyECLAT-1.0.2-py3-none-any.whl.metadata (4.0 kB)
Downloading pyECLAT-1.0.2-py3-none-any.whl (6.3 kB)
Installing collected packages: pyeclat
Successfully installed pyeclat-1.0.2


# Mine frequent itemsets with ECLAT

In [None]:
from pyECLAT import ECLAT

# Reload the one-hot encoded table saved by the encoding step.
data = pd.read_csv('/gdrive/MyDrive/Colab Notebooks/data_encoded.csv')

# Turn every row into a transaction: the names of the columns whose value
# equals 1 (True counts as 1). Built from a single boolean mask instead of a
# Python-level iterrows() loop.
is_present = (data == 1).to_numpy()
transactions = [list(data.columns[row_mask]) for row_mask in is_present]

# One transaction per row; rows shorter than the longest one are padded with
# missing values by the DataFrame constructor.
df = pd.DataFrame(transactions)

# Run ECLAT on the transaction table.
eclat_instance = ECLAT(data=df, verbose=True)

# NOTE(review): per the pyECLAT README, fit() returns
# (itemset -> transaction indexes, itemset -> support). The second element is
# the one that belongs in the "Support" column; confirm against the installed
# version.
_, rules = eclat_instance.fit(
    min_support=0.05,
    min_combination=2,
    max_combination=11,
    separator=' ',
    verbose=True,
)

# Save the frequent itemsets with their support values.
frequent_set_df = pd.DataFrame(list(rules.items()), columns=["Itemsets", "Support"])
frequent_set_df.to_csv('/gdrive/MyDrive/Colab Notebooks/frequent_item_set.csv', index=False)

ModuleNotFoundError: No module named 'pyECLAT'