In [2]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

df = pd.read_csv('./data_to_transaction.csv',sep='delimiter', header=None, engine='python')

# list of lists
transactions = df[0].str.split(",").tolist()

# one-hot encoding DataFrame
# X：one-hot encoded transactions（input）。
# y：是否包含 frequent patterns（target）
unique_items = set()

for transaction in transactions:
    for item in transaction:
        unique_items.add(item)

all_items = sorted(unique_items)


def create_transaction_dataframe(transactions, all_items):
    transaction_data = [] 
    for trans in transactions:  
        transaction_row = []  
        for item in all_items:  
            if item in trans:  # 交易包含該項目
                transaction_row.append(1)  # 設為 1
            else:
                transaction_row.append(0)  
        transaction_data.append(transaction_row) 
    df_trans = pd.DataFrame(transaction_data, columns=all_items)  
    return df_trans

df_trans = create_transaction_dataframe(transactions, all_items) 


frequent_itemsets = fpgrowth(df_trans, min_support=0.01, use_colnames=True)

# 選擇兩項以上的frequent itemset 作為 ground truth
frequent_patterns = frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda x: len(x) > 1)]




In [3]:
print(df_trans)

       -1  1010331  1010436  1010709  1010898  1011633  1011864  1011885  \
0       0        0        0        0        0        0        0        0   
1       0        0        0        0        0        0        0        0   
2       0        0        0        0        0        0        0        0   
3       0        0        0        0        0        0        0        0   
4       1        0        0        0        0        0        0        0   
...    ..      ...      ...      ...      ...      ...      ...      ...   
19934   0        0        0        0        0        0        0        0   
19935   0        0        0        0        0        0        0        0   
19936   0        0        0        0        0        0        0        0   
19937   0        0        0        0        0        0        0        0   
19938   0        0        0        0        0        0        0        0   

       1011927  1011948  ...  999159  999180  999285  999306  999369  999390  \
0      

  and should_run_async(code)


In [4]:
print(frequent_patterns)

       support            itemsets
827   0.027835  (1787583, 1787079)
828   0.010031  (1764609, 1787583)
829   0.011134   (1764609, 464331)
830   0.013140   (1764609, 464352)
831   0.011936    (475293, 480165)
...        ...                 ...
2228  0.011886    (489195, 489174)
2229  0.010532    (489174, 489216)
2230  0.011886    (490476, 490455)
2231  0.013591    (464352, 490455)
2232  0.010933    (492219, 480165)

[1406 rows x 2 columns]


  and should_run_async(code)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def contains_frequent_pattern(transaction, frequent_patterns):
    for pattern in frequent_patterns['itemsets']:  
        if set(pattern).issubset(set(transaction)):  # 此交易含frequent itemset
            return True  
    return False  

# 標記交易是否包含frequent itemset
# 交易含 frequent pattern 為 1，否則為 0
def add_target_column(df_trans, transactions, frequent_patterns):
    target_values = [] 
    for trans in transactions: 
        target_values.append(contains_frequent_pattern(trans, frequent_patterns))  # 加入到列表中
    df_trans["target"] = target_values 
    return df_trans

df_trans = add_target_column(df_trans, transactions, frequent_patterns) 

# feature engineering
df_trans["total_items"] = df_trans.sum(axis=1) 

X = df_trans.drop(columns=["target"])
y = df_trans["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  and should_run_async(code)


In [23]:
# print(df_trans["total_items"])
# print(df_trans["target"])


  and should_run_async(code)


In [None]:

decisionTree = DecisionTreeClassifier(max_depth=15, random_state=42) # 0.8528084252758275
# decisionTree = DecisionTreeClassifier(max_depth=10, min_samples_split=3, min_samples_leaf=5, random_state=42) # 0.844282848545637# 0.7231695085255767
# DecisionTree = DecisionTreeClassifier(max_depth=5, random_state=42) # 0.6908224674022067
decisionTree.fit(X_train, y_train)

y_pred_decisionTree = decisionTree.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_decisionTree))

  and should_run_async(code)


Decision Tree Accuracy: 0.8528084252758275


In [None]:
import pandas as pd

df = pd.read_csv("./transaction_data.csv")

item_lookup = df[['ItemCode', 'ItemDescription']].drop_duplicates().set_index('ItemCode')['ItemDescription'].to_dict()

def convert_to_description(itemset):
    return [item_lookup.get(int(item), f"ItemCode {item} Not Found") for item in itemset]

# 把 Frequent Patterns 轉成 Description
frequent_patterns["itemsets_description"] = frequent_patterns["itemsets"].apply(convert_to_description)

# 顯示結果
print(frequent_patterns[["support", "itemsets_description"]])


  and should_run_async(code)


       support                               itemsets_description
827   0.027835                 [wrongly marked carton 22804, nan]
828   0.010031                 [nan, wrongly marked carton 22804]
829   0.011134                [nan, SCOTTIE DOG HOT WATER BOTTLE]
830   0.013140                  [nan, CHOCOLATE HOT WATER BOTTLE]
831   0.011936                                         [nan, nan]
...        ...                                                ...
2228  0.011886  [SET OF 12 MINI LOAF BAKING CASES, SET OF 6 SN...
2229  0.010532  [SET OF 6 SNACK LOAF BAKING CASES, SET OF 6 TE...
2230  0.011886  [LOVE HOT WATER BOTTLE, HOT WATER BOTTLE KEEP ...
2231  0.013591  [CHOCOLATE HOT WATER BOTTLE, HOT WATER BOTTLE ...
2232  0.010933                  [HAND WARMER RED LOVE HEART, nan]

[1406 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequent_patterns["itemsets_description"] = frequent_patterns["itemsets"].apply(convert_to_description)


In [12]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
randomForest.fit(X_train, y_train)

y_pred_randomForest = randomForest.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_randomForest)) #0.6915747241725175


  and should_run_async(code)


Random Forest Accuracy: 0.813691073219659


In [20]:
from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
clf.fit(X_train, y_train)


  and should_run_async(code)


ImportError: cannot import name 'Float32Dtype' from 'pandas' (c:\Users\zoezo\anaconda3\lib\site-packages\pandas\__init__.py)

In [18]:
! pip install xgboost

  and should_run_async(code)


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4
