# 作業5: Decision Tree
南台科技大學 碩士班 機器學習 作業

|||
|:---:|:---:|
|學生|陳聖文|
|學號|MB1G0110|
|班級|碩研資工一甲|

## 匯入套件庫

*   math.log2
*   pandas

In [153]:
from math import log2
from collections import Counter
import pandas as pd

## 建立資料表
建立是否購買筆電的資料表，在最後面添加上自己的一筆資料，共12筆

In [154]:
df = pd.DataFrame(
    data=[
      ["<=30", "高", False, False],
      ["31...40", "高", False, True],
      [">40", "中", False, True],
      [">40", "低", True, False],
      ["31...40", "低", True, True],
      ['<=30', "中", False, False],
      ['<=30', "低", True, True],
      ['<=30', "中", True, True],
      ['31...40', "中", False, True],
      ['31...40', "高", True, True],
      ['>40', "中", True, False],
      ['<=30', "低", True, True] ## 添加自己的資料
    ],
    columns = ["年紀", "收入", "是否為學生", "購買筆電與否"]
)

## 定義公式

### 熵計算公式

$$ \text{Entropy(S)} = -p_+\log_2p_+-p_-\log_2p_- $$

In [155]:
def entropy(s, precision=3):
  p_positive = s[0] / sum(s)
  p_negative = s[1] / sum(s)

  if p_positive == 0 or p_negative == 0:
    return 0

  return round(
      - p_positive * log2(p_positive) - p_negative * log2(p_negative),
      precision)

In [156]:
def multi_to_entropy(s, a, precision=3):
  result = 0
  for item in a:
    result += sum(item) / sum(s) * entropy(item)
  
  return round(result, precision)

### Gain 公式

$$ \text{Gain(S, A)} = \text{Entropy(S)} - \sum_{v\in\text{value(A)}} |\frac{\text{S}_\text{v}}{\text{S}}|\text{Entropy(S}_\text{v}\text{)} $$

In [157]:
def gain(s, a, precision=3):
  entropy_s = entropy(s)
  for item in a:
    entropy_s -= (sum(item) / sum(s)) * entropy(item)
  
  return round(entropy_s, precision)

In [158]:
print(gain([29, 35], [[21, 5], [8, 30]]))

0.267


## 計算決策樹

### 消費者是否購買筆電

In [159]:
afford = Counter(df["購買筆電與否"])
afford_S = [afford[True], afford[False]]

show_df = pd.DataFrame(
    data=[
        ["是", afford_S[0], round(afford_S[0] / sum(afford_S), 2)],
        ["否", afford_S[1], round(afford_S[1] / sum(afford_S), 2)]
    ],
    columns=["購買筆電與否", "出現次數", "出現機率"]
)
afford_E = entropy(afford_S)
print("熵:", afford_E)
show_df

熵: 0.918


Unnamed: 0,購買筆電與否,出現次數,出現機率
0,是,8,0.67
1,否,4,0.33


### 計算收入特徵值下是否購買筆電

In [160]:
incomes_true_high = len(df[(df["收入"] == "高") & (df['購買筆電與否'] == True)])
incomes_false_high = len(df[(df["收入"] == "高") & (df['購買筆電與否'] == False)])
incomes_true_mid = len(df[(df["收入"] == "中") & (df['購買筆電與否'] == True)])
incomes_false_mid = len(df[(df["收入"] == "中") & (df['購買筆電與否'] == False)])
incomes_true_low = len(df[(df["收入"] == "低") & (df['購買筆電與否'] == True)])
incomes_false_low = len(df[(df["收入"] == "低") & (df['購買筆電與否'] == False)])
incomes = [
    [incomes_true_high, incomes_false_high],
    [incomes_true_mid, incomes_false_mid], 
    [incomes_true_low, incomes_false_low]
]

show_df = pd.DataFrame(
    data=[
        ["高", incomes[0][0], incomes[0][1]],
        ["中", incomes[1][0], incomes[1][1]],
        ["低", incomes[2][0], incomes[2][1]]
    ],
    columns=["收入", "購買", "不購買"]
)
incomes_E = multi_to_entropy(afford_S, incomes)
print("熵:", incomes_E)
show_df

熵: 0.904


Unnamed: 0,收入,購買,不購買
0,高,2,1
1,中,3,2
2,低,3,1


### 計算學生特徵值下是否購買筆電

In [161]:
student_true_positive = len(df[(df['是否為學生'] == True) & (df['購買筆電與否'] == True)])
student_false_positive = len(df[(df['是否為學生'] == True) & (df['購買筆電與否'] == False)])
student_true_negative = len(df[(df['是否為學生'] == False) & (df['購買筆電與否'] == True)])
student_false_negative = len(df[(df['是否為學生'] == False) & (df['購買筆電與否'] == False)])

student_true = [student_true_positive, student_true_negative]
student_false = [student_false_positive, student_false_negative]
studnets = [student_true, student_false]

show_df = pd.DataFrame(
    data=[
        ["是", studnets[0][0], studnets[0][1]],
        ["否", studnets[1][0], studnets[1][1]]
    ],
    columns=["學生", "購買", "不購買"]
)
students_E = multi_to_entropy(afford_S, studnets)
print("熵:", students_E)
show_df

熵: 0.969


Unnamed: 0,學生,購買,不購買
0,是,5,3
1,否,2,2


### 計算年紀特徵值下是否購買筆電

In [162]:
age_less_30_true = len(df[(df['年紀'] == "<=30") & (df['購買筆電與否'] == True)])
age_less_30_false = len(df[(df['年紀'] == "<=30") & (df['購買筆電與否'] == False)])
age_3140_true = len(df[(df['年紀'] == "31...40") & (df['購買筆電與否'] == True)])
age_3140_false = len(df[(df['年紀'] == "31...40") & (df['購買筆電與否'] == False)])
age_large_40_true = len(df[(df['年紀'] == ">40") & (df['購買筆電與否'] == True)])
age_large_40_false = len(df[(df['年紀'] == ">40") & (df['購買筆電與否'] == False)])

ages = [
    [age_less_30_true, age_less_30_false],
    [age_3140_true, age_3140_false],
    [age_large_40_true, age_large_40_false]
]
studnets = [student_true, student_false]

show_df = pd.DataFrame(
    data=[
        ["<=30", ages[0][0], ages[0][1]],
        ["31...40", ages[1][0], ages[1][1]],
        [">40", ages[2][0], ages[2][1]]
    ],
    columns=["年紀", "購買", "不購買"]
)
ages_E = multi_to_entropy(afford_S, ages)
print("熵:", ages_E)
show_df

熵: 0.634


Unnamed: 0,年紀,購買,不購買
0,<=30,3,2
1,31...40,4,0
2,>40,1,2


## 計算資訊獲利

In [163]:
info_gain = pd.DataFrame(
    data=[
        ["收入", afford_E - incomes_E],
        ["年紀", afford_E - ages_E],
        ["是否為學生", afford_E - students_E],
    ],
    columns=["特徵值", "資訊獲利"]
)
info_gain

Unnamed: 0,特徵值,資訊獲利
0,收入,0.014
1,年紀,0.284
2,是否為學生,-0.051
