In [1]:
from collections import defaultdict
import math

# 最大熵模型

## 模型訓練
- 初始化參數
- 計算特徵期望
- 設定收斂條件

## 模型預測
- 計算特徵權重
- 計算條件機率

In [2]:
class MaxEnt(object):
    """Maximum-entropy classifier trained with Generalized Iterative Scaling (GIS).

    Each training record is a whitespace-separated line whose first token is
    the label and whose remaining tokens are features. Every distinct
    ``(label, feature)`` pair seen in the data is one binary feature function.

    Typical use::

        model = MaxEnt()
        model.load_data('data.txt')
        model.train()
        model.predict('Rainy Happy Dry')

    ``train()`` must be (re-)run after ``load_data()`` before predicting,
    because training is what assigns feature ids and weights.
    """

    def __init__(self):
        # (label, feature) -> occurrence count while loading; _initparams()
        # rewrites the values to feature ids used to index the weight vector.
        self.feats = defaultdict(int)
        self.trainset = []   # training records, each a list [label, feat, ...]
        self.labels = set()  # every label seen in the data
        # Occurrence counts kept separately so that train() can be called
        # more than once without losing them when self.feats becomes an id map.
        self._feat_counts = defaultdict(int)

    def load_data(self, file, verbose=True):
        """Read training records from a text file.

        Args:
            file: path of the file to read; one record per line.
            verbose: when True (default), print every (label, feature) pair
                as it is counted.
        """
        with open(file) as handle:  # closed even if parsing raises
            for line in handle:
                fields = line.strip().split()
                if len(fields) < 2:
                    continue  # need a label plus at least one feature
                label = fields[0]  # first column is the label
                self.labels.add(label)
                # set(): a pair is counted at most once per record
                for f in set(fields[1:]):
                    self._feat_counts[(label, f)] += 1
                    self.feats[(label, f)] += 1
                    if verbose:
                        print(label, f)
                self.trainset.append(fields)

    # ---- training ----
    def train(self, max_iter=1000, verbose=True):
        """Fit the weights with GIS.

        Args:
            max_iter: upper bound on the number of GIS iterations.
            verbose: when True (default), print the iteration number and the
                current weights / feature table on every iteration.

        Raises:
            ValueError: if no training records have been loaded.
        """
        self._initparams()
        for iteration in range(max_iter):
            if verbose:
                print('iter %d ...' % (iteration + 1))
            self.ep = self.Ep()       # feature expectation under the model
            self.lastw = self.w[:]    # snapshot for the convergence test
            for idx in range(len(self.w)):
                delta = 1.0/self.M * math.log(self.ep_[idx]/ self.ep[idx])
                self.w[idx] += delta
            if verbose:
                print(self.w, self.feats)
            if self._convergence(self.lastw, self.w):
                break

    def _initparams(self):
        """Reset training state: sizes, empirical expectations, ids, weights."""
        counts = getattr(self, '_feat_counts', None)
        if not counts:
            # self.feats was filled without load_data(): its values are still
            # counts at this point, so snapshot them before they become ids.
            counts = self._feat_counts = defaultdict(int, self.feats)

        self.size = len(self.trainset)
        if self.size == 0:
            raise ValueError('cannot train: no training records loaded')
        # GIS constant used to scale each update in train(): the largest
        # number of feature tokens in any record.
        self.M = max(len(record) - 1 for record in self.trainset)

        self.ep_ = [0.0] * len(counts)
        for i, (f, count) in enumerate(counts.items()):
            # feature expectation under the empirical distribution
            self.ep_[i] = float(count) / float(self.size)
            self.feats[f] = i  # assign each feature function its id

        self.w = [0.0] * len(counts)
        self.lastw = self.w[:]  # independent copy, not an alias of self.w

    def Ep(self):
        """Return the feature expectations under the current model.

        Accumulates (1/N) * p(y|x) for every (y, feature) pair that is a known
        feature, over all training records, with p(x) taken as 1/N.

        NOTE(review): load_data() counts a feature once per record, while this
        loop visits every token of the record; the two only agree when a
        record does not repeat a feature.
        """
        ep = [0.0] * len(self.feats)
        for record in self.trainset:
            features = record[1:]
            prob = self.calprob(features)  # p(y|x) for every label
            for f in features:
                for p, l in prob:
                    if (l, f) in self.feats:  # only features seen in training
                        ep[self.feats[(l, f)]] += p * (1.0/self.size)
        return ep

    def _convergence(self, lastw, w):
        """Return True when every weight moved by less than 0.01."""
        for w1, w2 in zip(lastw, w):
            if abs(w1 - w2) >= 0.01:
                return False
        return True

    # ---- prediction ----
    def predict(self, input):
        """Score a whitespace-separated feature string.

        Returns:
            A list of (probability, label) tuples sorted in descending order.
        """
        features = input.strip().split()
        prob = self.calprob(features)
        prob.sort(reverse=True)
        return prob

    def probwgt(self, features, label):
        """Return exp(sum of weights of the known (label, feature) pairs)."""
        wgt = 0.0
        for f in features:
            if (label, f) in self.feats:
                wgt += self.w[self.feats[(label, f)]]
        return math.exp(wgt)

    def calprob(self, features):
        """Return [(p(label | features), label), ...] for every known label."""
        wgts = [(self.probwgt(features, l), l) for l in self.labels]
        Z = sum(w for w, l in wgts)  # normalisation constant
        return [(w / Z, l) for w, l in wgts]
    
    
           

In [3]:
# MaxEnt is defined in the cell above, so no `import maxent` is needed here.
model = MaxEnt()
model.load_data('data.txt') # load the training set
model.train() # fit the model

Outdoor Happy
Outdoor Sunny
Outdoor Dry
Outdoor Happy
Outdoor Sunny
Outdoor Happy
Outdoor Humid
Outdoor Sunny
Outdoor Dry
Outdoor Sad
Outdoor Sunny
Outdoor Sad
Outdoor Humid
Outdoor Sunny
Outdoor Happy
Outdoor Cloudy
Outdoor Humid
Outdoor Happy
Outdoor Cloudy
Outdoor Humid
Outdoor Sad
Outdoor Cloudy
Outdoor Humid
Outdoor Sad
Outdoor Cloudy
Outdoor Humid
Indoor Rainy
Indoor Dry
Indoor Happy
Indoor Rainy
Indoor Dry
Indoor Sad
Indoor Rainy
Indoor Sad
Indoor Humid
Indoor Sad
Indoor Cloudy
Indoor Humid
Indoor Sad
Indoor Cloudy
Indoor Humid
iter 1 ...
[0.17027520792199694, 0.2310490601866485, 0.0, 0.09589402415059373, 7.401486830834375e-17, 0.09589402415059367, 0.23104906018664842, 0.0, -0.36620409622270317, 7.401486830834375e-17, -0.1351550360360547, -0.13515503603605475] defaultdict(<class 'int'>, {('Outdoor', 'Happy'): 0, ('Outdoor', 'Sunny'): 1, ('Outdoor', 'Dry'): 2, ('Outdoor', 'Humid'): 3, ('Outdoor', 'Sad'): 4, ('Outdoor', 'Cloudy'): 5, ('Indoor', 'Rainy'): 6, ('Indoor', 'Dry'): 7, (

In [4]:
# predict() returns (probability, label) pairs sorted from most to least probable.
print(model.predict('Rainy Happy Dry'))

[(0.9270055942186617, 'Indoor'), (0.07299440578133828, 'Outdoor')]


# 特徵組合

In [5]:
# Flatten the dataset: one (label, feature) tuple per feature token.
train = []
with open('data.txt') as data_file:  # context manager closes the file
    for line in data_file:
        field = line.strip().split()
        if len(field) < 2:
            continue  # skip lines without a label plus at least one feature
        label = field[0]

        for j in field[1:]:
            train.append((label, j))
    

In [6]:
import pandas as pd

# Count how often each (label, feature) pair occurs, most frequent first.
test = pd.DataFrame(train)
group = (
    test.groupby([0, 1])
    .size()
    .reset_index(name='count')
)
group.sort_values(by=['count'], ascending=False)

Unnamed: 0,0,1,count
9,Outdoor,Humid,6
8,Outdoor,Happy,5
11,Outdoor,Sunny,5
5,Indoor,Sad,4
6,Outdoor,Cloudy,4
10,Outdoor,Sad,4
3,Indoor,Humid,3
4,Indoor,Rainy,3
0,Indoor,Cloudy,2
1,Indoor,Dry,2


# enumerate 用法舉例

In [7]:
# start=1 makes enumerate count from 1 instead of the default 0.
d = ['Spring', 'Summer', 'Fall', 'Winter']
for position, season in enumerate(d, start=1):
    print(position, season)

1 Spring
2 Summer
3 Fall
4 Winter


In [8]:
# Without a start argument enumerate counts from 0.
d = ['Spring', 'Summer', 'Fall', 'Winter']
for index, season in enumerate(d):
    print(index, season)

0 Spring
1 Summer
2 Fall
3 Winter


In [9]:
# Print only the index of each element.
d = ['Spring', 'Summer', 'Fall', 'Winter']
for index, _season in enumerate(d):
    print(index)

0
1
2
3


In [10]:
# Print only the value of each element.
d = ['Spring', 'Summer', 'Fall', 'Winter']
for _index, season in enumerate(d):
    print(season)

Spring
Summer
Fall
Winter
