In [1]:
import pandas as pd 
import numpy as np

## 1. 数据读取

In [2]:
# Both splits are read as tab-separated text despite the .csv extension.
train_df, test_df = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('./data/train.csv', './data/test.csv')
)
train_df.shape[0], test_df.shape[0]  # row counts: (train, test)

(57039, 11208)

In [3]:
# Preview the first rows of the training frame (review text, rating, vote counts, label).
train_df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,votes_up,votes_all,label
0,7885,3901,"First off, allow me to correct a common mistak...",5.0,6,7,0
1,52087,47978,I am really troubled by this Story and Enterta...,3.0,99,134,0
2,5701,3667,A near-perfect film version of a downright glo...,4.0,14,14,1
3,47191,40892,Keep your expectations low. Really really low...,1.0,4,7,0
4,40957,15367,"""they dont make em like this no more...""well.....",5.0,3,6,0


In [4]:
# Preview the first rows of the test frame; it carries an Id column and no label/vote columns.
test_df.head()

Unnamed: 0,Id,reviewerID,asin,reviewText,overall
0,0,82947,37386,I REALLY wanted this series but I am in SHOCK ...,1.0
1,1,10154,23543,I have to say that this is a work of art for m...,4.0
2,2,5789,5724,Alien 3 is certainly the most controversal fil...,3.0
3,3,9198,5909,"I love this film...preachy? Well, of course i...",5.0
4,4,33252,21214,Even though I previously bought the Gamera Dou...,5.0


## 2. 特征提取

In [5]:
import scipy
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF text features: the vectorizer is fitted on the training reviews only
# and then applied unchanged to the test reviews.
word_model = TfidfVectorizer(stop_words='english')
train_tfidf = word_model.fit_transform(train_df['reviewText'])
test_tfidf = word_model.transform(test_df['reviewText'])

# The 'overall' rating, divided by 5, becomes one extra column on the right.
train_rating = train_df['overall'].values.reshape((-1, 1)) / 5
test_rating = test_df['overall'].values.reshape((-1, 1)) / 5

train_X = scipy.sparse.hstack([train_tfidf, train_rating])
test_X = scipy.sparse.hstack([test_tfidf, test_rating])

## 3. Ensemble 算法实现

In [6]:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV

def construct_clf(clf_name):
    """Build a probability-calibrated base classifier selected by name.

    Args:
        clf_name: 'SVM' for svm.LinearSVC, 'DTree' for a depth-10,
            class-balanced DecisionTreeClassifier, or 'NB' for BernoulliNB.

    Returns:
        A CalibratedClassifierCV (cv=2, sigmoid method) wrapping the chosen
        estimator; the ensemble classes in this notebook call predict_proba
        on the returned object.

    Raises:
        ValueError: If clf_name is not one of the supported names.
    """
    supported = ('SVM', 'DTree', 'NB')
    if clf_name not in supported:
        # An unknown name used to leave the estimator as None, which was then
        # handed to CalibratedClassifierCV instead of the model the caller
        # asked for. Fail loudly before constructing anything.
        raise ValueError(
            'Unknown clf_name {!r}; expected one of {}'.format(clf_name, supported)
        )

    if clf_name == 'SVM':
        clf = svm.LinearSVC()
    elif clf_name == 'DTree':
        clf = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
    else:  # 'NB'
        clf = BernoulliNB()
    # Probability calibration on top of the raw estimator.
    return CalibratedClassifierCV(clf, cv=2, method='sigmoid')

In [7]:
class Bagging(object):
    """Bagging: average the predictions of one classifier refit on bootstrap samples.

    The same classifier object is refit in every round, so after fit_predict
    returns it holds only the model from the last round.
    """

    def __init__(self, clf, num_iter):
        self.clf = clf  # classifier object exposing fit(X, Y) and predict_proba(X)
        self.num_iter = num_iter  # number of bootstrap rounds / bagged models

    def fit_predict(self, X, Y, test_X):
        """Train num_iter bootstrap models and return their averaged probability.

        Args:
            X: Training features; must support row selection with an integer
                array (X[sample_idx]), e.g. an ndarray or CSR matrix.
            Y: Training labels, one per row of X (array-like or pandas Series).
            test_X: Features to score; must expose .shape[0].

        Returns:
            1-D ndarray of length test_X.shape[0]: the mean over all rounds of
            column 1 of predict_proba(test_X).

        Raises:
            ValueError: If num_iter is smaller than 1.
        """
        if self.num_iter < 1:
            # Averaging over zero models would be 0/0 and yield NaN for every row.
            raise ValueError('num_iter must be at least 1, got {}'.format(self.num_iter))

        # Use a plain array so Y[sample_idx] is positional. On a pandas Series it
        # is a label lookup, which fails when the index is not 0..n-1.
        Y = np.asarray(Y)

        result = np.zeros(test_X.shape[0])  # running sum of test-set probabilities
        train_idx = np.arange(len(Y))
        for i in range(self.num_iter):
            # Bootstrap: draw len(Y) row positions with replacement (global NumPy RNG).
            sample_idx = np.random.choice(train_idx, size=len(Y), replace=True)
            self.clf.fit(X[sample_idx], Y[sample_idx])
            print('Model {:>2d} finish!'.format(i))
            result += self.clf.predict_proba(test_X)[:, 1]  # accumulate this round's probabilities
        result /= self.num_iter  # average (soft vote)
        return result

In [8]:
class AdaBoostM1(object):
    """AdaBoost.M1: reweight training samples each round and combine the rounds
    with log(1/beta) vote weights.

    The same classifier object is refit in every round, so each round's test
    prediction is captured immediately after fitting.
    """

    def __init__(self, clf, num_iter):
        self.clf = clf  # classifier exposing fit(X, Y, sample_weight=...), predict, predict_proba
        self.num_iter = num_iter  # maximum number of boosting rounds

    def fit_predict(self, X, Y, test_X):
        """Run up to num_iter boosting rounds and return the weighted test probability.

        Args:
            X: Training features accepted by clf.fit / clf.predict.
            Y: Training labels, one per row of X (array-like or pandas Series).
            test_X: Features to score with clf.predict_proba.

        Returns:
            1-D ndarray with one value per test row: the vote-weighted sum of
            column 1 of predict_proba over the accepted rounds. If the very
            first round already has weighted error above 0.5, that single
            model's probabilities are returned unweighted.

        Raises:
            ValueError: If num_iter is smaller than 1.
        """
        if self.num_iter < 1:
            raise ValueError('num_iter must be at least 1, got {}'.format(self.num_iter))

        # Plain array: keeps the error mask and weight update in NumPy and
        # independent of any pandas index on Y.
        Y = np.asarray(Y)
        num_samples = len(Y)
        weight = np.ones(num_samples)  # sample weights; kept summing to num_samples
        result_lst, beta_lst = list(), list()  # per-round test predictions and betas
        tiny = np.finfo(float).eps  # lower bound for beta so log(1/beta) stays finite

        for i in range(self.num_iter):
            self.clf.fit(X, Y, sample_weight=weight)  # weighted fit
            print('Model {:<2d} finish!'.format(i))
            error_flag = np.asarray(self.clf.predict(X)) != Y  # misclassified training rows
            error = weight[error_flag].sum() / num_samples  # weighted training error
            if error > 0.5:
                if not result_lst:
                    # No round has been accepted yet. Combining zero rounds gave
                    # an empty array before; fall back to this single model.
                    return self.clf.predict_proba(test_X)[:, 1]
                break

            result_lst.append(self.clf.predict_proba(test_X)[:, 1])
            # error == 0 would give beta == 0: every weight becomes 0, the
            # normalisation is 0/0 and the vote weight is log(1/0). Clamp it.
            beta = max(error / (1 - error), tiny)
            beta_lst.append(beta)
            if not error_flag.any():
                # Nothing was misclassified, so reweighting would scale every
                # sample equally and leave the weights unchanged.
                break
            weight *= np.where(error_flag, 1.0, beta)  # shrink correct rows by beta, keep wrong rows
            weight /= weight.sum() / num_samples  # renormalise so weights sum to num_samples

        vote = np.log(1 / np.array(beta_lst))
        total = vote.sum()
        if total > 0:
            vote /= total  # normalise vote weights to sum to 1
        else:
            # Every accepted round had error exactly 0.5 (beta == 1, log == 0);
            # use equal votes instead of dividing by zero.
            vote = np.full(len(vote), 1.0 / len(vote))
        print('\nVote Weight:\n', vote)
        result = (np.array(result_lst) * vote[:, None]).sum(0)  # weighted sum over rounds
        return result

## 4. 测试并生成结果

In [9]:
# Seed NumPy's global RNG before building or running anything; Bagging draws its
# bootstrap samples from it via np.random.choice.
np.random.seed(0)
clf = construct_clf('SVM')  # supported names: DTree, SVM, NB
# Alternative ensemble; swap with the AdaBoostM1 line below to run Bagging instead.
# runner = Bagging(clf, 10)
runner = AdaBoostM1(clf, 10)
# The stacked feature matrices are converted to CSR before being passed in;
# y_predict holds one score per test row.
y_predict = runner.fit_predict(train_X.tocsr(), train_df['label'], test_X.tocsr())

Model 0  finish!
Model 1  finish!
Model 2  finish!
Model 3  finish!

Vote Weight:
 [0.47013022 0.35834806 0.17152172]


In [10]:
# Write the submission file: one row per test Id with its predicted score.
result_df = pd.DataFrame({
    'Id': test_df['Id'].values,
    'Predicted': y_predict,
})
result_df.to_csv('./result.csv', index=False)

|   Method   |   Base   |   +Bagging   |   +AdaBoost.M1   |
| ---- | ---- | ---- | ---- |
|   DTree   |   0.74   |   0.77   |   0.76   |
|   SVM   |   0.78   |   0.81   |   0.81   |

## 5. 讨论

* 随机数种子
* 自行构建验证集进行线下测试
* 训练集上 fit，和测试集一起 transform
* 文本建模方法 or 文本特征选择
* 引入文本长度、用户商品等特征
* 基分类器的强弱