In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import math
from math import radians, cos, sin, asin, sqrt
import sys
import matplotlib.pyplot as plt
import time, datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score 
from functools import reduce
from sklearn import cross_validation,metrics
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor



In [4]:
# 预处理
raw_data = pd.read_csv('../raw_data/trade_new.csv', usecols=['sldatime', 'vipno', 'pluno', 'bndno', 'dptno', 'qty', 'amt'])
raw_data['Month'] = [int(x[5] + x[6]) for x in raw_data['sldatime']]
raw_data['Day'] = [int(x[8] + x[9]) for x in raw_data['sldatime']]
raw_data.rename(columns={'vipno':'User', 'pluno': 'Item', 'bndno':'Brand', 'dptno':'Cat'}, inplace = True)
where_are_nan = np.isnan(raw_data['Brand'])
raw_data['Brand'][where_are_nan] = -1
# 对于<1的次数按照1来算
raw_data['qty'] = [math.ceil(x) for x in raw_data['qty']]

# 所有ID以及组合对
User_total = list(set(raw_data['User']))
Item_total = list(set(raw_data['Item']))
Brand_total = list(set(raw_data['Brand']))
Cat_total = list(set(raw_data['Cat']))
UI_total = [(m, n) for m in User_total for n in Item_total]
UB_total = [(m, n) for m in User_total for n in Brand_total]
UC_total = [(m, n) for m in User_total for n in Cat_total]
BC_total = [(m, n) for m in Brand_total for n in Cat_total]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [5]:
# 返回一个月或whole period的特征数据
def get_feature_per_month(month, is_whole):
    
    global raw_data, User_total, Item_total, Brand_total, Cat_total, UI_total, UB_total, UC_total, BC_total
    
    # 特征 dict 初始化
    def init(keys):
        dic = {}
        for key in keys:
            dic[key] = []
        return dic
    
    # 主体为U的特征
    U_feature = init(User_total)
    # 主体为I的特征
    I_feature = init(Item_total)
    # 主体为B的特征
    B_feature = init(Item_total)
    # 主体为C的特征
    C_feature = init(Cat_total)
    # 主体为UI的特征
    UI_feature = init(UI_total)
    # 主体为UB的特征
    UB_feature = init(UB_total)
    # 主体为UC的特征
    UC_feature = init(UC_total)
    # 主体为BC的特征
    BC_feature = init(BC_total)
    
    t1 = {
        'Item': I_feature,
        'Brand': B_feature,
        'Cat': C_feature
    }
    
    t2 = {
        'Item': UI_feature,
        'Brand': UB_feature,
        'Cat': UC_feature
    }
    
    # 某个月的数据
    # 若is_whole为True，则表示whole period
    data = pd.DataFrame()
    if(is_whole == False):
        data = raw_data[raw_data['Month'] == month]
    else:
        data = raw_data[(raw_data['Month'] < month) & (raw_data['Month'] >= month - 3)]
        
    # TYPE1
    # 1.1 次数
    # {U} 0
    res = data['qty'].groupby(data['User']).sum()
    # 将求和结果用 dict 存储
    temp = {}
    for i in range(len(res)):
        temp[res.index[i]] = res.values[i]
    # 放入特征集中
    for key2 in U_feature:
        if (key2 in temp):
            U_feature[key2].append(temp[key2])
        else:
            U_feature[key2].append(0)
    
    # {I, B, C} 0, 0, 0
    for key in t1:
        res = data['qty'].groupby(data[key]).sum()
        # 将求和结果用 dict 存储
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i] 
        # 放入特征集中
        for key2 in t1[key]:
            if (key2 in temp):
                t1[key][key2].append(temp[key2])
            else:
                t1[key][key2].append(0)
    
    # {UI, UB, UC} 0
    for key in t2:
        res = data['qty'].groupby([data['User'], data[key]]).sum()
        # 将求和结果用 dict 存储
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i] 
        # 放入特征集中
        for key2 in t2[key]:
            if (key2 in temp):
                t2[key][key2].append(temp[key2])
            else:
                t2[key][key2].append(0)
    
        
    # 1.2 金额
    # {U} 1
    res = data['amt'].groupby(data['User']).sum()
    # 将求和结果用 dict 存储
    temp = {}
    for i in range(len(res)):
        temp[res.index[i]] = res.values[i]
    # 放入特征集中
    for key2 in U_feature:
        if (key2 in temp):
            U_feature[key2].append(temp[key2])
        else:
            U_feature[key2].append(0)
    
    # {I, B, C} 1, 1, 1
    for key in t1:
        res = data['amt'].groupby(data[key]).sum()
        # 将求和结果用 dict 存储
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i] 
        # 放入特征集中
        for key2 in t1[key]:
            if (key2 in temp):
                t1[key][key2].append(temp[key2])
            else:
                t1[key][key2].append(0)
    
    # {UI, UB, UC} 1, 1, 1
    for key in t2:
        res = data['amt'].groupby([data['User'], data[key]]).sum()
        # 将求和结果用 dict 存储
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i]
        # 放入特征集中
        for key2 in t2[key]:
            if (key2 in temp):
                t2[key][key2].append(temp[key2])
            else:
                t2[key][key2].append(0)
                
                
    # 1.3.1 购买发生天数 
    # {U} 2
    temp_data = data[['User', 'Month', 'Day']].drop_duplicates()
    res = temp_data.groupby(temp_data['User']).count()
    temp = {}
    for i in range(len(res)):
        temp[res.index[i]] = res['Day'].values[i]
    # 放入特征集中
    for key in U_feature:
        if (key in temp):
            U_feature[key].append(temp[key])
        else:
            U_feature[key].append(0)
            
    # 1.3.2 被购买发生天数
    # {I, B, C} 2, 2, 2
    for key in t1:
        temp_data = data[[key, 'Month', 'Day']].drop_duplicates()
        res = temp_data.groupby(temp_data[key]).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res['Day'].values[i]
        # 放入特征集中
        for key2 in t1[key]:
            if (key2 in temp):
                t1[key][key2].append(temp[key2])
            else:
                t1[key][key2].append(0)
                
    # 1.3.3
    # {UI, UB, UC} 2, 2, 2
    for key in t2:
        temp_data = data[['User', key, 'Month', 'Day']].drop_duplicates()
        res = data['amt'].groupby([data['User'], data[key]]).count()
        # 将求和结果用 dict 存储
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i]
        # 放入特征集中
        for key2 in t2[key]:
            if (key2 in temp):
                t2[key][key2].append(temp[key2])
            else:
                t2[key][key2].append(0)
    

    # 1.4 购买的unique的{I, B, C}的数量
    # {U} 5
    for key in t2:
        temp_data = data[['User', key]].drop_duplicates()
        res = temp_data.groupby(temp_data['User']).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i][0] 
        # 放入特征集中
        for key2 in U_feature:
            if (key2 in temp):
                U_feature[key2].append(temp[key2])
            else:
                U_feature[key2].append(0)

    # 1.5 包含的unique的I的数量
    # {B} 3
    temp_data = data[['Brand', 'Item']].drop_duplicates()
    res = temp_data.groupby(temp_data['Brand']).count()
    temp = {}
    for i in range(len(res)):
        temp[res.index[i]] = res.values[i][0] 
    # 放入特征集中
    for key2 in B_feature:
        if (key2 in temp):
            B_feature[key2].append(temp[key2])
        else:
            B_feature[key2].append(0)
    
    # {C} 3
    temp_data = data[['Cat', 'Item']].drop_duplicates()
    res = temp_data.groupby(temp_data['Cat']).count()
    temp = {}
    for i in range(len(res)):
        temp[res.index[i]] = res.values[i][0] 
    # 放入特征集中
    for key2 in C_feature:
        if (key2 in temp):
            C_feature[key2].append(temp[key2])
        else:
            C_feature[key2].append(0)
     
    # 1.6 买过该品牌或品类或商品的不同的买家的人数
    # {B, C, I} 4, 4, 3
    for key in t1:
        temp_data = data[[key, 'User']].drop_duplicates()
        res = temp_data.groupby(temp_data[key]).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = res.values[i][0]
        # 放入特征集中
        for key2 in t1[key]:
            if (key2 in temp):
                t1[key][key2].append(temp[key2])
            else:
                t1[key][key2].append(0)
    
           
    return {
        'U_feature': U_feature,
        'I_feature': I_feature,
        'B_feature': B_feature,
        'C_feature': C_feature,
        'UI_feature': UI_feature,
        'UB_feature': UB_feature,
        'UC_feature': UC_feature,
        'BC_feature': BC_feature
    }

In [6]:
def get_feature(pred_month):
    
    global raw_data, User_total, Item_total, Brand_total, Cat_total, UI_total, UB_total, UC_total, BC_total
    
    # 特征 dict 初始化
    def init(keys):
        dic = {}
        for key in keys:
            dic[key] = []
        return dic
    
    # 主体为U的特征
    U_feature = init(User_total)
    # 主体为I的特征
    I_feature = init(Item_total)
    # 主体为B的特征
    B_feature = init(Item_total)
    # 主体为C的特征
    C_feature = init(Cat_total)
    # 主体为UI的特征
    UI_feature = init(UI_total)
    # 主体为UB的特征
    UB_feature = init(UB_total)
    # 主体为UC的特征
    UC_feature = init(UC_total)
    # 主体为BC的特征
    BC_feature = init(BC_total)
    
    features = {
        'U_feature': U_feature,
        'I_feature': I_feature,
        'B_feature': B_feature,
        'C_feature': C_feature,
        'UI_feature': UI_feature,
        'UB_feature': UB_feature,
        'UC_feature': UC_feature,
        'BC_feature': BC_feature
    }
    
    # 前 3、2、1 个月
    feature_per_month_1 = get_feature_per_month(pred_month - 3, False)
    feature_per_month_2 = get_feature_per_month(pred_month - 2, False)
    feature_per_month_3 = get_feature_per_month(pred_month - 1, False)
    # whole_period
    feature_whole = get_feature_per_month(pred_month, True)    
    
    # 2.1 所有的 monthly 特征，都可以进行此 aggregation agg 操作包含mean、std、max、median
    for key in features:
        for ikey in features[key]:
            features[key][ikey] = feature_per_month_1[key][ikey] + feature_per_month_2[key][ikey] + feature_per_month_3[key][ikey] + feature_whole[key][ikey]
            temp = list(zip(feature_per_month_1[key][ikey], feature_per_month_2[key][ikey], feature_per_month_3[key][ikey]))
            temp = [np.array(x) for x in temp]
            # mean
            features[key][ikey] += [x.mean() for x in temp]
            # std
            features[key][ikey] += [np.std(x) for x in temp]
            # max
            features[key][ikey] += [np.max(x) for x in temp]
            # median
            features[key][ikey] += [np.median(x) for x in temp]
            
    return features

In [7]:
# 随机选取80%的数据记录作为训练集，余下20%作为测试集合
def data_train_test_split(X, y, random_seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
    return X_train, X_test, y_train, y_test

In [8]:
def a():
    
    features = get_feature(5)
    
    return features

In [9]:
features = a()

In [12]:
features

{'BC_feature': {(30721.0, 30723): [],
  (30721.0, 14340): [],
  (30721.0, 14342): [],
  (30721.0, 30727): [],
  (30721.0, 30728): [],
  (30721.0, 34831): [],
  (30721.0, 14351): [],
  (30721.0, 14352): [],
  (30721.0, 14362): [],
  (30721.0, 14366): [],
  (30721.0, 32821): [],
  (30721.0, 32823): [],
  (30721.0, 10300): [],
  (30721.0, 14400): [],
  (30721.0, 14401): [],
  (30721.0, 14402): [],
  (30721.0, 14403): [],
  (30721.0, 11004): [],
  (30721.0, 14405): [],
  (30721.0, 14406): [],
  (30721.0, 14407): [],
  (30721.0, 22600): [],
  (30721.0, 14409): [],
  (30721.0, 10310): [],
  (30721.0, 10309): [],
  (30721.0, 22601): [],
  (30721.0, 22604): [],
  (30721.0, 22603): [],
  (30721.0, 10319): [],
  (30721.0, 30800): [],
  (30721.0, 22602): [],
  (30721.0, 22610): [],
  (30721.0, 30803): [],
  (30721.0, 30804): [],
  (30721.0, 34900): [],
  (30721.0, 30810): [],
  (30721.0, 30812): [],
  (30721.0, 22621): [],
  (30721.0, 22620): [],
  (30721.0, 22622): [],
  (30721.0, 30816): [],
  

In [13]:
new_features = get_feature(8)

In [17]:
new_features

{'BC_feature': {(30721.0, 30723): [],
  (30721.0, 14340): [],
  (30721.0, 14342): [],
  (30721.0, 30727): [],
  (30721.0, 30728): [],
  (30721.0, 34831): [],
  (30721.0, 14351): [],
  (30721.0, 14352): [],
  (30721.0, 14362): [],
  (30721.0, 14366): [],
  (30721.0, 32821): [],
  (30721.0, 32823): [],
  (30721.0, 10300): [],
  (30721.0, 14400): [],
  (30721.0, 14401): [],
  (30721.0, 14402): [],
  (30721.0, 14403): [],
  (30721.0, 11004): [],
  (30721.0, 14405): [],
  (30721.0, 14406): [],
  (30721.0, 14407): [],
  (30721.0, 22600): [],
  (30721.0, 14409): [],
  (30721.0, 10310): [],
  (30721.0, 10309): [],
  (30721.0, 22601): [],
  (30721.0, 22604): [],
  (30721.0, 22603): [],
  (30721.0, 10319): [],
  (30721.0, 30800): [],
  (30721.0, 22602): [],
  (30721.0, 22610): [],
  (30721.0, 30803): [],
  (30721.0, 30804): [],
  (30721.0, 34900): [],
  (30721.0, 30810): [],
  (30721.0, 30812): [],
  (30721.0, 22621): [],
  (30721.0, 22620): [],
  (30721.0, 22622): [],
  (30721.0, 30816): [],
  

In [14]:
def get_test_label(month, tradeType):
    
    global raw_data, User_total, Item_total, Brand_total, Cat_total, UI_total, UB_total, UC_total, BC_total
    
    # dict 初始化
    def init(keys):
        dic = {}
        for key in keys:
            dic[key] = []
        return dic
    
    # tradeType: 1为Item，2为Brand，3为Cat, 0为是否购买   
    data = raw_data[raw_data['Month'] == month]
    
    if(tradeType == 0):
        U_label = init(User_total)
        res =  data.groupby(data['User']).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = True
        # 放入 label dict 中
        for key in U_label:
            if (key in temp):
                U_label[key] = True
            else:
                U_label[key] = False
                
        return U_label
    
    if(tradeType == 1):
        UI_label = init(UI_total)
        res =  data.groupby([data['User'], data['Item']]).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = True
        # 放入 label dict 中
        for key in UI_label:
            if (key in temp):
                UI_label[key] = True
            else:
                UI_label[key] = False
                
        return UI_label
    
    elif(tradeType == 2):
        UB_label = init(UB_total)
        res =  data.groupby([data['User'], data['Brand']]).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = True
        # 放入 label dict 中
        for key in UB_label:
            if (key in temp):
                UB_label[key] = True
            else:
                UB_label[key] = False
                
        return UB_label
                
    if(tradeType == 3):
        UC_label = init(UC_total)
        res =  data.groupby([data['User'], data['Cat']]).count()
        temp = {}
        for i in range(len(res)):
            temp[res.index[i]] = True
        # 放入 label dict 中
        for key in UC_label:
            if (key in temp):
                UC_label[key] = True
            else:
                UC_label[key] = False
                
        return UC_label

In [18]:
# 计算 precision，recall 和 f-measurement，对模型进行评价
def evaluate(y_pred, y_test, y_scores):
    
    evaluations = []
    
    # precision overall
    precision_overall = precision_score(y_test, y_pred, average='macro')
    evaluations.append(precision_overall)
    
    # recall overall
    recall_overall = recall_score(y_test, y_pred, average='macro')
    evaluations.append(recall_overall)

    # f-measurement overall
    f_measurement_overall = f1_score(y_test, y_pred, average='macro')
    evaluations.append(f_measurement_overall)
    
    # AUC
    auc = metrics.roc_auc_score(y_test, y_scores[:,1])
    evaluations.append(auc)
    
    return evaluations

In [19]:
# 传入不同的分类器，训练并且作出预测
def train_classifier_and_predict(classifier_model, X_train, X_test, y_train):
    clf = classifier_model
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    y_scores =  clf.predict_proba(X_test)
    return y_pred, y_scores

In [20]:
def get_result_10(classifier_model, X, y):
    
    # 用以存储10次预测结果
    y_pred_list = []
    # 用以存储10次训练时间
    time_list = []
    # 用以存储10次 precision，recall，f-measurement 和 AUC
    evaluation_list = []
    
    for i in range(0, 10):
        # 随机选取80%的数据记录作为训练集，余下20%作为测试集合，传入i作为random_seed
        X_train, X_test, y_train, y_test = data_train_test_split(X, y, i)
        # 训练并且作出预测
        start = time.clock()
        y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
        end  = time.clock()
        delta_time = end - start
        # 存储本次时间
        time_list.append(delta_time)
        # 存储本次预测结果
        y_pred_list.append(list(y_pred))
        # 存储本次 precision，recall 和 f-measurement
        evaluation_list.append(evaluate(y_pred, y_test, y_scores))

    
    # 计算平均 precision，recall 和 f-measurement
    evaluation_list = [np.array(x) for x in evaluation_list]
    def add(x, y):
        return x + y
    sum_evaluations = reduce(add, evaluation_list)
    avg_evaluations = sum_evaluations / 10
    
    # 计算平均时间
    avg_time = np.array(time_list).mean()
    
    return avg_evaluations, avg_time

In [21]:
def get_result(classifier_model, X, y):
    
    # 随机选取80%的数据记录作为训练集，余下20%作为测试集合，传入i作为random_seed
    X_train, X_test, y_train, y_test = data_train_test_split(X, y, 1)
    # 训练并且作出预测
    start = time.clock()
    y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
    end  = time.clock()
    delta_time = end - start
    # 存储本次时间
    time = delta_time
    # 存储本次 precision，recall 和 f-measurement
    evaluation = evaluate(y_pred, y_test, y_scores)
    
    return evaluation, time

In [22]:
# 绘制 precision，recall 和 f-measurement 图
def evaluations_plot(classifier_models, model_evaluations):
    plt.style.use('ggplot')
    xticks = np.arange(len(classifier_models))
    plt.xticks(xticks, classifier_models.keys(), rotation=20)
    markers = ['v', '+', 'o', '^']

    evaluations = []
    precision = []
    recall = []
    f_measurement = []
    auc = []
    
    for key in model_evaluations:
            precision.append(model_evaluations[key][0])
            recall.append(model_evaluations[key][1])
            f_measurement.append(model_evaluations[key][2])
            auc.append(model_evaluations[key][3])
            
    evaluations = [precision, recall, f_measurement, auc]

    for evaluation, marker in zip(evaluations, markers):
        plt.plot(xticks, evaluation,
                 linestyle='--', marker=marker, alpha=0.8)

    plt.title('Evaluation of Models')
    plt.xlabel('Model Names')
    plt.ylabel('Value')
    plt.legend(['precision overall', 'recall overall', 'f-measurement overall', 'auc'])
    plt.savefig('Evaluation_of_Models.png')
    plt.show()

In [23]:
# 绘制时间图
def time_plot(classifier_models, model_times):
    plt.style.use('ggplot')
    xticks = np.arange(len(classifier_models))
    plt.xticks(xticks, classifier_models.keys(), rotation=20)
    plt.plot(xticks, list(model_times.values()), marker='o', linestyle='--', alpha=0.8)

    plt.title('Time of Models')
    plt.xlabel('Model Names')
    plt.ylabel('Time')
    plt.savefig('Time_of_Models.png')
    plt.show()

In [33]:
def b():
    
    global features
    
    # 相关feature
    U_feature = features['U_feature']
    I_feature = features['I_feature']
    UI_feature = features['UI_feature']
    feature = UI_feature
    print(1)
    
    
    # 获得feature
    # for key in UI_feature:
        # feature[key] += U_feature[key[0]]
        # feature[key] += I_feature[key[1]]
    # print(2)
        
    # 获得label
    label = get_test_label(5, 1)
    print(3)
    
    X = []
    y = []
    
    for key in feature:
        item = list(key) + feature[key]
        X.append(item)
        y.append(label[key])
    print(4)
    
    # 7个分类器
    classifier_models = {
        # 高斯朴素贝叶斯分类器 Gaussian Naive Bayes (GaussianNB)
        'GaussianNB': GaussianNB(),
        # K近邻分类器 KNeighborsClassifier
        'KNeighbors': KNeighborsClassifier(n_neighbors=2),
        # 决策树分类器 DecisionTreeClassifier
        'DecisionTree': tree.DecisionTreeClassifier(criterion='entropy', max_depth=50),
        # 随机森林 RandomForestClassifier
        'RandomForest': RandomForestClassifier(max_depth=3, n_estimators=45),
        # AdaBoost AdaBoostClassifier
        'AdaBoost': AdaBoostClassifier(n_estimators=150, learning_rate = 0.1),
        # Bagging meta-estimator（Bagging 元估计器）BaggingClassifier
        'Bagging': BaggingClassifier(n_estimators=40),
        # GBDT（梯度树提升）GradientBoostingClassifier
        'GradientBoosting': GradientBoostingClassifier(n_estimators=40, max_depth=10)
        
    }
    
    # 存储模型的评价
    model_evaluations = {}
    # 存储模型训练时间
    model_times = {}
    for key in classifier_models:
        avg_evaluations, avg_time = get_result(classifier_models[key], X, y)
        model_evaluations[key] = avg_evaluations
        model_times[key] = avg_time
        
        print(key)
    # 绘制 precision，recall 和 f-measurement 图
    evaluations_plot(classifier_models, model_evaluations)
    # 绘制时间图
    time_plot(classifier_models, model_times)
    
    return model_evaluations, model_times

In [None]:
model_evaluations, model_times = b()

1
2
3
4


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GaussianNB
KNeighbors
DecisionTree


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RandomForest


In [None]:
def ci():
    global features
    
    old_feature = features['U_feature']
    old_label = get_test_label(5, 1)
    new_feature = new_features['U_feature']

    # 获得label
    label = get_test_label(5, 1)
    
     # 7个分类器
    classifier_models = {
        # 高斯朴素贝叶斯分类器 Gaussian Naive Bayes (GaussianNB)
        'GaussianNB': GaussianNB(),
        # K近邻分类器 KNeighborsClassifier
        'KNeighbors': KNeighborsClassifier(n_neighbors=2),
        # 决策树分类器 DecisionTreeClassifier
        'DecisionTree': tree.DecisionTreeClassifier(criterion='entropy', max_depth=50),
        # 随机森林 RandomForestClassifier
        'RandomForest': RandomForestClassifier(max_depth=3, n_estimators=45),
        # AdaBoost AdaBoostClassifier
        'AdaBoost': AdaBoostClassifier(n_estimators=150, learning_rate = 0.1),
        # Bagging meta-estimator（Bagging 元估计器）BaggingClassifier
        'Bagging': BaggingClassifier(n_estimators=40),
        # GBDT（梯度树提升）GradientBoostingClassifier
        'GradientBoosting': GradientBoostingClassifier(n_estimators=40, max_depth=10)
        
    }
    
    X_train = []
    y_train = []
    X_pred = []
    
    for key in old_feature:
        item = list(key) + old_feature[key]
        X_train.append(item)
        y_train.append(label[key])
    
    for key in new_feature:
        item = list(key) + new_feature[key]
        X_pred.append(item)
    
    for key in classifier_models:
        # 训练并且作出预测
        start = time.clock()
        clf = classifier_models[key].fit(X_train, y_train)
        y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
        end  = time.clock()
        delta_time = end - start
        # 存储本次时间
        time = delta_time

In [None]:
def cii():
    global features
    
    old_feature = features['UB_feature']
    old_label = get_test_label(5, 1)
    new_feature = new_features['UB_feature']
    
    # 相关feature
    U_feature = features['U_feature']
    I_feature = features['I_feature']
    UI_feature = features['UB_feature']
    old_feature = UI_feature
    print(1)
    
    # 获得feature
    for key in UI_feature:
        old_feature[key] += U_feature[key[0]]
        old_feature[key] += I_feature[key[1]]
    print(2)
    
    # 获得label
    label = get_test_label(5, 1)

     # 7个分类器
    classifier_models = {
        # 高斯朴素贝叶斯分类器 Gaussian Naive Bayes (GaussianNB)
        'GaussianNB': GaussianNB(),
        # K近邻分类器 KNeighborsClassifier
        'KNeighbors': KNeighborsClassifier(n_neighbors=2),
        # 决策树分类器 DecisionTreeClassifier
        'DecisionTree': tree.DecisionTreeClassifier(criterion='entropy', max_depth=50),
        # 随机森林 RandomForestClassifier
        'RandomForest': RandomForestClassifier(max_depth=3, n_estimators=45),
        # AdaBoost AdaBoostClassifier
        'AdaBoost': AdaBoostClassifier(n_estimators=150, learning_rate = 0.1),
        # Bagging meta-estimator（Bagging 元估计器）BaggingClassifier
        'Bagging': BaggingClassifier(n_estimators=40),
        # GBDT（梯度树提升）GradientBoostingClassifier
        'GradientBoosting': GradientBoostingClassifier(n_estimators=40, max_depth=10)
        
    }
    
    X_train = []
    y_train = []
    X_pred = []
    
    for key in old_feature:
        item = list(key) + old_feature[key]
        X_train.append(item)
        y_train.append(label[key])
    
    for key in new_feature:
        item = list(key) + new_feature[key]
        X_pred.append(item)
    
    for key in classifier_models:
        # 训练并且作出预测
        start = time.clock()
        clf = classifier_models[key].fit(X_train, y_train)
        y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
        end  = time.clock()
        delta_time = end - start
        # 存储本次时间
        time = delta_time

In [None]:
def ciii():
    
     global features
    
     # 相关feature
    U_feature = features['U_feature']
    I_feature = features['I_feature']
    UI_feature = features['UB_feature']
    old_feature = UI_feature
    print(1)
    
    # 获得feature
    for key in UI_feature:
        old_feature[key] += U_feature[key[0]]
        old_feature[key] += I_feature[key[1]]
    print(2)
    
    # 获得label
    label = get_test_label(5, 1)

     # 7个分类器
    classifier_models = {
        # 高斯朴素贝叶斯分类器 Gaussian Naive Bayes (GaussianNB)
        'GaussianNB': GaussianNB(),
        # K近邻分类器 KNeighborsClassifier
        'KNeighbors': KNeighborsClassifier(n_neighbors=2),
        # 决策树分类器 DecisionTreeClassifier
        'DecisionTree': tree.DecisionTreeClassifier(criterion='entropy', max_depth=50),
        # 随机森林 RandomForestClassifier
        'RandomForest': RandomForestClassifier(max_depth=3, n_estimators=45),
        # AdaBoost AdaBoostClassifier
        'AdaBoost': AdaBoostClassifier(n_estimators=150, learning_rate = 0.1),
        # Bagging meta-estimator（Bagging 元估计器）BaggingClassifier
        'Bagging': BaggingClassifier(n_estimators=40),
        # GBDT（梯度树提升）GradientBoostingClassifier
        'GradientBoosting': GradientBoostingClassifier(n_estimators=40, max_depth=10)
        
    }
    
    X_train = []
    y_train = []
    X_pred = []
    
    for key in old_feature:
        item = list(key) + old_feature[key]
        X_train.append(item)
        y_train.append(label[key])
    
    for key in new_feature:
        item = list(key) + new_feature[key]
        X_pred.append(item)
    
    for key in classifier_models:
        # 训练并且作出预测
        start = time.clock()
        clf = classifier_models[key].fit(X_train, y_train)
        y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
        end  = time.clock()
        delta_time = end - start
        # 存储本次时间
        time = delta_time

In [1]:
def civ():
     global features
    
    old_feature = features['U_feature']
    old_label = get_test_label(5, 1)
    new_feature = new_features['U_feature']

     # 7个回归
    models = {
    'GaussianNB': BayesianRidge(),
    'KNeighborsRegressor': KNeighborsRegressor(n_neighbors=5),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=0),
    'RandomForestRegressor': RandomForestRegressor(max_features=3, n_estimators=45),
    'BaggingRegressor': BaggingRegressor(base_estimator=DecisionTreeRegressor(random_state=0), random_state=0),
    'AdaBoostRegressor': AdaBoostRegressor(n_estimators=150, learning_rate=0.1),
    'GradientBoostingRegressor': GradientBoostingRegressor(max_depth=10, n_estimators=40),
}
    X_train = []
    y_train = []
    X_pred = []
    
    for key in old_feature:
        item = list(key) + old_feature[key]
        X_train.append(item)
        y_train.append(label[key])
    
    for key in new_feature:
        item = list(key) + new_feature[key]
        X_pred.append(item)
    
    for key in classifier_models:
        # 训练并且作出预测
        start = time.clock()
        clf = models[key].fit(X_train, y_train)
        y_pred, y_scores = train_classifier_and_predict(classifier_model, X_train, X_test, y_train)
        end  = time.clock()
        delta_time = end - start
        # 存储本次时间
        time = delta_time

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)