In [2]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import codecs


# 1. 读取数据
def robust_readcsv(path, sep=','):
    try:
        lines = codecs.open(path).readlines()
    except:
        lines = codecs.open(path, encoding='latin-1').readlines()
    header = lines[0].strip().split(sep)
    content = []
    for line in lines[1:]:
        line = line.strip()
        try:
            index = [i for i, x in enumerate(line) if x == ',']
            if len(index) == len(header) - 1:
                content.append(line.split(sep))
            else:
                line_content = []
                index = [0] + index
                for idx in range(len(header) - 1):
                    line_content.append(line[index[idx]:index[idx + 1]].strip(sep))
                line_content.append(line[index[len(header) - 1]:].strip(sep))
                content.append(line_content)
        except:
            pass

    return pd.DataFrame(content, columns=header)


def loadTrainData():
    # coding=utf-8

    import os

    #os.chdir(r'C:\Projects\智能家居场景识别挑战赛')
    train_cus = pd.read_csv('./dataset/train/cus.csv', sep=',')
    test_cus = pd.read_csv('./dataset/test/cus.csv', sep=',')

    train_devupdate = robust_readcsv('./dataset/train/devUpdata.csv', sep=',')
    test_devupdate = robust_readcsv('./dataset/test/devUpdata.csv', sep=',')

    train_control = robust_readcsv('./dataset/train/control.csv', sep=',')
    test_control = robust_readcsv('./dataset/test/control.csv', sep=',')

    train_devlist = robust_readcsv('./dataset/train/devList.csv', sep=',')
    test_devlist = robust_readcsv('./dataset/test/devList.csv', sep=',')

    # 数据分析

    train_devupdate_feat = train_devupdate.groupby('uid').agg({
        'did': 'nunique',
        'data': 'nunique',
    })
    train_devupdate_feat.reset_index(inplace=True)
    train_devupdate_feat.columns = ['uid', 'devupdate_did_count', 'devupdate_data_count']

    test_devupdate_feat = test_devupdate.groupby('uid').agg({
        'did': 'nunique',
        'data': 'nunique',
    })
    test_devupdate_feat.reset_index(inplace=True)
    test_devupdate_feat.columns = ['uid', 'devupdate_did_count', 'devupdate_data_count']

    train_control_feat = train_control.groupby('uid').agg({
        'did': 'nunique',
        'form': 'nunique',
        'data': 'nunique',
    })
    train_control_feat.reset_index(inplace=True)
    train_control_feat.columns = ['uid', 'devcontrol_did_count',
                                  'devcontrol_form_count', 'devcontrol_data_count']

    test_control_feat = test_control.groupby('uid').agg({
        'did': 'nunique',
        'form': 'nunique',
        'data': 'nunique',
    })
    test_control_feat.reset_index(inplace=True)
    test_control_feat.columns = ['uid', 'devcontrol_did_count',
                                 'devcontrol_form_count', 'devcontrol_data_count']

    train_devlist_feat = train_devlist.groupby('uid').agg({
        'did': 'nunique',
        'type': 'nunique',
        'area': ['unique', 'nunique', 'count']
    })
    train_devlist_feat.reset_index(inplace=True)
    train_devlist_feat.columns = [x[0] + x[1] for x in train_devlist_feat.columns]

    test_devlist_feat = test_devlist.groupby('uid').agg({
        'did': 'nunique',
        'type': 'nunique',
        'area': ['unique', 'nunique', 'count']
    })
    test_devlist_feat.reset_index(inplace=True)
    test_devlist_feat.columns = [x[0] + x[1] for x in test_devlist_feat.columns]

    train_feat = train_cus.merge(train_devlist_feat, on='uid')
    train_feat = train_feat.merge(train_control_feat, on='uid', how='left')
    train_feat = train_feat.merge(train_devupdate_feat, on='uid', how='left')
    train_feat.fillna(0, inplace=True)

    test_feat = test_cus.merge(test_devlist_feat, on='uid')
    test_feat = test_feat.merge(test_control_feat, on='uid', how='left')
    test_feat = test_feat.merge(test_devupdate_feat, on='uid', how='left')
    test_feat.fillna(0, inplace=True)

    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(max_features=400)
    train_dev_tfidf = tfidf.fit_transform(train_feat['areaunique'].apply(lambda x: ' '.join(x)))
    test_dev_tfidf = tfidf.transform(test_feat['areaunique'].apply(lambda x: ' '.join(x)))

    train_dev_tfidf = pd.DataFrame(train_dev_tfidf.toarray(), columns=tfidf.get_feature_names())
    test_dev_tfidf = pd.DataFrame(test_dev_tfidf.toarray(), columns=tfidf.get_feature_names())

    train_feat = pd.concat([train_dev_tfidf, train_feat], axis=1)
    x = train_feat.drop(['uid', 'label', 'areaunique'], axis=1)
    y = train_feat['label']
    test_feat = pd.concat([test_dev_tfidf, test_feat], axis=1)
    x_test = test_feat.drop(['uid', 'areaunique'], axis=1)
    return x, y, x_test, test_cus


if __name__ == '__main__':
    trainData, trainLabel, x_test, test_cus = loadTrainData()
    print(trainLabel.value_counts())
    # 处理样本不平衡
    oversample = SMOTE()
    X_train_prepared_smote, y_train_smote = oversample.fit_resample(trainData, trainLabel)

    # 划分训练集和测试
    x_train, x_val, y_train, y_val = train_test_split(X_train_prepared_smote, y_train_smote, test_size=0.2, shuffle=True,
                                                      random_state=0)
    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=200, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=200, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5),
            RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=300, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=400, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.1, subsample=0.5, max_depth=6, n_estimators=5),
            ]

    # 多维数组，用来存放每个模型生成的训练集预测结果
    dataset_train = np.zeros((x_train.shape[0], len(clfs)))
    print('x_train.shape')
    print(x_train.shape)
    # 多维数组，用来存放每个模型生成的测试集预测结果
    dataset_val = np.zeros((x_val.shape[0], len(clfs)))
    dataset_test = np.zeros((x_test.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        # 用五种不一样初级训练集获得五个初级学习器
        clf.fit(x_train, y_train)
        y_submission = clf.predict(x_val)
        print(classification_report(y_val, y_submission))
        # 每一个初级学习器得出验证集的预测结果，也就是得出全部训练集的1/5
        # 在五次初级学习器循环预测完后就得到全部训练集的预测值
        dataset_train[:, j] = clf.predict(x_train)
        # 五个初级学习器将test交替训练一遍
        dataset_val[:, j] = clf.predict(x_val)
        dataset_test[:, j] = clf.predict(x_test)
    # 模型融合
    clf = GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=19, n_estimators=80)
    clf.fit(dataset_train, y_train)  # 此时dataset_train是训练集的预测值
    # 生成预测集结果向量
    # X[:,1]是numpy中数组的一种写法，表示对一个二维数组，取该二维数组第一维中的所有数据，第二维中取第1个数据
    y_submission = clf.predict(dataset_val)
    print(classification_report(y_val, y_submission))
    test_cus['label'] = clf.predict(dataset_test)
    print(test_cus['label'].value_counts())
    test_cus.to_csv('2022070502.csv', index=None)




0    855
1     60
Name: label, dtype: int64
x_train.shape
(1368, 409)
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       173
           1       0.96      0.98      0.97       169

    accuracy                           0.97       342
   macro avg       0.97      0.97      0.97       342
weighted avg       0.97      0.97      0.97       342

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       173
           1       0.96      0.98      0.97       169

    accuracy                           0.97       342
   macro avg       0.97      0.97      0.97       342
weighted avg       0.97      0.97      0.97       342

              precision    recall  f1-score   support

           0       1.00      0.95      0.97       173
           1       0.95      1.00      0.97       169

    accuracy                           0.97       342
   macro avg       0.97      0.97      0.97       342
weig