In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
def load_file(filepath):
    '''
    Load a single whitespace-delimited, header-less text file.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        The parsed table as a 2-D array (``DataFrame.values``).
    '''
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True flag, which emits a FutureWarning on pandas >= 2.2.
    dataframe = pd.read_csv(filepath, header=None, sep=r'\s+')
    return dataframe.values

In [3]:
def load_dataset_group_raw(group, prefix=''):
    '''
    Load the feature file and the label file of one data split.

    Both files are looked up under ``prefix + group`` and are named
    ``X_<group>.txt`` and ``y_<group>.txt``. The shapes of the two loaded
    arrays are printed before they are returned as ``(X, y)``.
    '''
    group_dir = prefix + group
    features = load_file(group_dir + '/X_' + group + '.txt')
    labels = load_file(group_dir + '/y_' + group + '.txt')
    shape_report = '{}_X.shape:{},{}_y.shape:{}\n'.format(
        group, features.shape, group, labels.shape
    )
    print(shape_report)
    return features, labels

In [4]:
def load_dataset_raw(prefix='D:/GraduationCode/01 Datasets/UCI HAR Dataset/'):
    '''
    Load the 'train' and 'test' splits found under ``prefix``.

    Returns ``(trainX, trainy, testX, testy)``; each label array is reduced
    to its first column so the labels come back one-dimensional.
    '''
    train_features, train_labels = load_dataset_group_raw('train', prefix)
    test_features, test_labels = load_dataset_group_raw('test', prefix)
    # Keep only column 0 of each label array (flattens the labels).
    return train_features, train_labels[:, 0], test_features, test_labels[:, 0]

In [5]:
def define_models(models=None):
    '''
    Build a dictionary of standard classifiers to evaluate, {name: estimator}.

    Parameters
    ----------
    models : dict, optional
        Existing dictionary to add the estimators to. It is updated in place
        and returned. When omitted, a fresh dictionary is created on every
        call.

    Returns
    -------
    dict
        Mapping of short model name to an unfitted estimator instance.
    '''
    # A mutable default (``models=dict()``) is created once at definition time
    # and shared by every call; use a None sentinel so calls stay independent.
    if models is None:
        models = dict()
    # Non-linear models
    models['knn'] = KNeighborsClassifier(n_neighbors=7)
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    # Ensemble models
    models['bag'] = BaggingClassifier(n_estimators=100)
    models['rf'] = RandomForestClassifier(n_estimators=100)
    models['et'] = ExtraTreesClassifier(n_estimators=100)
    models['gbm'] = GradientBoostingClassifier(n_estimators=100)
    print('Defined %d models' % len(models))

    return models

In [6]:
def evaluate_model(trainX, trainy, testX, testy, model):
    '''
    Fit one model on the training data and score it on the test data.

    Returns the accuracy of the model's test predictions as a percentage.
    '''
    model.fit(trainX, trainy)
    predictions = model.predict(testX)
    # accuracy_score yields a fraction; scale it to a percentage.
    return accuracy_score(testy, predictions) * 100.0

def evaluate_models(trainX, trainy, testX, testy, models):
    '''
    Evaluate every model in a {name: estimator} dictionary.

    Each model is scored with ``evaluate_model`` and its score is printed as
    soon as it is available. Returns a {name: score} dictionary.
    '''
    scores = {}
    for label in models:
        score = evaluate_model(trainX, trainy, testX, testy, models[label])
        scores[label] = score
        print('>%s: %.3f' % (label, score))
    return scores

In [7]:
def summarize_results(results, maximize=True):
    '''
    Print the scores in a {name: score} dictionary, ordered by score.

    With ``maximize=True`` the ascending ordering is reversed, so the highest
    score (e.g. accuracy) is printed first; otherwise the lowest comes first.
    '''
    ranked = list(results.items())
    # Sort ascending by score.
    ranked.sort(key=lambda pair: pair[1])
    # Flip to descending order when larger scores are better.
    if maximize:
        ranked.reverse()
    print()
    for name, score in ranked:
        print('Name=%s, Score=%.3f' % (name, score))

In [8]:
# Load the train/test splits from the default dataset location.
trainX, trainy, testX, testy = load_dataset_raw()
models = define_models() # {name: estimator} dictionary of models to evaluate
# Fit and score every model on the test split, then print the scores
# (highest first, since summarize_results defaults to maximize=True).
results = evaluate_models(trainX, trainy, testX, testy, models)
summarize_results(results)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/GraduationCode/01 Datasets/UCI HAR Dataset/train/X_train.txt'

# Modeling Raw Data

In [9]:
def load_file(filepath):
    '''
    Load a single whitespace-delimited, header-less text file.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        The parsed table as a 2-D array (``DataFrame.values``).
    '''
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True flag, which emits a FutureWarning on pandas >= 2.2.
    dataframe = pd.read_csv(filepath, header=None, sep=r'\s+')
    return dataframe.values

def load_dataset(data_rootdir, dirname, group):
    '''
    Stack every file found under ``data_rootdir + dirname`` into a 3-D array
    and load the matching label file.

    Parameters
    ----------
    data_rootdir : str
        Directory of one data split. The label file is read from
        ``data_rootdir + '/y_' + group + '.txt'``.
    dirname : str
        Path fragment appended directly to ``data_rootdir`` to locate the
        directory that is walked (recursively) for data files.
    group : str
        Split name used in the label file name and in the printed summary.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the ``np.dstack`` of all loaded files (one
        slice along the third axis per file, in sorted path order) and ``y``
        is the loaded label array.

    Raises
    ------
    ValueError
        If no files are found under ``data_rootdir + dirname``.
    '''
    filepath_list = []

    # os.walk traverses the directory tree and yields the files of each folder.
    for rootdir, _dirnames, filenames in os.walk(data_rootdir + dirname):
        for filename in filenames:
            filepath_list.append(os.path.join(rootdir, filename))

    # Directory listings come back in arbitrary, filesystem-dependent order.
    # Sort so the third axis has the same file order on every run and for
    # every split.
    filepath_list.sort()

    if not filepath_list:
        raise ValueError('no data files found under %r' % (data_rootdir + dirname))

    # dstack stacks along a third axis: for 2-D inputs the first two
    # dimensions are unchanged and the third grows by one per file.
    X = np.dstack([load_file(filepath) for filepath in filepath_list])
    y = load_file(data_rootdir+'/y_'+group+'.txt')
    print('{}_X.shape:{},{}_y.shape:{}\n'.format(group,X.shape,group,y.shape))
    return X, y


In [10]:
def _load_signal_group(group_dir, signals_dirname, group):
    '''Stack all signal files of one split into a 3-D array and load its labels.'''
    signal_paths = sorted(
        os.path.join(rootdir, filename)
        for rootdir, _dirnames, filenames in os.walk(os.path.join(group_dir, signals_dirname))
        for filename in filenames
    )
    if not signal_paths:
        raise ValueError(
            'no signal files found under %r' % os.path.join(group_dir, signals_dirname)
        )
    # Sorted order keeps the third axis identical for every split and run.
    X = np.dstack([load_file(path) for path in signal_paths])
    y = load_file(group_dir + '/y_' + group + '.txt')
    return X, y


def load_dataset(prefix='', signals_dirname='Inertial Signals'):
    '''
    Load the 'train' and 'test' splits of the raw signal files and flatten
    each sample into a single feature vector.

    Parameters
    ----------
    prefix : str
        Dataset root. It is concatenated directly with the split name
        (``prefix + 'train'``), so it should end with a path separator.
    signals_dirname : str
        Sub-directory of each split that holds the per-signal files.
        NOTE(review): the default assumes the standard UCI HAR layout
        (``<split>/Inertial Signals/``) - TODO confirm against the data.

    Returns
    -------
    tuple
        ``(trainX, trainy, testX, testy)`` with each X reshaped to 2-D
        (samples, dim1 * dim2) and each y reduced to its first column.

    Raises
    ------
    ValueError
        If a split's signal directory contains no files.
    '''
    # Load both splits first; the reshapes below need the 3-D arrays to exist.
    trainX, trainy = _load_signal_group(prefix + 'train', signals_dirname, 'train')
    testX, testy = _load_signal_group(prefix + 'test', signals_dirname, 'test')
    # Collapse the last two axes into one feature axis per sample.
    trainX = trainX.reshape((trainX.shape[0], trainX.shape[1] * trainX.shape[2]))
    testX = testX.reshape((testX.shape[0], testX.shape[1] * testX.shape[2]))
    # Flatten the label arrays to 1-D.
    trainy, testy = trainy[:,0], testy[:,0]
    print(trainX.shape, trainy.shape, testX.shape, testy.shape)
    return trainX, trainy, testX, testy

In [11]:
# NOTE(review): this cell does not assign trainX, trainy, testX, testy or
# models itself; it relies on an earlier cell having defined them. Presumably
# the flattened signal data was meant to be loaded first - TODO confirm.
results = evaluate_models(trainX, trainy, testX, testy, models)
summarize_results(results)

NameError: name 'trainX' is not defined