In [70]:
import pandas as pd
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from util.util import *
from features import *
import json
from scipy import stats

In [6]:
train_dir = './data/train/'
train = [ train_dir + f for f in os.listdir(train_dir)]
train = sorted(train)
test_dir = './data/test/'
test = [ test_dir + f for f in os.listdir(test_dir)]
test = sorted(test)
total = train + test

In [7]:
# Here are the codes of the CO-sensors and CO-alarms
sensors = ["CQ 32301 XQ01", "CQ 32302 XQ01", "CQ 32303 XQ01", "CQ 32304 XQ01", "CQ 32305 XQ01"]
alarms = ["CQ 32306 XH01", "CQ 32306 XH03", "CQ 32306 XH05"]

### read common feature from file

In [9]:
common_features = json.load(open("commonFeatures_new", 'r'))

In [10]:
train_features = common_features + sensors + alarms

### generate features and sampling

In [11]:
def readFile(fn, train_features, alarms):
    temp = pd.read_csv(fn,engine='python' )
        # if alarm doe not exist, then add a correspond column
    for alarm in alarms:
        if alarm not in temp.columns:
            temp[alarm] = 0

    temp = temp[train_features]
    return temp

In [12]:
def n_alarm_in_data(df):

    xh1 = df[df['CQ 32306 XH01']==1]
    return len(xh1)

In [13]:
def sample_in_campaign(campaign, n=2000, func=None, kwargs=None, alg='normal', na=True, verbose=True):
    
    sampled_df = pd.DataFrame()
    for fn in campaign:
        if verbose:
            print(fn)
        data = readFile(fn, train_features, alarms)
        # random sampling
        if alg == 'normal':
            if func is not None:
                data = func(data,**kwargs)
                if na is True:
                    data = data.dropna(axis=0,how='any')
                else:
                    data = data.fillna(na)
            data = data.sample(n)
        # random sample no alarm records and all alarm records
        if alg == 'all_alarms':
            n = n_alarm_in_data(data)
            data = random_sampling(dataset, diff=1, max_n_no_alarm=n, max_n_alarm = n)
        
        sampled_df = pd.concat([sampled_df, data], axis=0)
        del data
        
    return sampled_df

In [14]:
campaign1 = train[0:6]
campaign2 = train[6:13]
campaign3 = train[13:24]
campaign4 = train[24:36]
campaign5 = train[36:]
campaigns = [campaign1,campaign2,campaign3,campaign4,campaign5]

### sampling

In [38]:
%%time
sampled_df = pd.DataFrame()
kwargs = {'columns':common_features, 'label':sensors+alarms}
for campaign in campaigns:
    data = sample_in_campaign(campaign, n=2000, func=None, kwargs=kwargs)
    sampled_df = pd.concat([sampled_df, data], axis=0)
    del data
sampled_df

./data/train/data_2013_05_12.csv
./data/train/data_2013_05_13.csv
./data/train/data_2013_05_14.csv
./data/train/data_2013_05_15.csv
./data/train/data_2013_05_16.csv
./data/train/data_2013_05_17.csv
./data/train/data_2013_12_01.csv
./data/train/data_2013_12_02.csv
./data/train/data_2013_12_03.csv
./data/train/data_2013_12_04.csv
./data/train/data_2013_12_05.csv
./data/train/data_2013_12_06.csv
./data/train/data_2013_12_07.csv
./data/train/data_2014_05_04.csv
./data/train/data_2014_05_05.csv
./data/train/data_2014_05_06.csv
./data/train/data_2014_05_07.csv
./data/train/data_2014_05_08.csv
./data/train/data_2014_05_09.csv
./data/train/data_2014_05_10.csv
./data/train/data_2014_05_11.csv
./data/train/data_2014_05_12.csv
./data/train/data_2014_05_13.csv
./data/train/data_2014_05_14.csv
./data/train/data_2015_01_25.csv
./data/train/data_2015_01_26.csv
./data/train/data_2015_01_27.csv
./data/train/data_2015_01_28.csv
./data/train/data_2015_01_29.csv
./data/train/data_2015_01_30.csv
./data/tra

Unnamed: 0,time,CP 32403 XQ01,CP 32202 XQ01,CL 32303 XQ01,HK 32303 XQ01,CQ 32501_2 XQ01,HK 32409 XQ01,CT 32732 XQ01,CF 32915 XQ01,HK 32406 XQ01,...,CT 32203 XQ01,CT 32711 XQ01,CQ 32301 XQ01,CQ 32302 XQ01,CQ 32303 XQ01,CQ 32304 XQ01,CQ 32305 XQ01,CQ 32306 XH01,CQ 32306 XH03,CQ 32306 XH05
79800,2013-05-12 22:10:00,0.0112,-0.160000,103.12,348.2999,-2.5,40.64999,11.84999,0.0008,79.37999,...,69.56998,12.045,0.03,0.30,1.65,0.03,1.23,0.0,0.0,0.0
52366,2013-05-12 14:32:46,0.0112,-0.120000,103.13,348.5700,20.0,40.50000,11.84999,0.0000,79.37999,...,70.37997,13.065,-0.18,0.21,1.47,0.00,1.26,0.0,0.0,0.0
35892,2013-05-12 09:58:12,0.0112,-0.160000,103.17,348.3899,22.5,40.70999,12.20999,0.0008,79.34999,...,70.91998,12.090,0.00,0.24,1.59,0.03,1.17,0.0,0.0,0.0
2600,2013-05-12 00:43:20,0.0112,-0.160000,103.14,348.3899,15.0,40.64999,14.17199,0.0008,79.40999,...,72.81000,14.580,-0.06,0.27,1.56,-0.03,1.17,0.0,0.0,0.0
6727,2013-05-12 01:52:07,0.0120,-0.160000,103.15,348.5700,17.5,40.56000,14.02799,0.0008,79.37999,...,72.62997,14.025,-0.06,0.27,1.56,-0.06,1.17,0.0,0.0,0.0
63736,2013-05-12 17:42:16,0.0104,-0.160000,103.14,347.9399,32.5,40.35000,12.08399,0.0000,79.43999,...,70.01999,13.335,-0.12,0.30,1.56,0.00,1.23,0.0,0.0,0.0
42157,2013-05-12 11:42:37,0.0112,-0.160000,103.15,348.6600,12.5,40.53000,11.97598,0.0008,79.37999,...,70.46997,12.375,-0.09,0.21,1.59,0.03,1.23,0.0,0.0,0.0
24909,2013-05-12 06:55:09,0.0096,-0.160000,103.15,348.4800,22.5,40.74000,12.92999,0.0008,79.40999,...,71.81998,12.855,-0.06,0.30,1.59,0.03,1.17,0.0,0.0,0.0
63385,2013-05-12 17:36:25,0.0104,-0.160000,103.14,348.1199,27.5,40.59000,12.08399,0.0008,79.23000,...,70.01999,13.260,-0.06,0.30,1.65,0.00,1.23,0.0,0.0,0.0
72843,2013-05-12 20:14:03,0.0104,-0.160000,103.14,348.3899,7.5,40.38000,12.17399,0.0008,79.25999,...,69.74997,13.185,-0.12,0.24,1.62,0.00,1.29,0.0,0.0,0.0


In [42]:
sampled_df = sampled_df.reset_index(drop=True)
sampled_df['time'] = pd.to_datetime(sampled_df['time'])
sampled_df = sampled_df.set_index('time')
sampled_df
sampled_df.to_csv('./data/sampeld_none.csv')

Unnamed: 0_level_0,CP 32403 XQ01,CP 32202 XQ01,CL 32303 XQ01,HK 32303 XQ01,CQ 32501_2 XQ01,HK 32409 XQ01,CT 32732 XQ01,CF 32915 XQ01,HK 32406 XQ01,CL 32402 XQ01,...,CT 32203 XQ01,CT 32711 XQ01,CQ 32301 XQ01,CQ 32302 XQ01,CQ 32303 XQ01,CQ 32304 XQ01,CQ 32305 XQ01,CQ 32306 XH01,CQ 32306 XH03,CQ 32306 XH05
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-05-12 22:10:00,0.0112,-0.160000,103.12,348.2999,-2.5,40.64999,11.84999,0.0008,79.37999,62.22999,...,69.56998,12.045,0.03,0.30,1.65,0.03,1.23,0.0,0.0,0.0
2013-05-12 14:32:46,0.0112,-0.120000,103.13,348.5700,20.0,40.50000,11.84999,0.0000,79.37999,62.25999,...,70.37997,13.065,-0.18,0.21,1.47,0.00,1.26,0.0,0.0,0.0
2013-05-12 09:58:12,0.0112,-0.160000,103.17,348.3899,22.5,40.70999,12.20999,0.0008,79.34999,62.31999,...,70.91998,12.090,0.00,0.24,1.59,0.03,1.17,0.0,0.0,0.0
2013-05-12 00:43:20,0.0112,-0.160000,103.14,348.3899,15.0,40.64999,14.17199,0.0008,79.40999,62.26999,...,72.81000,14.580,-0.06,0.27,1.56,-0.03,1.17,0.0,0.0,0.0
2013-05-12 01:52:07,0.0120,-0.160000,103.15,348.5700,17.5,40.56000,14.02799,0.0008,79.37999,62.27999,...,72.62997,14.025,-0.06,0.27,1.56,-0.06,1.17,0.0,0.0,0.0
2013-05-12 17:42:16,0.0104,-0.160000,103.14,347.9399,32.5,40.35000,12.08399,0.0000,79.43999,62.23999,...,70.01999,13.335,-0.12,0.30,1.56,0.00,1.23,0.0,0.0,0.0
2013-05-12 11:42:37,0.0112,-0.160000,103.15,348.6600,12.5,40.53000,11.97598,0.0008,79.37999,62.27999,...,70.46997,12.375,-0.09,0.21,1.59,0.03,1.23,0.0,0.0,0.0
2013-05-12 06:55:09,0.0096,-0.160000,103.15,348.4800,22.5,40.74000,12.92999,0.0008,79.40999,62.33999,...,71.81998,12.855,-0.06,0.30,1.59,0.03,1.17,0.0,0.0,0.0
2013-05-12 17:36:25,0.0104,-0.160000,103.14,348.1199,27.5,40.59000,12.08399,0.0008,79.23000,62.24999,...,70.01999,13.260,-0.06,0.30,1.65,0.00,1.23,0.0,0.0,0.0
2013-05-12 20:14:03,0.0104,-0.160000,103.14,348.3899,7.5,40.38000,12.17399,0.0008,79.25999,62.23999,...,69.74997,13.185,-0.12,0.24,1.62,0.00,1.29,0.0,0.0,0.0


### feature selection

In [30]:
data = readFile(train[2], train_features, alarms)

In [32]:
del data

In [13]:
del data

# Feature Selection

In [38]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline



### train and test

In [52]:
sampled_df = sampled_df.sample(frac=1)
length = int(0.8*len(sampled_df))
train = sampled_df[0:length]
test = sampled_df[length:]
del sampled_df

In [53]:
X_train = train.drop(sensors+alarms, axis=1)
yl_train = [train[i] for i in sensors]
X_test = test.drop(sensors+alarms, axis=1)
yl_test = [test[i] for i in sensors]
del train
del test

In [43]:
def define_pipeline(ifpca=True,n_components=50):
    pipelines = []
    for i in range(5):
        scaler = StandardScaler()
        if ifpca:
            pca = PCA(n_components = n_components)
        param_dist = {'n_estimators':100,'max_depth': 4}
        xgb_model =  xgb.XGBRegressor(**param_dist)
        if ifpca:
            pipeline = Pipeline([("Standard scaler",scaler), 
                             ("PCA", pca), 
                             ("XGBoost", xgb_model)])
        else:
            pipeline = Pipeline([("Standard scaler",scaler), ("XGBoost", xgb_model)])
        pipelines.append(pipeline)
    return pipelines

In [67]:
def train_all_models(pipelines,features,X_train, X_test, yl_train, yl_test,verbose=True):
    mses = []
    i=0
    X_train = X_train[features]
    X_test = X_test[features]
    for y_train, y_test in zip(yl_train, yl_test):


        y_train = define_inf(y_train)
        y_test = define_inf(y_test)
        actuals = y_test

        pipelines[i].fit(X_train, y_train)
        if verbose:
            print("F-score:")
            print(pipelines[i].score(X_test, y_test))
        predictions = pipelines[i].predict(X_test)
        mse = mean_squared_error(actuals, predictions)
        
        print("MSE:")
        print(mse)
        mses.append(mse)
        i += 1
        del predictions
        del y_train
        del y_test

    mses = np.array(mses)
    
    print("average MSE:")
    print(np.mean(mses))
    return pipelines


In [None]:
pipelines = train_all_models(pipelines,X_train, X_test, yl_train, yl_test,verbose=True)

### get top n features

In [1]:
def get_top_n_features(pipelines, columns):
    
    top_f_list = []
    for i in range(len(pipelines)):
        a = pipelines[i].get_params()
        b = a.get('XGBoost')
        booster = b.booster()
        top_features = sorted(booster.get_fscore().iteritems(), key=lambda d:d[1], reverse = True)[0:10]
        li = []
        for f in top_features:
            name = f[0]
            index = int(name[1:])
            li.append(all_features[index])
        top_f_list.append(li)
    return top_f_list

In [118]:
all_features = X_train.columns.tolist()
features = get_top_n_features(pipelines, all_features)
pickle.dump(features,open('./data/features_none','w'),2)

[['CL 32402 XQ01', 'CP 32403 XQ01', 'CQ 32501_2 XQ01', 'HK 32409 XQ01', 'CP 32202 XQ01', 'CL 32303 XQ01', 'HK 32423 XQ01', 'CF 32916 XQ01', 'CF 32923 XQ01', 'CF 32412 XQ01'], ['CP 32403 XQ01', 'CP 32202 XQ01', 'CL 32303 XQ01', 'CL 32402 XQ01', 'HK 32409 XQ01', 'CP 32906 XQ01', 'CT 32402 XQ01', 'CF 32915 XQ01', 'CT 32732 XQ01', 'HK 32303 XQ01'], ['CP 32202 XQ01', 'CL 32303 XQ01', 'CP 32403 XQ01', 'CP 32906 XQ01', 'CF 32915 XQ01', 'CL 32402 XQ01', 'HK 32303 XQ01', 'LV 32402 XQ01', 'CF 32501 XQ01', 'CP 32418 XQ01'], ['CP 32403 XQ01', 'CP 32202 XQ01', 'CT 32732 XQ01', 'CP 32906 XQ01', 'CL 32303 XQ01', 'CF 32915 XQ01', 'CL 32402 XQ01', 'CF 32201 XQ01', 'CT 32716 XQ01', 'CT 32402 XQ01'], ['CP 32202 XQ01', 'CP 32403 XQ01', 'HK 32409 XQ01', 'CL 32303 XQ01', 'CL 32402 XQ01', 'CF 32923 XQ01', 'CQ 32501_2 XQ01', 'HK 32406 XQ01', 'CP 32906 XQ01', 'HK 32303 XQ01']]


### prediction

In [3]:
def read_campaign_with_features(campaign, features, n=False,func=None, kwargs=None, na=True, verbose=True):
    sampled_df = pd.DataFrame()
    for fn in campaign:
        if verbose:
            print(fn)
        data = readFile(fn, train_features, alarms)
        
        if func is not None:
            data = func(data,**kwargs)
            if na is True:
                data = data.dropna(axis=0,how='any')
            else:
                data = data.fillna(na)
        if n:
            data = data.sample(n)
            C
        data = data[features]
        
        sampled_df = pd.concat([sampled_df, data], axis=0)
        del data
        
    return sampled_df
    

In [31]:
%%time
features = pickle.load(open('./data/features_none', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()

sampled_df = pd.DataFrame()
kwargs = {'columns':common_features, 'label':sensors+alarms}
comm_features = comm_features + sensors + alarms

data1 = read_campaign_with_features(campaign1, comm_features,func=None, kwargs=None, na=True, verbose=True)
data2 = read_campaign_with_features(campaign2, comm_features,func=None, kwargs=None, na=True, verbose=True)
data3 = read_campaign_with_features(campaign3, comm_features,func=None, kwargs=None, na=True, verbose=True)
data4 = read_campaign_with_features(campaign4, comm_features,func=None, kwargs=None, na=True, verbose=True)
data5 = read_campaign_with_features(campaign5, comm_features,func=None, kwargs=None, na=True, verbose=True)
data = [data1,data2,data3,data4,data5]

./data/train/data_2013_05_12.csv
./data/train/data_2013_05_13.csv
./data/train/data_2013_05_14.csv
./data/train/data_2013_05_15.csv
./data/train/data_2013_05_16.csv
./data/train/data_2013_05_17.csv
./data/train/data_2013_12_01.csv
./data/train/data_2013_12_02.csv
./data/train/data_2013_12_03.csv
./data/train/data_2013_12_04.csv
./data/train/data_2013_12_05.csv
./data/train/data_2013_12_06.csv
./data/train/data_2013_12_07.csv
./data/train/data_2014_05_04.csv
./data/train/data_2014_05_05.csv
./data/train/data_2014_05_06.csv
./data/train/data_2014_05_07.csv
./data/train/data_2014_05_08.csv
./data/train/data_2014_05_09.csv
./data/train/data_2014_05_10.csv
./data/train/data_2014_05_11.csv
./data/train/data_2014_05_12.csv
./data/train/data_2014_05_13.csv
./data/train/data_2014_05_14.csv
./data/train/data_2015_01_25.csv
./data/train/data_2015_01_26.csv
./data/train/data_2015_01_27.csv
./data/train/data_2015_01_28.csv
./data/train/data_2015_01_29.csv
./data/train/data_2015_01_30.csv
./data/tra

In [40]:
def generate_train_test(data_list, test_num=0):
    
    test = data_list[test_num]
    train = pd.DataFrame()
    
    for i,d in enumerate(data_list):
        if i != test_num:
            train = pd.concat([train, d], axis=0)
        
    
    X_train = train.drop(sensors+alarms, axis=1)
    yl_train = [train[i] for i in sensors]
    X_test = test.drop(sensors+alarms, axis=1)
    yl_test = [test[i] for i in sensors]
    del train
    del test
    return X_train,yl_train, X_test,yl_test
    

### train models and test

In [68]:
data5 =  data5.fillna(0)
for i in range(5):
    X_train, yl_train, X_test, yl_test = generate_train_test(data, test_num=i)
    print('test data : campaign {}'.format(i))
    pipelines = define_pipeline(ifpca=True,n_components=10)
    pipelines = train_all_models(pipelines,features[i],X_train, X_test, yl_train, yl_test,verbose=True)
    del X_train
    del yl_train
    del X_test
    del yl_test
    

test data : campaign 0


NameError: global name 'np' is not defined

In [71]:
pipelines = train_all_models(pipelines,features[i],X_train, X_test, yl_train, yl_test,verbose=True)

NameError: global name 'np' is not defined

In [115]:
df2 = campaign1[1]
df2 = df2.set_index('time')
df2.index = pd.to_datetime(df2.index)
X = df2.drop(sensors+alarms, axis=1)
yl = [df[i] for i in sensors]
mses = []
for i, y in enumerate(yl):
    actuals = y
    predictions = pipelines[i].predict(X)
    print("score:")
    print(pipelines[i].score(X, y))
    mse = mean_squared_error(actuals, predictions)
    mses.append(mse)
    print("MSE:")
    print(mse)
mses = np.array(mses)
print("average MSE:")
print(np.mean(mses))

score:
-0.000715835871543
MSE:
0.0155230170149
score:
-0.111711356875
MSE:
0.0102317670896
score:
-0.0368301640818
MSE:
0.0131677697966
score:
-4.62055153532
MSE:
0.184248029938
score:
-0.192173729819
MSE:
0.00678166215812
average MSE:
0.0459904491994


In [105]:
campaign2 = readData(test[0:2], train_features, alarms)

./data/test/data_2013_05_12.csv read successfully
./data/test/data_2013_05_13.csv read successfully


In [119]:
df3 = campaign2[0]
df3 = df3.set_index('time')
df3.index = pd.to_datetime(df3.index)
X = df3.drop(sensors+alarms, axis=1)

yl = [df[i] for i in sensors]
mses = []
for i, y in enumerate(yl):
    actuals = y
    predictions = pipelines[i].predict(X)
    print("score:")
    print(pipelines[i].score(X, y))
    mse = mean_squared_error(actuals, predictions)
    mses.append(mse)
    print("MSE:")
    print(mse)
mses = np.array(mses)
print("average MSE:")
print(np.mean(mses))

score:
0.818557822106
MSE:
0.00281451528367
score:
0.762466252672
MSE:
0.00218616996538
score:
0.695984824539
MSE:
0.00386100056097
score:
0.535155063282
MSE:
0.0152381422497
score:
0.649494501452
MSE:
0.00199384520583
average MSE:
0.0052187346531


In [101]:
df3[sensors].describe()

Unnamed: 0,CQ 32301 XQ01,CQ 32302 XQ01,CQ 32303 XQ01,CQ 32304 XQ01,CQ 32305 XQ01
count,86400.0,86400.0,86400.0,86400.0,86400.0
mean,-0.052628,0.258398,1.59455,-0.008808,1.243319
std,0.065752,0.041591,0.03413,0.049914,0.067281
min,-0.27,0.09,1.44,-0.21,1.05
25%,-0.09,0.21,1.59,-0.06,1.2
50%,-0.06,0.27,1.59,0.0,1.23
75%,0.0,0.3,1.62,0.03,1.26
max,0.09,0.36,1.74,0.12,1.47


In [119]:
feature_dict = models[0].booster().get_fscore()
feature_dict = sorted(feature_dict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
feature_dict[0:30]

[('HK 32422 XQ01', 48),
 ('CP 32403 XQ01', 27),
 ('HK 32901 XQ01', 26),
 ('CQ 32502 XQ01', 26),
 ('CQ 32501_2 XQ01', 22),
 ('CT 32912 XQ01', 20),
 ('HK 32405 XQ01', 19),
 ('HK 32409 XQ01', 18),
 ('CT 32212 XQ01', 17),
 ('CT 32101 XQ01', 17),
 ('CP 32202 XQ01', 17),
 ('CP 32102 XQ01', 16),
 ('CT 32721 XQ01', 14),
 ('CF 32923 XQ01', 14),
 ('CQ 32505 XQ01', 14),
 ('CT 32409 XQ01', 13),
 ('CT 32410 XQ01', 13),
 ('CP 32100 XQ01', 13),
 ('HK 32416 XQ01', 13),
 ('CF 32703 XQ01', 13),
 ('HK 32413 XQ01', 12),
 ('CP 32903 XQ01', 12),
 ('CT 32213 XQ01', 12),
 ('FV 32902 XQ01', 12),
 ('CP 32922 XQ01', 12),
 ('HK 32424 XQ01', 12),
 ('CT 32207 XQ01', 12),
 ('CL 32303 XQ01', 12),
 ('HK 32407 XQ01', 12),
 ('CP 32418 XQ01', 11)]

# Spearmann Correlation

In [29]:
# example
stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])

SpearmanrResult(correlation=0.82078268166812329, pvalue=0.088587005313543812)