In [1]:
import pandas as pd
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from util.util import *
from features import *
import json
from scipy import stats

In [2]:
train_dir = '/home/i40/data/AGDTask2/train/'
train = [ train_dir + f for f in os.listdir(train_dir)]
train = sorted(train)
test_dir = '/home/i40/data/AGDTask2/test/'
test = [ test_dir + f for f in os.listdir(test_dir)]
test = sorted(test)
total = train + test

In [3]:
# Here are the codes of the CO-sensors and CO-alarms
sensors = ["CQ 32301 XQ01", "CQ 32302 XQ01", "CQ 32303 XQ01", "CQ 32304 XQ01", "CQ 32305 XQ01"]
alarms = ["CQ 32306 XH01", "CQ 32306 XH03", "CQ 32306 XH05"]

### read common feature from file

In [4]:
common_features = json.load(open("commonFeatures_new", 'r'))

In [5]:
train_features = common_features + sensors + alarms

### generate features and sampling

In [6]:
def readFile(fn, train_features, alarms):
    temp = pd.read_csv(fn,engine='python' )
        # if alarm doe not exist, then add a correspond column
    for alarm in alarms:
        if alarm not in temp.columns:
            temp[alarm] = 0

    temp = temp[train_features]
    return temp

In [7]:
def n_alarm_in_data(df):

    xh1 = df[df['CQ 32306 XH01']==1]
    return len(xh1)

In [15]:
a=[1,2] 
if type(a) is list:
    print a

[1, 2]


In [11]:
def sample_in_campaign(campaign, n=2000, func=None, kwargs=None, alg='normal', na=True, verbose=True,\
                       features=None, sample=True):
    
    
    
    
    sampled_df = pd.DataFrame()
            
    for fn in campaign:
        if verbose:
            print(fn)
        data = readFile(fn, train_features, alarms)
        # random sampling
        if alg == 'normal':
            
            if type(func) is list:
                dataset = pd.DataFrame()
                for fun in func:
                    if fun is None:
                        if verbose:
                            print(fun)
                        d = data
                    else:
                        d = fun(data,**kwargs)
                        # here is a bug
                        d = d.drop(sensors+alarms,axis=1)
                    if na is True:
                        d = d.dropna(axis=0,how='any')
                        d = d.reset_index(drop=True)
                    else:
                        d = d.fillna(na)
                    dataset = pd.concat([dataset, d], axis=1)
                if sample:
                    data = dataset.sample(n) 
                elif features is not None:
                    data = dataset[features]
                else:
                    data = dataset
                if verbose:
                    print(data.shape)
                del dataset  


            else:
                if func is not None:
                
                    data = func(data,**kwargs)
                if na is True:
                    data = data.dropna(axis=0,how='any')
                    data = data.reset_index(drop=True)
                else:
                    data = data.fillna(na)

                data = data.sample(n)
                
                
        # random sample no alarm records and all alarm records
        if alg == 'all_alarms':
            n = n_alarm_in_data(data)
            if n==0:
                if verbose:
                    print('continue')
                continue
            if type(func) is list:
                dataset = pd.DataFrame()
                for fun in func:
                    if fun is None:
                        if verbose:
                            print(fun)
                        d = data
                    else:
                        d = fun(data,**kwargs)
                        # here is a bug
                        d = d.drop(sensors+alarms,axis=1)
                    if na is True:
                        d = d.dropna(axis=0,how='any')
                        d = d.reset_index(drop=True)
                    else:
                        d = d.fillna(na)
                    dataset = pd.concat([dataset, d], axis=1)
                    
                
                data = random_sampling(dataset, diff=1, max_n_no_alarm=n, max_n_alarm = n)
                
                del dataset  
            else:
                n = n_alarm_in_data(data)
                data = random_sampling(data, diff=1, max_n_no_alarm=n, max_n_alarm = n)
        
        
        sampled_df = pd.concat([sampled_df, data], axis=0)
        del data
        
    return sampled_df

In [9]:
campaign1 = train[0:6]
campaign2 = train[6:13]
campaign3 = train[13:24]
campaign4 = train[24:36]
campaign5 = train[36:]
campaigns = [campaign1,campaign2,campaign3,campaign4,campaign5]

### sampling

### model 1

In [21]:
%%time
sampled_df = pd.DataFrame()
common_features = list_diff(common_features,['time'])
kwargs = {'columns':common_features, 'label':sensors+alarms}
func = [None,cf_mean_window,cf_std_window,cf_var_window,cf_diff,cf_kurtosis_window,\
        cf_skew_window,cf_max_window,cf_min_window]
for campaign in campaigns:
    data = sample_in_campaign(campaign, n=2000, func=func, na=1000,kwargs=kwargs)
    sampled_df = pd.concat([sampled_df, data], axis=0)
    del data
sampled_df

/home/i40/data/AGDTask2/train/data_2013_05_12.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_05_13.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_05_14.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_05_15.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_05_16.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_05_17.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_01.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_02.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_03.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_04.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_05.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_06.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2013_12_07.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2014_05_04.csv
None
(2000, 2179)
/home/i40/data/AGDTask2/train/data_2014_05_05.cs

### model 2

In [12]:
%%time
sampled_df = pd.DataFrame()
common_features = list_diff(common_features,['time'])
kwargs = {'columns':common_features, 'label':sensors+alarms}
func = [None,cf_mean_window,cf_std_window,cf_var_window,cf_diff,cf_kurtosis_window,\
        cf_skew_window,cf_max_window,cf_min_window]
for campaign in campaigns:
    data = sample_in_campaign(campaign, n=2000, func=func, na=1000,kwargs=kwargs,alg='all_alarms')
    sampled_df = pd.concat([sampled_df, data], axis=0)
    del data
sampled_df

/home/i40/data/AGDTask2/train/data_2013_05_12.csv
continue
/home/i40/data/AGDTask2/train/data_2013_05_13.csv
continue
/home/i40/data/AGDTask2/train/data_2013_05_14.csv
None
/home/i40/data/AGDTask2/train/data_2013_05_15.csv
None
/home/i40/data/AGDTask2/train/data_2013_05_16.csv
continue
/home/i40/data/AGDTask2/train/data_2013_05_17.csv
continue
/home/i40/data/AGDTask2/train/data_2013_12_01.csv
continue
/home/i40/data/AGDTask2/train/data_2013_12_02.csv
continue
/home/i40/data/AGDTask2/train/data_2013_12_03.csv
None
/home/i40/data/AGDTask2/train/data_2013_12_04.csv
None
/home/i40/data/AGDTask2/train/data_2013_12_05.csv
None
/home/i40/data/AGDTask2/train/data_2013_12_06.csv
None
/home/i40/data/AGDTask2/train/data_2013_12_07.csv
continue
/home/i40/data/AGDTask2/train/data_2014_05_04.csv
continue
/home/i40/data/AGDTask2/train/data_2014_05_05.csv
continue
/home/i40/data/AGDTask2/train/data_2014_05_06.csv
None
/home/i40/data/AGDTask2/train/data_2014_05_07.csv
None
/home/i40/data/AGDTask2/train

In [13]:
print(sampled_df.shape)
sampled_df.to_csv('./data/sampled_model2_data.csv')

In [None]:
sampled_df = sampled_df.drop(['cf_diff_time'],axis=1)

In [None]:

sampled_df = sampled_df.reset_index(drop=True)
sampled_df['time'] = pd.to_datetime(sampled_df['time'])
sampled_df = sampled_df.set_index('time')
sampled_df
#sampled_df.to_csv('./data/sampeld_cf_mean_window.csv')

### feature selection

In [30]:
data = readFile(train[2], train_features, alarms)

In [32]:
del data

In [13]:
del data

# Feature Selection

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline



### train and test

In [16]:
sampled_df = sampled_df.sample(frac=1)
length = int(0.8*len(sampled_df))
train = sampled_df[0:length]
test = sampled_df[length:]
del sampled_df

In [17]:
X_train = train.drop(sensors+alarms, axis=1)
yl_train = [train[i] for i in sensors]
X_test = test.drop(sensors+alarms, axis=1)
yl_test = [test[i] for i in sensors]
del train
del test

In [18]:
def define_pipeline(ifpca=True,n_components=50):
    pipelines = []
    for i in range(5):
        scaler = StandardScaler()
        if ifpca:
            pca = PCA(n_components = n_components)
        param_dist = {'n_estimators':100,'max_depth': 4}
        xgb_model =  xgb.XGBRegressor(**param_dist)
        if ifpca:
            pipeline = Pipeline([("Standard scaler",scaler), 
                             ("PCA", pca), 
                             ("XGBoost", xgb_model)])
        else:
            pipeline = Pipeline([("Standard scaler",scaler), ("XGBoost", xgb_model)])
        pipelines.append(pipeline)
    return pipelines

In [53]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
def train_all_models(pipelines,X_train, X_test, yl_train, yl_test,columns=None,verbose=True,\
                     alarm=False,alarm_true=None):
    mses = []
    i=0
    if columns is not None:
        X_train = X_train.loc[:,columns]
        X_test = X_test.loc[:,columns]
    if alarm:
        y_alarms = pd.DataFrame()
        y_alarms_true = alarm_true


    for y_train, y_test in zip(yl_train, yl_test):


        y_train = define_inf(y_train)
        y_test = define_inf(y_test)
        actuals = y_test

        pipelines[i].fit(X_train, y_train)
        if verbose:
            print("F-score:")
            print(pipelines[i].score(X_test, y_test))
        predictions = pipelines[i].predict(X_test)
        mse = mean_squared_error(actuals, predictions)
        
        if alarm:
            predictions = pd.DataFrame(predictions)
            y_alarms = pd.concat([y_alarms,predictions],axis=1)
        print("MSE: {}".format(mse))
        mses.append(mse)
        i += 1
        del predictions
        del y_train
        del y_test

    if alarm:
        def calc_alarm(row):
            return (row > 30).any()
        y_alarm_pre = y_alarms.apply(lambda row: calc_alarm(row), axis=1 )
        recall = recall_score(y_alarms_true,y_alarm_pre)
        precision = precision_score(y_alarms_true,y_alarm_pre)
        
        result = 5*(recall*precision)/(4*precision+recall)
        print('recall score: {}'.format(recall))
        print('precision score {}'.format(precision))
        print('f2 score: {}'.format(result))
        return pipelines, result
        
    mses = np.array(mses)
    mmses = np.mean(mses)
    print("average MSE: {}".format(mmses))
    return pipelines, mmses


In [39]:
print(alarms)
def or_op(row):
    return (row > 30).any()
data6[alarms].apply(lambda row: or_op(row), axis=1 )



 ['CQ 32306 XH01', 'CQ 32306 XH03', 'CQ 32306 XH05']
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

KeyboardInterrupt: 

In [None]:
i = 0
#yl_train[i].columns = ['CQ 32302 XQ01','a','a','a','a','a','a','a','a']
#yl_train[i] = yl_train[i][['CQ 32302 XQ01']]
s = 'CQ 3230'+str(i+1)+' XQ01'
print s
yl_train[i] = pd.DataFrame(yl_train[i].values,columns=[s])
yl_train[i]

In [None]:
i = 4
s = 'CQ 3230'+str(i+1)+' XQ01'
yl_test[i].columns = [s,'a','a','a','a','a','a','a','a']
yl_test[i] = yl_test[i][[s]]

#print s
#yl_train[i] = pd.DataFrame(yl_train[i].values,columns=[s])
yl_test[i]

In [21]:
X_train = X_train.drop(['time'],axis=1)
X_test = X_test.drop(['time'],axis=1)

In [22]:
pipelines = define_pipeline(ifpca=False,n_components=10)
pipelines, result = train_all_models(pipelines,X_train, X_test, yl_train, yl_test,verbose=True)

F-score:
0.805831086375
MSE: 44.4557960453
F-score:
0.846718025829
MSE: 0.661215595227
F-score:
0.93533526956
MSE: 0.385062460537
F-score:
0.968580914717
MSE: 10.2255368615
F-score:
0.960914415354
MSE: 0.705276024025
average MSE: 11.2865773973


### get top n features

In [23]:
def get_top_n_features(pipelines, columns, n=10):
    
    # for every sensor
    top_f_list = []
    for i in range(len(pipelines)):
        a = pipelines[i].get_params()
        b = a.get('XGBoost')
        booster = b.booster()
        top_features = sorted(booster.get_fscore().iteritems(), key=lambda d:d[1], reverse = True)[0:n]
        li = []
        for f in top_features:
            name = f[0]
            index = int(name[1:])
            li.append(all_features[index])
        top_f_list.append(li)
    return top_f_list

In [24]:
all_features = X_train.columns.tolist()

features = get_top_n_features(pipelines, all_features, n=20)
pickle.dump(features,open('./data/features_model2_20','w'),2)

### prediction

In [27]:
def read_campaign_with_features(campaign, features, n=False,func=None, kwargs=None, na=True, verbose=True):
    sampled_df = pd.DataFrame()
    for fn in campaign:
        if verbose:
            print(fn)
        data = readFile(fn, train_features, alarms)
        
        if func is not None:
            data = func(data,**kwargs)
            if na is True:
                data = data.dropna(axis=0,how='any')
            else:
                data = data.fillna(na)
        if n:
            data = data.sample(n)
            
        data = data[features]
        
        sampled_df = pd.concat([sampled_df, data], axis=0)
        del data
        
    return sampled_df
    

### model 1

In [26]:
features = pickle.load(open('./data/features_all_top_10', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()
comm_features = list_diff(comm_features,['time'])
comm_features

['CP 32403 XQ01',
 'HK 32304 XQ01',
 'CT 32304 XQ01',
 'CP 32202 XQ01',
 'CT 32202 XQ01',
 u'cf_max_HK 32304 XQ01',
 'CL 32303 XQ01',
 u'cf_min_CP 32503 XQ01',
 'CT 32410 XQ01',
 u'cf_mean_CT 32410 XQ01',
 'CP 32414 XQ01',
 'CT 32717 XQ01',
 'CT 32502 XQ01',
 u'cf_max_CP 32205 XQ01',
 u'cf_mean_VM 32904 XQ01',
 'CT 32413 XQ01',
 u'cf_diff_HK 32402 XQ01',
 u'cf_std_CP 32704 XQ01',
 'CP 32906 XQ01',
 'F 32411 S XQ21',
 'HK 32303 XQ01',
 'CT 32213 XQ01',
 u'cf_mean_CT 32102 XQ01',
 'HK 32206 XQ01',
 u'cf_max_CP 32203 XQ01',
 u'cf_diff_HK 32306 XQ01',
 u'cf_kurt_CF 32705 XQ01',
 'CP 32205 XQ01',
 u'cf_min_CP 32205 XQ01',
 u'cf_diff_CT 32304 XQ01',
 u'cf_std_CP 32207 XQ01',
 u'cf_kurt_CT 32414 XQ01',
 'CP 32921 XQ01',
 'CP 32104 XQ01',
 u'cf_kurt_HK 32310 XQ01',
 'CT 32402 XQ01',
 u'cf_skew_CF 32902 XQ01',
 u'cf_mean_CT 32717 XQ01']

In [119]:
%%time
features = pickle.load(open('./data/features_all_top_10', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()
comm_features = list_diff(comm_features,['time'])
sampled_df = pd.DataFrame()
kwargs = {'columns':common_features, 'label':sensors+alarms}
comm_features = comm_features + sensors + alarms
func = [None,cf_mean_window,cf_std_window,cf_var_window,cf_diff,cf_kurtosis_window,\
        cf_skew_window,cf_max_window,cf_min_window]

data1 = sample_in_campaign(campaign1, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)
data2 = sample_in_campaign(campaign2, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)
data3 = sample_in_campaign(campaign3, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)
data4 = sample_in_campaign(campaign4, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)
data5 = sample_in_campaign(campaign5, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)
#data5 = read_campaign_with_features(campaign5, comm_features,func=cf_skew_window, kwargs=kwargs, na=1000, verbose=True)
data = [data1,data2,data3,data4,data5]

/home/i40/data/AGDTask2/train/data_2013_05_12.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_05_13.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_05_14.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_05_15.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_05_16.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_05_17.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_01.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_02.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_03.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_04.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_05.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_06.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2013_12_07.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2014_05_04.csv
None
(86400, 101)
/home/i40/data/AGDTask2/train/data_2014_05_05.cs

### model 2

In [27]:
%%time
features = pickle.load(open('./data/features_model2_20', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()
comm_features = list_diff(comm_features,['time'])

sampled_df = pd.DataFrame()
kwargs = {'columns':common_features, 'label':sensors+alarms}
comm_features = comm_features + sensors + alarms
func = [None,cf_mean_window,cf_std_window,cf_var_window,cf_diff,cf_kurtosis_window,\
        cf_skew_window,cf_max_window,cf_min_window]

data6 = sample_in_campaign(campaign1, n=2000, func=func, kwargs=kwargs, alg='normal', na=1000, verbose=True,\
                       features=comm_features, sample=False)

#data5 = read_campaign_with_features(campaign5, comm_features,func=cf_skew_window, kwargs=kwargs, na=1000, verbose=True)


/home/i40/data/AGDTask2/train/data_2013_05_12.csv
None
(86400, 92)
/home/i40/data/AGDTask2/train/data_2013_05_13.csv
None
(86400, 92)
/home/i40/data/AGDTask2/train/data_2013_05_14.csv
None
(86400, 92)
/home/i40/data/AGDTask2/train/data_2013_05_15.csv
None
(86400, 92)
/home/i40/data/AGDTask2/train/data_2013_05_16.csv
None
(86400, 92)
/home/i40/data/AGDTask2/train/data_2013_05_17.csv
None
(86400, 92)
CPU times: user 3min 42s, sys: 35.7 s, total: 4min 17s
Wall time: 4min 17s


In [None]:
data4.to_csv('./data/data4.csv')

In [125]:
def generate_train_test(data_list, test_num=0):
    
    test = data_list[test_num]
    train = pd.DataFrame()
    
    for i,d in enumerate(data_list):
        if i != test_num:
            train = pd.concat([train, d], axis=0)
        
    
    X_train = train.drop(sensors+alarms, axis=1)
    yl_train = [train[i] for i in sensors]
    X_test = test.drop(sensors+alarms, axis=1)
    yl_test = [test[i] for i in sensors]
    del train
    del test
    return X_train,yl_train, X_test,yl_test
    

### train models and test

In [None]:
data5 =  data5.fillna(1000)
results = []
for i in range(5):
    X_train, yl_train, X_test, yl_test = generate_train_test(data, test_num=i)
    print('-------------------test data : campaign {}'.format(i+1))
    pipelines = define_pipeline(ifpca=True,n_components=1)
    
    pipelines, result = train_all_models(pipelines,X_train, X_test, yl_train, yl_test,columns=features[i],\
                                         verbose=False)
    results.append(result)
    del X_train
    del yl_train
    del X_test
    del yl_test
results = np.array(results)
final_result = np.mean(results)
print('final result: {}'.format(final_result))
    

-------------------test data : campaign 1


### model 2

In [54]:
features = pickle.load(open('./data/features_model2_20', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()
comm_features = list_diff(comm_features,['time'])

train = pd.read_csv('./data/sampled_model2_data.csv')
train = train[comm_features+sensors+alarms]
X_train = train.drop(sensors+alarms, axis=1)
yl_train = [train[i] for i in sensors]

X_test = data6.drop(sensors+alarms, axis=1)
yl_test = [data6[i] for i in sensors]
alarm_true = data6['CQ 32306 XH01']

pipelines = define_pipeline(ifpca=True,n_components=10)
    
pipelines, result = train_all_models(pipelines,X_train, X_test, yl_train, yl_test,columns=None,\
                                         verbose=False, alarm=True, alarm_true=alarm_true)

MSE: 9.80515483883
MSE: 6.39845878253
MSE: 0.79282394631
MSE: 32.9307934641
MSE: 9.01122792604
recall score: 0.858644859813
precision score 0.102567680714
f2 score: 0.347025495751


In [55]:
features = pickle.load(open('./data/features_model2_20', 'r'))
comm_features = []
for f in features:
    comm_features.extend(f)

comm_features = pd.Series(comm_features)
comm_features = pd.unique(comm_features).tolist()
comm_features = list_diff(comm_features,['time'])
comm_features

['CP 32403 XQ01',
 'CL 32303 XQ01',
 'CT 32402 XQ01',
 'CT 32304 XQ01',
 u'cf_skew_HK 32301 XQ01',
 u'cf_min_HK 32304 XQ01',
 u'cf_kurt_CT 32702 XQ01',
 'HK 32304 XQ01',
 'CT 32202 XQ01',
 u'cf_min_CT 32304 XQ01',
 u'cf_kurt_TV 32502 XQ01',
 u'cf_kurt_HK 32205 XQ01',
 u'cf_kurt_HK 32401 XQ01',
 u'cf_skew_HK 32309 XQ01',
 u'cf_skew_HK 32413 XQ01',
 u'cf_skew_CT 32102 XQ01',
 u'cf_std_FV 32903 XQ01',
 u'cf_kurt_CP 32905 XQ01',
 'HK 32408 XQ01',
 u'cf_max_CQ 32501_1 XQ01',
 'FV 32904 XQ01',
 'CP 32202 XQ01',
 u'cf_kurt_HK 32306 XQ01',
 u'cf_kurt_CT 32214 XQ01',
 'CT 32213 XQ01',
 u'cf_kurt_CF 32703 XQ01',
 u'cf_kurt_CP 32936 XQ01',
 u'cf_skew_W 32101 S XQ21',
 u'cf_kurt_VM 32904 XQ01',
 u'cf_skew_HK 32308 XQ01',
 u'cf_max_CP 32203 XQ01',
 u'cf_std_CQ 32505 XQ01',
 u'cf_kurt_F 32409 S XQ21',
 u'cf_kurt_CP 32921 XQ01',
 'CT 32409 XQ01',
 u'cf_kurt_F 32918 S XQ21',
 u'cf_skew_CP 32100 XQ01',
 u'cf_kurt_CT 32901 XQ01',
 u'cf_mean_CT 32218 XQ01',
 'CP 32205 XQ01',
 'CQ 32201 XQ01',
 u'cf_mean_

In [None]:
X_train.loc[:,features[0]]

In [None]:
pipelines = train_all_models(pipelines,features[i],X_train, X_test, yl_train, yl_test,verbose=True)

In [115]:
df2 = campaign1[1]
df2 = df2.set_index('time')
df2.index = pd.to_datetime(df2.index)
X = df2.drop(sensors+alarms, axis=1)
yl = [df[i] for i in sensors]
mses = []
for i, y in enumerate(yl):
    actuals = y
    predictions = pipelines[i].predict(X)
    print("score:")
    print(pipelines[i].score(X, y))
    mse = mean_squared_error(actuals, predictions)
    mses.append(mse)
    print("MSE:")
    print(mse)
mses = np.array(mses)
print("average MSE:")
print(np.mean(mses))

score:
-0.000715835871543
MSE:
0.0155230170149
score:
-0.111711356875
MSE:
0.0102317670896
score:
-0.0368301640818
MSE:
0.0131677697966
score:
-4.62055153532
MSE:
0.184248029938
score:
-0.192173729819
MSE:
0.00678166215812
average MSE:
0.0459904491994


In [105]:
campaign2 = readData(test[0:2], train_features, alarms)

./data/test/data_2013_05_12.csv read successfully
./data/test/data_2013_05_13.csv read successfully


In [119]:
df3 = campaign2[0]
df3 = df3.set_index('time')
df3.index = pd.to_datetime(df3.index)
X = df3.drop(sensors+alarms, axis=1)

yl = [df[i] for i in sensors]
mses = []
for i, y in enumerate(yl):
    actuals = y
    predictions = pipelines[i].predict(X)
    print("score:")
    print(pipelines[i].score(X, y))
    mse = mean_squared_error(actuals, predictions)
    mses.append(mse)
    print("MSE:")
    print(mse)
mses = np.array(mses)
print("average MSE:")
print(np.mean(mses))

score:
0.818557822106
MSE:
0.00281451528367
score:
0.762466252672
MSE:
0.00218616996538
score:
0.695984824539
MSE:
0.00386100056097
score:
0.535155063282
MSE:
0.0152381422497
score:
0.649494501452
MSE:
0.00199384520583
average MSE:
0.0052187346531


In [101]:
df3[sensors].describe()

Unnamed: 0,CQ 32301 XQ01,CQ 32302 XQ01,CQ 32303 XQ01,CQ 32304 XQ01,CQ 32305 XQ01
count,86400.0,86400.0,86400.0,86400.0,86400.0
mean,-0.052628,0.258398,1.59455,-0.008808,1.243319
std,0.065752,0.041591,0.03413,0.049914,0.067281
min,-0.27,0.09,1.44,-0.21,1.05
25%,-0.09,0.21,1.59,-0.06,1.2
50%,-0.06,0.27,1.59,0.0,1.23
75%,0.0,0.3,1.62,0.03,1.26
max,0.09,0.36,1.74,0.12,1.47


In [119]:
feature_dict = models[0].booster().get_fscore()
feature_dict = sorted(feature_dict.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
feature_dict[0:30]

[('HK 32422 XQ01', 48),
 ('CP 32403 XQ01', 27),
 ('HK 32901 XQ01', 26),
 ('CQ 32502 XQ01', 26),
 ('CQ 32501_2 XQ01', 22),
 ('CT 32912 XQ01', 20),
 ('HK 32405 XQ01', 19),
 ('HK 32409 XQ01', 18),
 ('CT 32212 XQ01', 17),
 ('CT 32101 XQ01', 17),
 ('CP 32202 XQ01', 17),
 ('CP 32102 XQ01', 16),
 ('CT 32721 XQ01', 14),
 ('CF 32923 XQ01', 14),
 ('CQ 32505 XQ01', 14),
 ('CT 32409 XQ01', 13),
 ('CT 32410 XQ01', 13),
 ('CP 32100 XQ01', 13),
 ('HK 32416 XQ01', 13),
 ('CF 32703 XQ01', 13),
 ('HK 32413 XQ01', 12),
 ('CP 32903 XQ01', 12),
 ('CT 32213 XQ01', 12),
 ('FV 32902 XQ01', 12),
 ('CP 32922 XQ01', 12),
 ('HK 32424 XQ01', 12),
 ('CT 32207 XQ01', 12),
 ('CL 32303 XQ01', 12),
 ('HK 32407 XQ01', 12),
 ('CP 32418 XQ01', 11)]

# Spearmann Correlation

In [29]:
# example
stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])

SpearmanrResult(correlation=0.82078268166812329, pvalue=0.088587005313543812)