In [2]:
import os
from time import time
import random


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import median

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import tree
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [3]:
# Directory whose entries are enumerated below and read one by one in the
# experiment loop. NOTE(review): absolute, user-specific path — the notebook
# will not run on another machine without editing it.
data_dir = "/home/chsu6/scout/Data"
# Not referenced by any cell visible in this notebook excerpt; presumably a
# normalized copy of the same files — TODO confirm.
normalized_data_dir = "/home/chsu6/scout/NormalizedData"

# Bare entry names (not full paths) of everything in data_dir. os.listdir
# returns them in arbitrary order and does not filter by type or extension,
# so consumers sort the list and join each name with data_dir themselves.
file_list = os.listdir(data_dir)

In [4]:
def score_mape(y, y_predicted):
    """Return the mean absolute percentage error between targets and predictions.

    The result is a fraction (0.25 means 25%), not multiplied by 100.

    Parameters
    ----------
    y : array-like supporting element-wise arithmetic (ndarray, Series)
        True target values.
    y_predicted : array-like compatible with ``y``
        Predicted values.

    Returns
    -------
    float
        Mean of ``|y - y_predicted| / (|y| + 1e-6)``.
    """
    absolute_error = np.abs(y - y_predicted)
    # Normalise by the magnitude of the target: dividing by the signed value
    # made errors on negative targets negative, letting them cancel positive
    # errors in the mean. The small constant avoids division by zero when a
    # target is exactly 0; for non-negative targets the value is unchanged.
    scale = np.abs(y) + 0.000001
    return np.mean(absolute_error / scale)

def get_score_function(name):
    """Look up a scoring callable by its short name.

    Parameters
    ----------
    name : str
        ``"mse"`` for ``mean_squared_error`` or ``"mape"`` for ``score_mape``.

    Returns
    -------
    callable
        A function taking ``(y, y_predicted)`` and returning a score.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names. Previously an unknown
        name fell through and returned ``None``, which only failed later with
        an unrelated "'NoneType' object is not callable" error.
    """
    if name == "mse":
        return mean_squared_error
    if name == "mape":
        return score_mape
    raise ValueError(
        "Unknown score function {!r}; expected 'mse' or 'mape'".format(name)
    )

def random_strategy(select_percentage, df_candidate, df_test, score_func=get_score_function("mse")):
    """Train a decision tree on a random subset of candidates and score it.

    Both frames are treated as "all columns but the last are features, the
    last column is the target".

    Parameters
    ----------
    select_percentage : float
        Fraction of ``df_candidate`` rows to draw for training.
    df_candidate : pandas.DataFrame
        Pool of rows from which the training sample is drawn.
    df_test : pandas.DataFrame
        Rows the fitted model is evaluated on.
    score_func : callable, optional
        ``score_func(y_true, y_predicted)``; defaults to the "mse" scorer.

    Returns
    -------
    The value returned by ``score_func`` on the test rows.
    """
    candidate_labels = list(df_candidate.index)
    # Draw at least 5 rows, but never more than the pool holds: random.sample
    # raises ValueError when asked for more items than the population has.
    sample_size = min(
        len(candidate_labels),
        max(5, int(select_percentage * len(candidate_labels))),
    )
    select_index = random.sample(candidate_labels, sample_size)
    # `.ix` was removed from pandas; rows are picked by index label (.loc),
    # feature/target columns by position (.iloc).
    df_training = df_candidate.loc[select_index, :]
    clf = tree.DecisionTreeRegressor()
    clf.fit(df_training.iloc[:, :-1], df_training.iloc[:, -1])
    test_predicted = clf.predict(df_test.iloc[:, :-1])
    return score_func(df_test.iloc[:, -1], test_predicted)

def brute_force_random_strategy(select_percentage, df_candidate, df_test, num_iter=1000, score_name="mse"):
    """Repeat ``random_strategy`` ``num_iter`` times and collect every score.

    Parameters
    ----------
    select_percentage : float
        Forwarded to ``random_strategy`` as the training-sample fraction.
    df_candidate, df_test : pandas.DataFrame
        Forwarded unchanged to ``random_strategy``.
    num_iter : int, optional
        Number of independent random trials.
    score_name : str, optional
        Name resolved once through ``get_score_function``.

    Returns
    -------
    list
        One score per trial, in the order the trials were run.
    """
    scorer = get_score_function(score_name)
    return [
        random_strategy(select_percentage, df_candidate, df_test, score_func=scorer)
        for _ in range(num_iter)
    ]
    
def create_filtered_df(file_path):
    """Load a CSV and drop every row whose last column equals 0.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read with ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, keeping their original index labels (the index
        is not reset, so it may have gaps).
    """
    df = pd.read_csv(file_path)
    # `.ix` was removed from pandas; the last column is selected by position.
    last_column = df.iloc[:, -1]
    return df[last_column != 0]

In [8]:
# For every data file: hold out a random test split, run the random-sampling
# baseline several times on the remaining candidates, and print summary
# statistics of the resulting scores.
score_name = "mape"
# Loop-invariant experiment settings, hoisted out of the per-file loop.
candidate_percentage = 0.6  # fraction of rows placed in the candidate pool
num_iter = 10               # number of random trials per file

for f in sorted(file_list):
    print("[{}]".format(f))
    df = create_filtered_df(os.path.join(data_dir, f))
    index_candidate = random.sample(list(df.index), int(candidate_percentage*len(df.index)))
    # A set makes the membership test O(1); checking against the list made
    # the split quadratic in the number of rows.
    candidate_lookup = set(index_candidate)
    index_test = [row for row in df.index if row not in candidate_lookup]
    assert len(index_candidate) + len(index_test) == len(df)

    # `.ix` was removed from pandas; both splits are selected by index label.
    score_records = brute_force_random_strategy(0.1, df.loc[index_candidate, :], df.loc[index_test, :], num_iter=num_iter, score_name=score_name)
    #for score in score_records:
    #    print("\t{}={:.4f}".format(score_name.upper(), score))
    print("\t* The min {}={:.4f}".format(score_name.upper(), min(score_records)))
    print("\t* The max {}={:.4f}".format(score_name.upper(), max(score_records)))
    print("\t* The mean {}={:.4f}".format(score_name.upper(), np.mean(score_records)))
    print("\t* The median {}={:.4f}".format(score_name.upper(), np.median(score_records)))
    print("\t* The std {}={:.4f}".format(score_name.upper(), np.std(score_records)))

[1_tp_read.csv]
	* The min MAPE=0.4201
	* The max MAPE=4.4720
	* The mean MAPE=1.9148
	* The median MAPE=1.8062
	* The std MAPE=1.1162
[2_tp_write.csv]
	* The min MAPE=0.1535
	* The max MAPE=0.2849
	* The mean MAPE=0.2052
	* The median MAPE=0.1941
	* The std MAPE=0.0425
[3_tp_read.csv]
	* The min MAPE=0.4516
	* The max MAPE=0.7822
	* The mean MAPE=0.6170
	* The median MAPE=0.6073
	* The std MAPE=0.1157
[4_tp_write.csv]
	* The min MAPE=0.3415
	* The max MAPE=0.6004
	* The mean MAPE=0.4278
	* The median MAPE=0.3949
	* The std MAPE=0.0750
[ds101_ops_read.csv]
	* The min MAPE=0.3775
	* The max MAPE=0.9486
	* The mean MAPE=0.6030
	* The median MAPE=0.5122
	* The std MAPE=0.2131
[ds101_ops_write.csv]
	* The min MAPE=0.2239
	* The max MAPE=0.4438
	* The mean MAPE=0.3046
	* The median MAPE=0.2862
	* The std MAPE=0.0739
[ds101_rt_read.csv]
	* The min MAPE=0.3179
	* The max MAPE=0.5202
	* The mean MAPE=0.4142
	* The median MAPE=0.4187
	* The std MAPE=0.0701
[ds101_rt_write.csv]


	* The min MAPE=0.1044
	* The max MAPE=0.3388
	* The mean MAPE=0.1952
	* The median MAPE=0.1735
	* The std MAPE=0.0665
[ds101_tp_read.csv]
	* The min MAPE=0.3395
	* The max MAPE=3.8462
	* The mean MAPE=1.1758
	* The median MAPE=0.6471
	* The std MAPE=1.0705
[ds101_tp_write.csv]
	* The min MAPE=0.2427
	* The max MAPE=0.4926
	* The mean MAPE=0.3610
	* The median MAPE=0.3630
	* The std MAPE=0.0834
