In [2]:
import os
from time import time
import random


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import median

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import tree
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [3]:
# NOTE(review): both paths are absolute and specific to one machine; the
# notebook will not run elsewhere until they are pointed at a local copy.
# Directory whose entries are loaded as CSV files by the experiment loop.
data_dir = "/home/chsu6/scout/Data"
# Not referenced by any other cell visible in this notebook.
normalized_data_dir = "/home/chsu6/scout/NormalizedData"

# Every directory entry is listed, without filtering by extension or type.
file_list = os.listdir(data_dir)

In [4]:
def score_mape(y, y_predicted):
    """Return the mean absolute percentage error of a prediction.

    Args:
        y: Observed values (any array-like of numbers).
        y_predicted: Predicted values, same length as ``y``.

    Returns:
        The mean of ``|y - y_predicted| / (|y| + 1e-6)``.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_predicted, dtype=float)
    # Divide by the magnitude of the observation: dividing by the signed value
    # produces negative "errors" for negative observations, which then cancel
    # positive ones in the mean. The small epsilon keeps zero observations
    # from dividing by zero.
    return np.mean(np.abs(actual - predicted) / (np.abs(actual) + 0.000001))

def get_score_function(name):
    """Look up a scoring function by its short name.

    Args:
        name: ``"mse"`` for ``mean_squared_error`` or ``"mape"`` for
            ``score_mape``.

    Returns:
        The scoring callable registered under ``name``.

    Raises:
        ValueError: If ``name`` is not one of the supported names. Failing
            here is clearer than returning ``None`` and crashing later when
            the caller tries to invoke the result.
    """
    if name == "mse":
        return mean_squared_error
    if name == "mape":
        return score_mape
    raise ValueError(
        "unknown score function {!r}; expected 'mse' or 'mape'".format(name)
    )

def random_strategy(select_percentage, df_candidate, df_test, score_func=get_score_function("mse")):
    """Train a decision tree on a random subset of candidates and score it.

    The last column of both frames is treated as the target; all preceding
    columns are the features.

    Args:
        select_percentage: Fraction of ``df_candidate`` rows to train on. At
            least 5 rows are used whenever that many are available.
        df_candidate: DataFrame to draw training rows from.
        df_test: DataFrame the fitted tree is evaluated on.
        score_func: Callable ``(y_true, y_predicted) -> score``.

    Returns:
        A tuple ``(score, select_index)`` where ``select_index`` is the list
        of ``df_candidate`` index labels used for training.
    """
    candidate_labels = list(df_candidate.index)
    wanted = max(5, int(select_percentage * len(candidate_labels)))
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available candidates.
    select_index = random.sample(candidate_labels, min(wanted, len(candidate_labels)))
    # `.ix` has been removed from pandas: rows are chosen by label (.loc),
    # feature/target columns by position (.iloc).
    df_training = df_candidate.loc[select_index, :]
    clf = tree.DecisionTreeRegressor()
    clf.fit(df_training.iloc[:, :-1], df_training.iloc[:, -1])
    test_predicted = clf.predict(df_test.iloc[:, :-1])
    return (score_func(df_test.iloc[:, -1], test_predicted), select_index)

def brute_force_random_strategy(select_percentage, df_candidate, df_test, num_iter=1000, score_name="mse"):
    """Repeat ``random_strategy`` ``num_iter`` times and collect every outcome.

    Args:
        select_percentage: Forwarded to ``random_strategy``.
        df_candidate: Forwarded to ``random_strategy``.
        df_test: Forwarded to ``random_strategy``.
        num_iter: Number of independent random trials to run.
        score_name: Name resolved through ``get_score_function``.

    Returns:
        A tuple ``(score_records, index_records)``: the list of scores in
        trial order, and a dict mapping the trial number to the training
        index labels selected in that trial.
    """
    scorer = get_score_function(score_name)
    trials = [
        random_strategy(select_percentage, df_candidate, df_test, score_func=scorer)
        for _ in range(num_iter)
    ]
    score_records = [trial_score for trial_score, _ in trials]
    index_records = {trial_no: chosen for trial_no, (_, chosen) in enumerate(trials)}
    return (score_records, index_records)
    
def create_filtered_df(file_path):
    """Load a CSV file and drop the rows whose last column equals zero.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with the surviving rows; original index labels are kept.
    """
    df = pd.read_csv(file_path)
    # `.ix` has been removed from pandas; the last column is addressed by
    # position, which is what `.iloc` is for.
    return df[df.iloc[:, -1] != 0]

In [28]:
class data_item():
    """One data row: an identifier, its decision values and its objective."""

    def __init__(self, id, decisions, objective):
        """Store the row.

        Args:
            id: Identifier of the row.
            decisions: Iterable of decision (independent) values.
            objective: Objective (dependent) value.
        """
        from collections.abc import Iterator

        # A one-shot iterator (e.g. a Python 3 ``map`` object) would be
        # exhausted by the first read, leaving later reads and ``repr`` empty.
        # Materialize it; real containers are stored as passed.
        if isinstance(decisions, Iterator):
            decisions = list(decisions)
        self.id = id
        self.decisions = decisions
        self.objective = objective

    def __repr__(self):
        """Render as ``id|d1,d2,...|objective``."""
        return str(self.id)+ "|" +",".join(map(str, self.decisions)) + "|" + str(self.objective)
    
def read_csv(filename, header=False):
    """Read a numeric CSV file into a list of ``data_item`` objects.

    The first row is always treated as the header and is never parsed as
    data. In every later row all cells but the last become the decisions and
    the last cell becomes the objective; all are converted with ``float``.
    Each item's id is the row's position in the file (the first data row is 1).

    Args:
        filename: Path of the CSV file.
        header: When true, also return the header row.

    Returns:
        ``data`` when ``header`` is false, otherwise ``(H, data)`` where ``H``
        is the list of header cells (empty if the file has no rows).

    Raises:
        ValueError: If a data cell cannot be converted to ``float``.
    """
    import csv

    H = []  # stays empty for an empty file instead of being unbound
    data = []
    # The context manager closes the file even if a row fails to parse;
    # newline='' is the mode the csv module documents for reader input.
    with open(filename, 'r', newline='') as f:
        for i, row in enumerate(csv.reader(f)):
            if i == 0:
                H = row
                continue
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to parse.
                continue
            # Build a real list: a lazy ``map`` could only be read once.
            data.append(data_item(i, [float(cell) for cell in row[:-1]], float(row[-1])))
    if header:
        return H, data
    return data

def model_cart(training_indep, training_dep, testing_indep, testing_dep):
    """Fit a decision-tree regressor and return per-sample relative errors.

    Args:
        training_indep: Feature rows used to fit the tree.
        training_dep: Target values matching ``training_indep``.
        testing_indep: Feature rows to predict.
        testing_dep: True target values matching ``testing_indep``.

    Returns:
        A list with ``abs(actual - predicted) / actual`` for every test sample
        whose actual value is non-zero. A sample whose actual value is zero
        contributes ``0`` only when the prediction is also zero, and is
        skipped otherwise.
    """
    from sklearn import tree
    regressor = tree.DecisionTreeRegressor().fit(training_indep, training_dep)
    predicted = [float(value) for value in regressor.predict(testing_indep)]

    mre = []
    for actual, guess in zip(testing_dep, predicted):
        if actual == 0:
            # Relative error is undefined here; only an exact hit is recorded.
            if actual == guess:
                mre.append(0)
            continue
        mre.append(abs(actual - guess) / float(actual))
    return mre

def where_data_transformation(filename):
    """Cluster the rows of a CSV file with the project's ``where`` routine.

    Columns whose name contains ``'$<'`` are dropped before clustering.

    NOTE(review): an earlier comment here said the raw data has to be accessed
    through ``table._rows.cells``; nothing in this function does so -- confirm
    against the ``Utilities.WHERE.where`` implementation.

    Args:
        filename: Path of the CSV file to cluster.

    Returns:
        Whatever ``where`` returns for the filtered frame.
    """
    from Utilities.WHERE.where import where
    import pandas as pd
    frame = pd.read_csv(filename)
    kept_columns = [column for column in frame.columns if '$<' not in column]
    return where(frame[kept_columns])

def experiment(filename, test_independet, test_dependent, sampling=None, nop=1):
    """Train CART on points sampled from WHERE clusters and score a test set.

    The file is clustered with ``where_data_transformation``; ``nop`` points
    are drawn at random from every cluster to form the training set, whose
    targets are looked up from the file contents. The fitted model is then
    evaluated on the test set supplied by the caller.

    Args:
        filename: CSV file holding the candidate (training pool) rows.
        test_independet: Feature rows of the held-out test set. (The
            misspelled name is kept so keyword callers continue to work.)
        test_dependent: Target values of the held-out test set.
        sampling: Unused.
        nop: Number of points drawn from each cluster; capped at the cluster
            size.

    Returns:
        The mean of the relative errors reported by ``model_cart``, rounded
        to 3 decimals.

    Raises:
        KeyError: If a sampled cluster point has no matching row in the file.
    """
    # NOTE(review): the original imported ``median`` but called ``mean``;
    # the call is honoured here -- confirm which statistic was intended.
    from numpy import mean

    # Map each row's decisions (comma-joined) to its objective value.
    content_dict = {}
    for c in read_csv(filename):
        key = ",".join(map(str, c.decisions))
        content_dict[key] = float(c.objective)

    clusters = where_data_transformation(filename)

    train_independent = []
    train_dependent = []
    for cluster in clusters:
        # random.choice takes a single sequence; random.sample draws several
        # distinct positions. Cap the count so small clusters do not raise.
        picks = random.sample(range(len(cluster)), min(nop, len(cluster)))
        for random_point_index in picks:
            point = cluster[random_point_index]
            train_independent.append(point)

            key = ",".join(map(str, map(float, point)))
            train_dependent.append(content_dict[key])

    # Evaluate on the caller's test set; it used to be shadowed by empty lists.
    mre = model_cart(train_independent, train_dependent, test_independet, test_dependent)
    return round(mean(mre), 3)

In [29]:
# Loop-invariant settings, hoisted out of the per-file loop.
score_name = "mape"
candidate_percentage = 0.6  # share of rows that may be used for training
num_iter = 1000             # random-strategy trials per file
f_tmp = "/tmp/abc.csv"      # scratch file handed to the WHERE experiment

for f in sorted(file_list):
    print("[{}]".format(f))
    df = create_filtered_df(os.path.join(data_dir, f))

    # Split the rows into a candidate pool and a disjoint test set.
    index_candidate = random.sample(list(df.index), int(candidate_percentage*len(df.index)))
    candidate_labels = set(index_candidate)  # O(1) membership instead of a list scan per row
    index_test = [row for row in df.index if row not in candidate_labels]
    assert len(index_candidate) + len(index_test) == len(df)

    # `.ix` has been removed from pandas: rows by label (.loc), columns by position (.iloc).
    df_candidate = df.loc[index_candidate]
    df_test = df.loc[index_test]

    # WHERE-based sampling, evaluated on the same test set as the random baseline.
    df_candidate.to_csv(f_tmp, index=False)
    where_score = experiment(f_tmp, df_test.iloc[:, :-1], df_test.iloc[:, -1])
    print("\t* The WHERE MRE={:.4f}".format(where_score))

    # A leftover debugging `break` used to stop here, so the statistics below
    # never ran. brute_force_random_strategy returns (scores, indices); only
    # the scores are summarised.
    score_records, _ = brute_force_random_strategy(0.1, df_candidate, df_test, num_iter=num_iter, score_name=score_name)
    print("\t* The min {}={:.4f}".format(score_name.upper(), min(score_records)))
    print("\t* The max {}={:.4f}".format(score_name.upper(), max(score_records)))
    print("\t* The mean {}={:.4f}".format(score_name.upper(), np.mean(score_records)))
    print("\t* The median {}={:.4f}".format(score_name.upper(), np.median(score_records)))
    print("\t* The std {}={:.4f}".format(score_name.upper(), np.std(score_records)))

[1_tp_read.csv]


ImportError: No module named 'Utilities'

In [27]:
import sys
# Add the current working directory to the module search path.
# NOTE(review): presumably meant to let `from Utilities.WHERE.where import where`
# resolve. This cell sits at the bottom of the notebook, so a top-to-bottom
# run reaches it only after the cell that needs the import -- confirm and
# move it next to the other imports.
sys.path.append('.')