In [2]:
import os
from time import time
import random
import datetime

import scipy.stats as ss

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import median

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import tree
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [3]:
# Directories used by the notebook: raw input files, normalized data, and
# the location where generated solution files are written.
# NOTE(review): these are absolute, user-specific paths; they must be edited
# before the notebook can run on another machine.
data_dir = "/home/chsu6/scout/Data"
normalized_data_dir = "/home/chsu6/scout/NormalizedData"
solution_dir = "/home/chsu6/scout/solution"


# Names of all entries in data_dir (os.listdir returns them in arbitrary order).
file_list = os.listdir(data_dir)

In [4]:
def score_mape(y, y_predicted):
    """Return the mean absolute percentage error as a fraction (not x100).

    Parameters
    ----------
    y : array-like
        True target values.
    y_predicted : array-like
        Predicted values, compared to ``y`` position by position.

    Returns
    -------
    float
        mean(|y - y_predicted| / (|y| + 1e-6)).
    """
    y_true = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_predicted, dtype=float)
    # Divide by |y| so negative targets contribute a positive error instead
    # of cancelling out; the small constant keeps the ratio finite when a
    # target is exactly zero.
    return np.mean(np.abs(y_true - y_pred) / (np.abs(y_true) + 0.000001))

def get_score_function(name):
    """Look up a scoring function by its short name.

    Parameters
    ----------
    name : str
        ``"mse"`` for ``mean_squared_error`` or ``"mape"`` for ``score_mape``.

    Returns
    -------
    callable
        A function taking ``(y_true, y_predicted)`` and returning a score.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported metric names.
    """
    if name == "mse":
        return mean_squared_error
    elif name == "mape":
        return score_mape
    # Fail here rather than returning None, which would only surface later
    # as a confusing "'NoneType' object is not callable".
    raise ValueError(
        "unknown score function {!r}; expected 'mse' or 'mape'".format(name)
    )

def random_strategy(select_percentage, df_candidate, df_test, score_func=get_score_function("mse")):
    """Train a decision tree on a random subset of candidates and score it.

    The last column of both frames is used as the regression target and all
    preceding columns as features.

    Parameters
    ----------
    select_percentage : float
        Fraction of ``df_candidate`` rows to sample for training. At least
        5 rows are requested, capped at the number of rows available.
    df_candidate : pandas.DataFrame
        Pool of rows the training sample is drawn from.
    df_test : pandas.DataFrame
        Rows the fitted model is evaluated on.
    score_func : callable
        Called as ``score_func(y_true, y_predicted)``.

    Returns
    -------
    tuple
        ``(score, select_index)`` where ``select_index`` is the list of
        ``df_candidate`` index labels used for training.
    """
    candidate_labels = list(df_candidate.index)
    # random.sample raises if asked for more items than exist, so cap the
    # "at least 5" floor at the size of the candidate pool.
    sample_size = min(
        len(candidate_labels),
        max(5, int(select_percentage * len(candidate_labels))),
    )
    select_index = random.sample(candidate_labels, sample_size)

    # .loc selects rows by index label; .iloc selects columns by position
    # (the removed .ix indexer mixed both behaviours).
    df_training = df_candidate.loc[select_index, :]
    clf = tree.DecisionTreeRegressor()
    clf.fit(df_training.iloc[:, :-1], df_training.iloc[:, -1])
    test_predicted = clf.predict(df_test.iloc[:, :-1])
    return (score_func(df_test.iloc[:, -1], test_predicted), select_index)

def brute_force_random_strategy(select_percentage, df_candidate, df_test, num_iter=1000, score_name="mse"):
    """Repeat ``random_strategy`` ``num_iter`` times and collect every outcome.

    Returns a tuple ``(score_records, index_records)``: a list with the score
    of each run, and a dict mapping the run number (0-based) to the list of
    index labels that run trained on.
    """
    scorer = get_score_function(score_name)
    outcomes = [
        random_strategy(select_percentage, df_candidate, df_test, score_func=scorer)
        for _ in range(num_iter)
    ]
    run_scores = [run_score for run_score, _ in outcomes]
    run_indices = dict(enumerate(chosen for _, chosen in outcomes))
    return (run_scores, run_indices)
    
def create_filtered_df(file_path):
    """Read a CSV file and drop every row whose last column equals 0.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, keeping their original index labels.
    """
    df = pd.read_csv(file_path)
    # .iloc addresses the last column by position (replaces the removed .ix).
    return df[df.iloc[:, -1] != 0]

In [10]:
# Experiment settings shared by every input file.
score_name = "mape"
output_dir = os.path.join(solution_dir, "test1")
candidate_percentage = 0.6   # fraction of rows placed in the candidate pool
select_percentage = 0.1      # fraction of the candidate pool sampled per run
num_iter = 10000             # number of random training samples per file
num_top_score = 10           # how many best / worst runs are written out

for f in sorted(file_list):

    print("[{}]".format(f))

    # initialize setting: one output directory per input file
    run_output_dir = os.path.join(output_dir, f.split(".")[0])
    original_file = os.path.join(run_output_dir, "full.csv")
    candidate_file = os.path.join(run_output_dir, "candidate.csv")
    testing_file = os.path.join(run_output_dir, "testing.csv")
    statistics_file = os.path.join(run_output_dir, "statistics.txt")
    os.makedirs(run_output_dir, exist_ok=True)

    # initialize data: random candidate / test split of the filtered rows
    df = create_filtered_df(os.path.join(data_dir, f))
    index_candidate = random.sample(list(df.index), int(candidate_percentage*len(df.index)))
    # Set membership keeps the split linear instead of quadratic in the row count.
    candidate_lookup = set(index_candidate)
    index_test = [row for row in df.index if row not in candidate_lookup]
    assert len(index_candidate) + len(index_test) == len(df)
    # .loc selects by index label (replaces the removed .ix indexer).
    df_candidate = df.loc[index_candidate, :]
    df_test = df.loc[index_test, :]
    df_candidate.to_csv(candidate_file, index=False)
    df_test.to_csv(testing_file, index=False)

    # random sampling
    (score_records, index_records) = brute_force_random_strategy(select_percentage, df_candidate, df_test, num_iter=num_iter, score_name=score_name)
    # Ordinal ranks are unique even for tied scores, so two runs can never
    # map to the same output file name and overwrite each other.
    score_ranks = ss.rankdata(score_records, method="ordinal")
    # A stable sort breaks ties by run order, matching the ordinal ranks, and
    # (unlike argpartition) also works when there are fewer runs than
    # num_top_score.
    score_order = np.argsort(score_records, kind="stable")
    best_n_index = score_order[:num_top_score]
    worst_n_index = score_order[-num_top_score:]

    # best cases
    for record_id in best_n_index:
        best_index = score_ranks[record_id]
        output = os.path.join(run_output_dir, "best_{}_{:.4f}.csv".format(int(best_index), score_records[record_id]))
        df.loc[index_records[record_id], :].to_csv(output, index=False)

    # worst cases (numbered so that 1 is the single worst run)
    for record_id in worst_n_index:
        worst_index = score_ranks[record_id]
        output = os.path.join(run_output_dir, "worst_{}_{:.4f}.csv".format(len(score_ranks)-int(worst_index)+1, score_records[record_id]))
        df.loc[index_records[record_id], :].to_csv(output, index=False)

    # write files
    df.to_csv(original_file, index=False)

    # statistics: settings, summary values, then every score in ascending order
    with open(statistics_file, "w") as output_file:
        output_file.write("total_points={}\n".format(len(df)))
        output_file.write("candidate_points={}\n".format(len(index_candidate)))
        output_file.write("testing_points={}\n".format(len(index_test)))
        output_file.write("candidate_percentage={}\n".format(candidate_percentage))
        output_file.write("select_percentage={}\n".format(select_percentage))
        output_file.write("num_iter={}\n".format(num_iter))
        output_file.write("\n")

        output_file.write("MIN={:.4f}\n".format(min(score_records)))
        output_file.write("MAX={:.4f}\n".format(max(score_records)))
        output_file.write("MEAN={:.4f}\n".format(np.mean(score_records)))
        output_file.write("MEDIAN={:.4f}\n".format(np.median(score_records)))
        output_file.write("STD={:.4f}\n".format(np.std(score_records)))
        output_file.write("\n")
        for score in sorted(score_records):
            output_file.write("{:.4f}".format(score))
            output_file.write("\n")

    print("\t* The min {}={:.4f}".format(score_name.upper(), min(score_records)))
    print("\t* The max {}={:.4f}".format(score_name.upper(), max(score_records)))
    print("\t* The mean {}={:.4f}".format(score_name.upper(), np.mean(score_records)))
    print("\t* The median {}={:.4f}".format(score_name.upper(), np.median(score_records)))
    print("\t* The std {}={:.4f}".format(score_name.upper(), np.std(score_records)))

In [9]:
# Display the current local date formatted as YYYYMMDD; the value is shown
# as the cell output and is not stored in a variable.
datetime.datetime.now().strftime("%Y%m%d")

'20160324'