# Jupyter and Data Prep Methods
Author: Hsueh-Hung Cheng (AndrewID: hsuehhuc)


## Environment
- Python 2.7
- NumPy is not required

## Modules
Import modules required in this assignment

In [38]:
import collections
import datetime
import math
import time
import os
from os.path import join
from IPython.lib import kernel

## Configuration
Please update the configuration below if necessary. **The one setting you must change is the path to the input folder (the variable `FXDATA_PATH`).**

In [39]:
# Configuration
# Root folder that holds one sub-folder of CSV files per currency.
# Set the FXDATA_PATH environment variable to point at your copy of the data;
# the literal below is only the fallback used when that variable is not set.
FXDATA_PATH = os.environ.get("FXDATA_PATH", "/Users/xuehung/Downloads/FXData2014")
# Calendar year covered by the output files
YEAR = 2014
# Used as a suffix in every output file name
ANDREW_ID = "hsuehhuc"
# Names understood by interpolate(); one output file is written per method
INTERPOLATION_METHOD = ["linear", "cosin", "cubic"]
# Output files are written to the directory the notebook is run from
OUTPUT_PATH = os.getcwd()

# Global variables
# Reference instant: matrix keys are whole minutes elapsed since this moment
EPOCH = datetime.datetime(1970,1,1)

## Data Synthesis / Shaping Section
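The code below reads the input CSV files of each currency folder and aggregates the bid prices of every minute into a `[max, min, close]` triple.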

In [40]:
# Process the data in a single file
def process_month(input_filepath, matrix):
    """Fold the ticks of one CSV file into per-minute bid statistics.

    Every line is split on commas into four fields: currency, timestamp, bid
    and ask. Only the bid is used. The timestamp must consist of a date whose
    first eight characters are YYYYMMDD, whitespace, and a time written as
    three colon-separated parts; only the hour and minute are used.

    Args:
        input_filepath: path of the CSV file to read.
        matrix: dict updated in place. Keys are whole minutes elapsed since
            EPOCH; values are [max bid, min bid, last bid read] for that minute.

    Lines that cannot be parsed are reported on stdout and skipped. Any other
    kind of error (for example an I/O failure) is allowed to propagate instead
    of being reported as bad data.
    """
    with open(input_filepath) as fin:
        for line in fin:
            try:
                # Only use bid
                currency, timestamp, bid, ask = line.strip().split(',')
                bid = float(bid)
                date, clock = timestamp.strip().split()
                hour, minute, second = clock.split(":")
                moment = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8]), int(hour), int(minute))
            except ValueError as e:
                # Wrong field count, non-numeric text and impossible dates
                # all surface as ValueError.
                print("Invalid inpute data: %s (%s)" % (line, str(e)))
                continue

            key = int((moment - EPOCH).total_seconds() / 60)
            stats = matrix.get(key)
            if stats is None:
                # First tick of this minute is max, min and close at once
                matrix[key] = [bid, bid, bid]
            else:
                stats[0] = max(stats[0], bid)
                stats[1] = min(stats[1], bid)
                # Lines are read in file order, so the last one wins as close
                stats[2] = bid
    return

# Process all files given the currency
def process_currency(currency, input_folder_path):
    """Aggregate every CSV file of one currency folder into a single matrix.

    Args:
        currency: name of the currency (not used by the aggregation itself).
        input_folder_path: folder whose ".csv" files are read in sorted
            file-name order.

    Returns:
        dict keyed by time-range start whose values are [max, min, close].
    """
    csv_names = []
    for entry in os.listdir(input_folder_path):
        if entry.endswith(".csv"):
            csv_names.append(entry)
    csv_names.sort()

    # One shared dict is filled file by file
    aggregated = {}
    for csv_name in csv_names:
        print("Processing file %s" % csv_name)
        process_month(join(input_folder_path, csv_name), aggregated)
    return aggregated

currency_list = [] # The currencies to process
matrix_list = [] # The per-currency statistics, parallel to currency_list

# Get all directories in the input folder. os.listdir returns names in
# arbitrary order, so sort them to make the selection below reproducible.
folders = sorted(f for f in os.listdir(FXDATA_PATH) if os.path.isdir(join(FXDATA_PATH, f)))

# Number of currency folders to process, taken from the start of the sorted
# list. Set to None to process every folder.
CURRENCY_LIMIT = 1
folders = folders[:CURRENCY_LIMIT]

for currency in folders:
    print("=== Processing currencies %s" % currency)
    matrix = process_currency(currency, join(FXDATA_PATH, currency))
    matrix_list.append(matrix)
    currency_list.append(currency)

=== Processing currencies AUDJPY
Processing file AUDJPY-2014-01.csv


## Data Description Section
The function below computes the descriptive statistics and returns all of the numbers in a single flat list.

In [41]:
def get_data_description(matrix):
    """Compute descriptive statistics for each of the three price columns.

    Args:
        matrix: dict whose values are 3-element sequences of strictly
            positive numbers.

    Returns:
        A flat list of 21 floats made of seven groups of three (one value per
        column, in the column order of the rows): mean, geometric mean,
        harmonic mean, mode, median, mean again, and population standard
        deviation.

    Raises:
        ValueError: if matrix is empty, or if a value is not strictly
            positive (the geometric mean needs its logarithm).
    """
    n = len(matrix)
    if n == 0:
        raise ValueError("matrix must contain at least one row")

    rows = list(matrix.values())
    mean, g_mean, h_mean, mode, median, sigma = [], [], [], [], [], []
    for i in range(3):
        column = sorted(float(row[i]) for row in rows)

        mu = math.fsum(column) / n
        mean.append(mu)
        # Averaging logarithms avoids the rounding error of multiplying n
        # separate n-th roots together.
        g_mean.append(math.exp(math.fsum(math.log(x) for x in column) / n))
        h_mean.append(n / math.fsum(1.0 / x for x in column))

        mode.append(collections.Counter(column).most_common(1)[0][0])
        mid = n // 2
        if n % 2 == 1:
            median.append(column[mid])
        else:
            median.append((column[mid] + column[mid - 1]) / 2)

        # Two-pass variance: summing squared deviations can never go negative,
        # unlike E[x^2] - mean^2, which cancels badly for near-constant data.
        variance = math.fsum((x - mu) ** 2 for x in column) / n
        sigma.append(math.sqrt(variance))

    # The mean appears twice on purpose: once as "Mean" and once as "Mu"
    return mean + g_mean + h_mean + mode + median + mean + sigma

In [42]:
# Write one tab-separated row of statistics per processed currency
data_description_filepath = join(OUTPUT_PATH, "data_description_%s.csv" % ANDREW_ID)
with open(data_description_filepath, "w") as fout:
    title = ["Mean", "GeometricMean", "HarmonicMean", "Mode", "Median", "Mu", "Sigma"]
    # The matrix rows are ordered [max, min, close], so every group of three
    # statistics is High first, then Low, then Close.
    price = ["High", "Low", "Close"]
    header = ["Currency"] + ["%s(%s)" % (t, p) for t in title for p in price]
    fout.write("\t".join(header) + "\n")
    for i in range(len(currency_list)):
        data = get_data_description(matrix_list[i])
        fout.write(currency_list[i] + "\t")
        fout.write("\t".join(map(str, data)) + "\n")
print("Data Description_ has been written to %s" % data_description_filepath)

Data Description_ has been written to /Users/xuehung/xuehng/cmu/11675/hw2/data_description_hsuehhuc.csv


## Data Interpolation Section
First, the three interpolation functions and the routine that applies them are defined.

In [43]:
def linear_interpolate(y1, y2, mu):
    """Blend y1 and y2 along a straight line.

    mu is the position between the two points: 0 gives y1 and 1 gives y2.
    """
    weight = float(mu)
    from_start = y1 * (1 - weight)
    from_end = y2 * weight
    return from_start + from_end

def cosin_interpolate(y1, y2, mu):
    """Blend y1 and y2 along a half cosine wave.

    mu is the position between the two points: 0 gives y1 and 1 gives y2.
    The position is first remapped through (1 - cos(mu * pi)) / 2 and the
    remapped weight is then used for a linear blend.
    """
    position = float(mu)
    eased = (1 - math.cos(position * math.pi)) / 2
    return y1 * (1 - eased) + y2 * eased

def cubic_interpolate(y0, y1, y2, y3, mu):
    """Evaluate a cubic through y1 (mu = 0) and y2 (mu = 1).

    y0 and y3 are the samples before y1 and after y2; they only influence
    the shape of the curve between the two inner points.
    """
    pos = float(mu)
    pos_sq = pos * pos
    # Polynomial coefficients, highest power first
    c3 = y3 - y2 - y0 + y1
    c2 = y0 - y1 - c3
    c1 = y2 - y0
    c0 = y1
    return c3 * pos * pos_sq + c2 * pos_sq + c1 * pos + c0

def interpolate(method, _matrix):
    """Fill every missing minute between consecutive keys of a matrix.

    Args:
        method: "linear", "cosin" or "cubic".
        _matrix: dict mapping integer minutes to 3-element value lists.
            It is not modified.

    Returns:
        A new dict holding the original entries plus one interpolated
        3-element list for every integer strictly between two consecutive
        original keys.

    Raises:
        ValueError: if method is not one of the three known names.

    For "cubic" the samples before and after the gap act as outer control
    points. When the gap touches the first or last sample, the missing outer
    point is replaced by the nearest edge of the gap, so edge gaps get real
    interpolated values instead of being left as zeros.
    """
    two_point_funcs = {"linear": linear_interpolate, "cosin": cosin_interpolate}
    use_cubic = method == "cubic"
    if not use_cubic and method not in two_point_funcs:
        raise ValueError("Unknown Interpolation Method")

    # Shallow copy: only new keys are added, existing rows are never mutated
    matrix = _matrix.copy()

    keys = sorted(matrix.keys())
    last = len(keys) - 1

    # Iterate over all pairs of consecutive timestamps
    for i in range(last):
        t1 = keys[i]
        t2 = keys[i + 1]
        if t2 - t1 == 1:
            continue

        left = matrix[t1]
        right = matrix[t2]
        if use_cubic:
            # Duplicate the edge sample when there is no sample further out
            before = matrix[keys[i - 1]] if i > 0 else left
            after = matrix[keys[i + 2]] if i + 2 <= last else right
        else:
            blend = two_point_funcs[method]

        span = float(t2 - t1)
        for t in range(t1 + 1, t2):
            mu = (t - t1) / span
            if use_cubic:
                matrix[t] = [cubic_interpolate(before[j], left[j], right[j], after[j], mu)
                             for j in range(3)]
            else:
                matrix[t] = [blend(left[j], right[j], mu) for j in range(3)]
    return matrix

Second, the interpolation functions above are applied to the data, and the output is written to one file per interpolation method.

In [44]:
# Run each interpolation method and write one tab-separated file per method
for interpolate_method in INTERPOLATION_METHOD:
    print("Interpolate data with %s interpolation" % interpolate_method)
    # Interpolate each currency's own matrix; the result stays parallel to
    # currency_list.
    interpolated_matrix = [interpolate(interpolate_method, currency_matrix)
                           for currency_matrix in matrix_list]
    # Output to files
    output_file_path = join(OUTPUT_PATH, "%s_result_%s.csv" % (interpolate_method, ANDREW_ID))
    with open(output_file_path, "w") as fout:
        # Find the first and last minutes of YEAR, counted from EPOCH
        start_min = int((datetime.datetime(YEAR, 1, 1, 0, 0) - EPOCH).total_seconds() / 60)
        end_min = int((datetime.datetime(YEAR, 12, 31, 23, 59) - EPOCH).total_seconds() / 60)
        fout.write("Time\t")
        for currency in currency_list:
            # Matrix rows are ordered [max, min, close]; the labels follow that order
            fout.write("Max(%s)\tMin(%s)\tFinal(%s)\t" % (currency, currency, currency))
        fout.write("\n")
        for minute in range(start_min, end_min + 1):
            timestamp = datetime.datetime.utcfromtimestamp(minute * 60).strftime('%Y-%m-%d %H:%M')
            fout.write("%s\t" % timestamp)
            # For each currency
            for i in range(len(currency_list)):
                data = interpolated_matrix[i].get(minute)
                if data is not None:
                    fout.write("%f\t%f\t%f\t" % (data[0], data[1], data[2]))
                else:
                    # Minutes outside the observed range were never interpolated
                    fout.write("N/A\tN/A\tN/A\t")
            fout.write("\n")
    print("Output has been written to %s" % output_file_path)

Interpolate data with linear interpolation
Output has been written to /Users/xuehung/xuehng/cmu/11675/hw2/linear_result_hsuehhuc.csv
Interpolate data with cosin interpolation
Output has been written to /Users/xuehung/xuehng/cmu/11675/hw2/cosin_result_hsuehhuc.csv
Interpolate data with cubic interpolation
Output has been written to /Users/xuehung/xuehng/cmu/11675/hw2/cubic_result_hsuehhuc.csv
