In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import random

In [None]:
# Constants
FEATURE_PER_DAY = 18   # rows in the CSV per day (one row per measurement)
WINDOW_SIZE = 9        # consecutive hours of every measurement fed to the model
# Derived rather than hard-coded so it cannot drift from the two values above.
FEATURE_NUM = FEATURE_PER_DAY * WINDOW_SIZE
DAY_PER_MONTH = 20     # presumably days of data per month in the training CSV -- confirm
VALIDATION_SET = True  # hold part of the training samples out for validation

In [None]:
def rmse(y_hypo, y_hat):
    """Return the root-mean-square error between two sets of values.

    Both arguments are converted to flat float arrays before they are
    compared, so plain lists are accepted and a column vector of shape
    (n, 1) is compared element-wise with a flat vector of shape (n,)
    instead of being broadcast into an (n, n) matrix.

    Args:
        y_hypo: array-like of values.
        y_hat: array-like of values to compare against ``y_hypo``.

    Returns:
        The square root of the mean squared difference (numpy float).

    Raises:
        ValueError: if the flattened inputs cannot be broadcast together.
    """
    diff = np.asarray(y_hat, dtype=float).ravel() - np.asarray(y_hypo, dtype=float).ravel()
    return np.sqrt(np.mean(diff ** 2))

In [None]:
# Load the Big5-encoded training CSV.
df = pd.read_csv("ml-2018spring-hw1/train.csv", encoding="big5")

# Row 10 of every FEATURE_PER_DAY-row group is the RAINFALL series. Coerce its
# columns from index 3 onward to numbers; anything non-numeric becomes NaN.
rainfall_rows = range(10, len(df), FEATURE_PER_DAY)
for rainfall_row in rainfall_rows:
    hourly_values = df.iloc[rainfall_row, 3:]
    df.iloc[rainfall_row, 3:] = pd.to_numeric(hourly_values, errors="coerce")

# Every NaN in the frame is replaced by 0.
df.fillna(0, inplace=True)

In [None]:
# Data preprocessing: regroup the table into one continuous series per
# measurement. data[k] collects, in file order, columns 3..26 of every row
# whose position within its FEATURE_PER_DAY-row group is k.
data = [[] for _ in range(FEATURE_PER_DAY)]

# The row position (not the index label) selects the series, so this keeps
# working if df does not carry a default RangeIndex. The cells are read with
# .iloc because integer keys on a string-labelled Series are deprecated.
for position, (_, row) in enumerate(df.iterrows()):
    data[position % FEATURE_PER_DAY].extend(row.iloc[3:27].tolist())

In [None]:
# Training samples: feature vectors and their labels. Windows are taken from
# the concatenated days so that more training data is available.
x_data, y_data = [], []

# Held-out validation samples (features / labels)
x_v_data, y_v_data = [], []

def validation(data):
    """Return True unless some element of ``data`` converts to a float <= 0.

    Each element is passed through ``float()``; the first value that is zero
    or negative makes the result False. An empty sequence yields True.
    """
    return not any(float(value) <= 0 for value in data)

# Slide a WINDOW_SIZE-hour window over the hourly series. A sample is the
# window of every measurement, flattened measurement by measurement; its label
# is the TARGET_ROW value of the hour right after the window.
HOURS_PER_MONTH = 24 * DAY_PER_MONTH  # 24 hourly columns per day
TARGET_ROW = 9  # series used as the label (presumably PM2.5 -- confirm against the CSV)
SPLIT_SEED = 0  # fixed seed so the train/validation split is identical on every run
split_rng = random.Random(SPLIT_SEED)


def _with_bias_column(samples):
    """Stack samples into a 2-D float matrix and prepend a column of ones."""
    # The reshape keeps an empty sample list 2-D so the concatenate below works.
    matrix = np.array(samples, dtype=float).reshape(-1, FEATURE_PER_DAY * WINDOW_SIZE)
    return np.concatenate((np.ones((matrix.shape[0], 1)), matrix), axis=1)


for i in range(len(data[0])):
    # The label hour (i + WINDOW_SIZE) must not cross a HOURS_PER_MONTH boundary.
    fits_in_month = i % HOURS_PER_MONTH + WINDOW_SIZE < HOURS_PER_MONTH
    # Skip windows whose target series (window + label) has a value <= 0.
    if fits_in_month and validation(data[TARGET_ROW][i:i+WINDOW_SIZE+1]):
        vec = [
            float(element)
            for j in range(FEATURE_PER_DAY)
            for element in data[j][i:i+WINDOW_SIZE]
        ]
        label = float(data[TARGET_ROW][i+WINDOW_SIZE])

        # Roughly one sample in ten is held out when validation is enabled.
        if VALIDATION_SET and split_rng.randint(1, 10) == 10:
            x_v_data.append(vec)
            y_v_data.append(label)
        else:
            x_data.append(vec)
            y_data.append(label)

x_data = _with_bias_column(x_data)
y_data = np.array(y_data)

if VALIDATION_SET:
    x_v_data = _with_bias_column(x_v_data)
    y_v_data = np.array(y_v_data)

In [None]:
# Feature scaling
def feature_scaling(x_data):
    """Standardize ``x_data`` along axis 0 (zero mean, unit standard deviation).

    Args:
        x_data: array-like of numbers; statistics are computed along axis 0.

    Returns:
        numpy.ndarray of floats: the input with the axis-0 mean subtracted and
        the result divided by its axis-0 standard deviation. Entries whose
        standard deviation is zero (for example a constant column of ones)
        come out as 0, and any remaining NaN/inf is replaced by
        ``np.nan_to_num``.
    """
    x_data = np.asarray(x_data, dtype=float)
    centered = x_data - np.mean(x_data, axis=0)
    std = np.std(centered, axis=0)
    # Divide only where std is non-zero. This avoids the 0/0 RuntimeWarning
    # without touching numpy's global error state; masked entries stay 0.
    scaled = np.divide(centered, std, out=np.zeros_like(centered), where=std != 0)
    return np.nan_to_num(scaled)

In [None]:
# x_data = feature_scaling(x_data)
# x_v_data = feature_scaling(x_v_data)

In [None]:
# Try with DNN
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.datasets import mnist

In [None]:
# A single linear unit over FEATURE_NUM inputs plus the leading column of ones.
model = Sequential([
    Dense(units=1, activation="linear", input_dim=FEATURE_NUM + 1),
])
# model.add(Dense(units=1, activation="relu"))

# Minimize mean-squared error with the Adam optimizer.
model.compile(optimizer='adam', loss="mse")

# Fit on the training samples in mini-batches of 100 for 2000 epochs.
model.fit(x_data, y_data, epochs=2000, batch_size=100)

In [None]:
# Loss of the fitted model on the data it was trained on. The model is
# compiled with loss="mse" and no extra metrics, so this value is a
# mean-squared error on the training set, not a test accuracy.
result = model.evaluate(x_data, y_data)
print("Train MSE:", result)

In [None]:
# RMSE of the model on the held-out validation samples. With VALIDATION_SET
# disabled, or when no sample was drawn for validation, x_v_data is empty and
# there is nothing to predict on, so the step is skipped instead of crashing.
if VALIDATION_SET and len(x_v_data) > 0:
    y_hypo = model.predict(x_v_data).flatten()
    print(rmse(y_v_data, y_hypo))
else:
    print("Validation skipped: no validation samples available")

In [None]:
# Load the header-less, Big5-encoded test CSV and build one feature vector per
# FEATURE_PER_DAY-row group.
dt = pd.read_csv("ml-2018spring-hw1/test.csv", header=None, encoding="big5")

# Row 10 of every group is coerced to numbers from column 2 onward (values
# that are not numeric become NaN), then every NaN in the frame is set to 0.
for rainfall_row in range(10, len(dt), FEATURE_PER_DAY):
    hourly_values = dt.iloc[rainfall_row, 2:]
    dt.iloc[rainfall_row, 2:] = pd.to_numeric(hourly_values, errors="coerce")

dt.fillna(0, inplace=True)

x_test = []
y_test = []
TEST_ROW_SIZE, TEST_COLUMN_SIZE = dt.shape
first_window_column = TEST_COLUMN_SIZE - WINDOW_SIZE

for i in range(0, TEST_ROW_SIZE, FEATURE_PER_DAY):
    # Last WINDOW_SIZE columns of this group, as a float matrix
    group = dt.iloc[i:i+FEATURE_PER_DAY, first_window_column:TEST_COLUMN_SIZE]
    # np.insert without an axis flattens the matrix row by row and puts the
    # constant 1 (x0 for bias) in front.
    arr = np.insert(np.array(group, dtype=float), 0, 1)
    x_test.append(arr)

In [None]:
# Predict on the assembled test vectors and write the results to a CSV file.
x_test = np.array(x_test)
y_test_nn = model.predict(x_test).flatten()

# Negative predictions are replaced by 0.
y_test_nn[y_test_nn < 0] = 0

# One row per prediction: "id_<position>" and the predicted value.
arr = [["id_" + str(position), prediction] for position, prediction in enumerate(y_test_nn)]
dw = pd.DataFrame(arr, columns=["id", "value"])
dw.to_csv("outputnn.csv", index=False)