In [145]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import random

In [146]:
# Constants
FEATURE_PER_DAY = 18  # number of CSV rows that make up one day (used as the row stride below)
WINDOW_SIZE = 9       # number of consecutive hours fed to the model as input
# Derived, so it cannot drift out of sync with the two values above (18 * 9 = 162).
FEATURE_NUM = FEATURE_PER_DAY * WINDOW_SIZE
DAY_PER_MONTH = 20    # NOTE(review): presumably days of data per month (20 * 24 = 480 hours) - confirm

In [147]:
# Read training data
df = pd.read_csv("competitions/ml-2018spring-hw1/train.csv", encoding="big5")

# Transform RAINFALL column to number:
# every FEATURE_PER_DAY-th row starting at row 10 is coerced to numeric,
# turning anything unparsable into NaN.
rainfall_rows = range(10, len(df), FEATURE_PER_DAY)
for rain_row in rainfall_rows:
    hourly = df.iloc[rain_row, 3:]
    df.iloc[rain_row, 3:] = pd.to_numeric(hourly, errors="coerce")

# Replace every NaN (including the ones produced by the coercion above) with 0.
df.fillna(0, inplace=True)

In [148]:
# Data preprocessing
# Regroup the frame into FEATURE_PER_DAY lists: data[k] collects, in row order,
# the values of columns 3..26 from every row whose position is k modulo
# FEATURE_PER_DAY, i.e. one long hourly series per feature.
data = [[] for _ in range(FEATURE_PER_DAY)]

# Positional slicing (.iloc) replaces the original `row[item]` lookups, which
# relied on the deprecated "integer key means position" fallback of
# Series.__getitem__. enumerate() gives the row position directly, so the
# grouping no longer depends on the frame having a default RangeIndex.
hourly_values = df.iloc[:, 3:27].to_numpy()
for row_pos, hours in enumerate(hourly_values):
    data[row_pos % FEATURE_PER_DAY].extend(hours)

In [149]:
# Store features and labels, concat days to get more training data
x_data, y_data = [], []

# Set validation set (held-out samples and their labels)
x_v_data, y_v_data = [], []

def validation(data):
    """Tell whether a sequence contains no value that is zero or negative.

    Each element is converted with ``float()`` and compared against 0.
    Scanning stops at the first element that is ``<= 0``.

    Args:
        data: iterable of values accepted by ``float()``.

    Returns:
        False if any element converts to a float ``<= 0``; True otherwise
        (which includes an empty iterable).
    """
    return not any(float(element) <= 0 for element in data)

# Build (window -> next hour) samples from the concatenated hourly series and
# split them roughly 90/10 into training and validation sets.
TARGET_ROW = 9                        # row of `data` used as the label; NOTE(review): presumably PM2.5 - confirm
HOURS_PER_MONTH = DAY_PER_MONTH * 24  # 480, the literal the original loop used as segment length
VALIDATION_SEED = 0                   # fixed seed so the split (and the reported RMSE) is reproducible

# A private generator keeps the split deterministic without reseeding the
# global `random` module state that other cells may rely on.
split_rng = random.Random(VALIDATION_SEED)

train_x, train_y = [], []
val_x, val_y = [], []

for i in range(len(data[0])):
    # The window plus its label must stay inside one HOURS_PER_MONTH segment.
    if i % HOURS_PER_MONTH + WINDOW_SIZE >= HOURS_PER_MONTH:
        continue
    # Skip samples whose target series has a non-positive value in the window or the label.
    if not validation(data[TARGET_ROW][i:i+WINDOW_SIZE+1]):
        continue

    vec = [1.0]  # x0 for bias
    for j in range(FEATURE_PER_DAY):
        vec.extend(float(element) for element in data[j][i:i+WINDOW_SIZE])
    label = float(data[TARGET_ROW][i+WINDOW_SIZE])

    # One draw in ten goes to the validation set.
    if split_rng.randint(1, 10) == 10:
        val_x.append(vec)
        val_y.append(label)
    else:
        train_x.append(vec)
        train_y.append(label)

x_data = np.array(train_x)
y_data = np.array(train_y)

x_v_data = np.array(val_x)
y_v_data = np.array(val_y)

In [150]:
# Store features and labels, one day 16 training data
# x_data = []
# y_data = []
# ROW_SIZE, COLUMN_SIZE = df.shape

# for i in range(0, len(df), FEATURE_PER_DAY):
#     j = 3
#     while float(df.iloc[i+9, j+WINDOW_SIZE]) > 0 and j + WINDOW_SIZE < COLUMN_SIZE - 1:
#         arr = np.array(df.iloc[i:i+FEATURE_PER_DAY, j:j+WINDOW_SIZE], dtype=float)
#         arr = np.insert(arr, 0, 1) # x0 for bias
#         x_data.append(arr)
#         y_data.append(np.array(df.iloc[i+9, j+WINDOW_SIZE], dtype=float))
#         j += 1
        
# x_data = np.array(x_data)
# y_data = np.array(y_data)

In [151]:
# Initial model
n_params = FEATURE_NUM + 1     # one weight per input feature plus the bias term
theta = np.zeros(n_params)     # all parameters init to 0
lr_ada = np.zeros(n_params)    # per-parameter running sum of squared gradients
lr = 10                        # base learning rate
iteration = 10000              # number of gradient steps

In [152]:
# Start at closed-form point
# theta_c = np.linalg.lstsq(x_data, y_data)
# theta = np.array(theta_c[0])

In [153]:
# Load initial parameters from npy file
# theta = np.load("theta.npy")

In [154]:
# Find the best function, using numpy matrix computation
# Full-batch gradient descent on the squared error with a per-parameter step
# of lr / sqrt(sum of squared past gradients).
# NOTE: theta and lr_ada are updated cumulatively; re-running this cell
# continues from the current values instead of starting over.
from tqdm import tqdm
x_data_t = x_data.transpose()

for _ in tqdm(range(iteration)):
    residual = y_data - np.dot(x_data, theta)
    grad = np.dot(x_data_t, residual) * (-2.0)
    lr_ada += grad ** 2

    # lr_ada is exactly 0 only where every gradient so far was 0. The original
    # lr / sqrt(0) * 0 produced NaN there and poisoned theta permanently.
    # Using 1 as the denominator for those coordinates yields a zero step and
    # leaves all other coordinates numerically unchanged.
    denom = np.sqrt(lr_ada)
    denom[denom == 0] = 1.0
    theta = theta - lr / denom * grad

100%|██████████| 10000/10000 [00:05<00:00, 1714.82it/s]


In [155]:
# Verify model by compute its error
def rmse(x_data, y_data, theta):
    """Root-mean-square error of the linear prediction ``x_data . theta``.

    Args:
        x_data: sample matrix, one row per sample (``len(x_data)`` is the divisor).
        y_data: target value for each row of ``x_data``.
        theta: weight vector applied to every row.

    Returns:
        Square root of the mean squared difference between prediction and target.
    """
    residual = np.dot(x_data, theta) - y_data
    mean_squared = np.sum(residual ** 2) / len(x_data)
    return np.sqrt(mean_squared)

In [156]:
# Save model to npy file
# np.save("theta", theta)

In [157]:
# Compute test value
dt = pd.read_csv("competitions/ml-2018spring-hw1/test.csv", header=None, encoding="big5")

# Same coercion as for the training frame: every FEATURE_PER_DAY-th row from
# row 10 is made numeric (unparsable entries become NaN), here from column 2 on.
for rain_row in range(10, len(dt), FEATURE_PER_DAY):
    dt.iloc[rain_row, 2:] = pd.to_numeric(dt.iloc[rain_row, 2:], errors="coerce")

dt.fillna(0, inplace=True)

TEST_ROW_SIZE, TEST_COLUMN_SIZE = dt.shape
first_window_col = TEST_COLUMN_SIZE - WINDOW_SIZE  # the last WINDOW_SIZE columns hold the inputs

# One feature vector per block of FEATURE_PER_DAY rows.
x_test = []
for block_start in range(0, len(dt), FEATURE_PER_DAY):
    block = dt.iloc[block_start:block_start+FEATURE_PER_DAY, first_window_col:TEST_COLUMN_SIZE]
    features = np.array(block, dtype=float)
    # np.insert without an axis flattens the block row by row, then prepends x0 = 1 for the bias.
    x_test.append(np.insert(features, 0, 1))

# Predict each sample with the trained weights.
y_test = [np.dot(theta, sample) for sample in x_test]

# Replace negative value
y_test = [0 if value < 0 else value for value in y_test]

# Write predictions as (id, value) rows, ids numbered id_0, id_1, ...
arr = [["id_" + str(sample_no), value] for sample_no, value in enumerate(y_test)]
dw = pd.DataFrame(arr, columns = ["id", "value"])
dw.to_csv("output.csv", index=False)

In [158]:
# closed-form solution
# Least-squares fit of y_data on x_data. rcond=None selects NumPy's current
# default cutoff for small singular values explicitly, which removes the
# FutureWarning raised when rcond is omitted and keeps the result independent
# of the installed NumPy version. lstsq returns (solution, residuals, rank,
# singular values); only the solution is kept.
theta_c, *_ = np.linalg.lstsq(x_data, y_data, rcond=None)
theta_c = np.array(theta_c)

  


In [159]:
# For each split, print the RMSE of the gradient-descent weights (theta)
# followed by the RMSE of the closed-form weights (theta_c).
for heading, features, targets in (
    ("Training set:", x_data, y_data),
    ("Validation set:", x_v_data, y_v_data),
):
    print(heading)
    for weights in (theta, theta_c):
        print(rmse(features, targets, weights))

Training set:
12.608772914400182
12.449777159858373
Validation set:
5.6815592201032254
8.885413771256003
