# Homework 3: Classification, Regression, and Other Prediction Models

Round each DATETIME value to its nearest hour (e.g. `2017/10/16 14:37:52` -> `2017/10/16 15:00:00`).

```sql
-- Snap every observation timestamp to the nearest whole hour: shift it
-- forward by 30 minutes, then zero out the minutes and seconds.
-- Deliberately has no WHERE clause: every row of the table is rewritten.
UPDATE 逐時觀測 AS obs
SET obs.時間 = DATE_FORMAT(
  obs.時間 + INTERVAL 30 MINUTE,
  '%Y-%m-%d %H:00:00'
)
```

```sql
-- Snap every Power.updateTime to the nearest whole hour: shift it forward
-- by 30 minutes, then zero out the minutes and seconds.
-- Deliberately has no WHERE clause: every row of the table is rewritten.
UPDATE Power AS p
SET p.updateTime = DATE_FORMAT(
  p.updateTime + INTERVAL 30 MINUTE,
  '%Y-%m-%d %H:00:00'
)
```

A naive way to generate the dataset.

```sql
DROP PROCEDURE IF EXISTS dm.generate_dataset;
DELIMITER //
-- Builds one comma-separated record for every start hour in
-- [from_time, to_time] and inserts it into Records.
-- A record is 102 consecutive hourly values: 96 temperatures of station
-- 'BANQIAO,板橋' followed by 6 Power.northUsage values. If any of the 102
-- values is missing (no matching row, or a NULL value), the whole record is
-- skipped.
--   from_time: first start hour to try (inclusive)
--   to_time:   last start hour to try (inclusive)
CREATE PROCEDURE dm.generate_dataset(from_time DATETIME, to_time DATETIME)
BEGIN
  DECLARE value_index INT;
  DECLARE hourly_value DOUBLE;
  DECLARE output TEXT;
  DECLARE curr_time DATETIME;
  WHILE from_time <= to_time DO
    SET value_index = 0;
    SET output = '';
    SET curr_time = from_time;
    generate_record: WHILE value_index < 102 DO
      -- Reset first: SELECT ... INTO leaves the variable untouched when no
      -- row matches, so a stale value must not survive from the last hour.
      SET hourly_value = NULL;
      IF value_index < 96 THEN
        -- Positions 0..95: temperature history.
        SELECT 逐時觀測.溫度 INTO hourly_value FROM 逐時觀測
          WHERE (逐時觀測.時間 = curr_time) AND (逐時觀測.測站 = 'BANQIAO,板橋')
          LIMIT 1;
      ELSE
        -- Positions 96..101: power usage to predict.
        SELECT Power.northUsage INTO hourly_value FROM Power
          WHERE (Power.updateTime = curr_time)
          LIMIT 1;
      END IF;
      IF hourly_value IS NULL THEN
        -- An empty output marks the record as abandoned.
        SET output = '';
        LEAVE generate_record;
      END IF;
      -- Put commas only BETWEEN values. Appending ',' unconditionally made
      -- every stored record start with a stray comma (an empty first field).
      IF value_index = 0 THEN
        SET output = CAST(hourly_value AS CHAR);
      ELSE
        SET output = CONCAT(output, ',', CAST(hourly_value AS CHAR));
      END IF;
      SET value_index = value_index + 1;
      SET curr_time = DATE_ADD(curr_time, INTERVAL 1 HOUR);
    END WHILE;
    IF output <> '' THEN
      -- NOTE(review): Records is assumed to have exactly one text column;
      -- its definition is not visible here, so no column list is given.
      INSERT INTO Records VALUES (output);
    END IF;
    SET from_time = DATE_ADD(from_time, INTERVAL 1 HOUR);
  END WHILE;
END//
DELIMITER ;
-- Try every start hour from 2016-07-03 01:00:00 through 2017-07-04 00:00:00
-- (both bounds inclusive).
SET @from_time = CAST('2016-07-03 01:00:00' AS DATETIME);
SET @to_time = CAST('2017-07-04 00:00:00' AS DATETIME);
-- Schema-qualified like the CREATE PROCEDURE, so the call does not depend
-- on `dm` being the default database of the session.
CALL dm.generate_dataset(@from_time, @to_time);
```

This is very inefficient because every record costs up to 102 separate single-row queries. Try a better approach.

First, find duplicate record times.

```sql
-- List the timestamps that appear more than once for station
-- 'BANQIAO,板橋'.
SELECT obs.時間
FROM 逐時觀測 AS obs
WHERE obs.測站 = 'BANQIAO,板橋'
GROUP BY obs.時間
HAVING COUNT(*) >= 2
```

0 rows retrieved.

```sql
-- List every Power.updateTime that appears more than once, with its
-- number of occurrences.
SELECT
  Power.updateTime,
  COUNT(*) AS cnt
FROM Power
GROUP BY Power.updateTime
-- Filter on the aggregate itself; referencing a select alias in HAVING is
-- a MySQL extension.
HAVING COUNT(*) > 1
-- Explicit ordering: GROUP BY alone does not guarantee sorted output.
ORDER BY Power.updateTime
```

updateTime | cnt
--- | ---
2016-10-17 00:00:00 | 2
2017-05-20 08:00:00 | 3
2017-05-20 13:00:00 | 2
2017-05-30 22:00:00 | 2
2017-07-19 09:00:00 | 2
2017-07-24 01:00:00 | 2
2017-07-24 13:00:00 | 2
2017-08-21 20:00:00 | 2
2017-08-21 21:00:00 | 2

Remove them using the following query.

```sql
-- Deletes ONE surplus row per execution (LIMIT 1), so it must be re-run
-- until it reports 0 affected rows: 10 runs for the duplicates listed
-- above (the sum of cnt - 1).
-- There is no ORDER BY, so which of the duplicate rows survives is
-- arbitrary.
-- The aggregation is placed inside the derived table so that the derived
-- table is always materialized. A plain pass-through derived table can be
-- merged into the subquery, which re-exposes the restriction on selecting
-- from the table being deleted from.
DELETE FROM Power
WHERE Power.updateTime IN (
  SELECT dup.updateTime
  FROM (
    SELECT updateTime
    FROM Power
    GROUP BY updateTime
    HAVING COUNT(*) > 1
  ) AS dup
)
LIMIT 1
```

Store each record (96 hourly history values followed by 6 hourly prediction targets) in its own CSV file.  
Skip the whole record if any of its values is missing.

```sql
DROP PROCEDURE IF EXISTS dm.generate_dataset;
DELIMITER //
-- Exports one CSV file per complete record whose start hour lies in
-- [from_time, to_time].
-- A record starting at hour T consists of
--   * the temperatures of station 'BANQIAO,板橋' in [T, T + 4 days), and
--   * the Power.northUsage values in [T + 4 days, T + 4 days 6 hours),
-- written one value per line in time order. A start hour is skipped unless
-- exactly 96 non-NULL temperatures and 6 non-NULL usage values exist.
--   from_time: first start hour to try (inclusive)
--   to_time:   last start hour to try (inclusive)
-- Side effects: writes '<@path><start time>.csv' on the database server and
-- sets the session variables @path, @cnt1, @cnt2, @filename,
-- @record_start and @query_string.
-- NOTE(review): SELECT ... INTO OUTFILE does not overwrite an existing
-- file, so the output directory presumably has to be cleared before a
-- re-run. Confirm on the target server.
CREATE PROCEDURE dm.generate_dataset(from_time DATETIME, to_time DATETIME)
BEGIN
  DECLARE curr_time DATETIME DEFAULT from_time;
  SET @path = '/Users/ywpu/Repositories/data-mining/hw3/';
  generate_record: WHILE curr_time <= to_time DO
    -- COUNT(column) counts non-NULL values only, so a NULL reading makes
    -- the window incomplete just like a missing row does.
    SET @cnt1 = (
      SELECT COUNT(逐時觀測.溫度) FROM 逐時觀測
        WHERE (逐時觀測.時間 >= curr_time)
        AND (逐時觀測.時間 < DATE_ADD(curr_time, INTERVAL 4 DAY))
        AND (逐時觀測.測站 = 'BANQIAO,板橋')
    );
    SET @cnt2 = (
      SELECT COUNT(Power.northUsage) FROM Power
        WHERE (Power.updateTime >= DATE_ADD(curr_time, INTERVAL 4 DAY))
        AND (Power.updateTime < DATE_ADD(curr_time, INTERVAL '4 6' DAY_HOUR))
    );
    IF @cnt1 <> 96 OR @cnt2 <> 6 THEN
      -- Incomplete window: move on to the next start hour.
      SET curr_time = DATE_ADD(curr_time, INTERVAL 1 HOUR);
      ITERATE generate_record;
    END IF;
    SET @filename = CONCAT(@path, CAST(curr_time AS CHAR), '.csv');
    -- The file name cannot be bound as a parameter, so it is concatenated
    -- into the statement text; the time bounds are bound with '?'.
    -- ORDER BY is applied to the OUTER query: an ORDER BY inside a derived
    -- table does not guarantee the order of the rows read from it.
    SET @query_string = CONCAT(
      "SELECT A.Y INTO OUTFILE '", @filename, "'
        FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
      FROM (
        (
          SELECT 逐時觀測.時間 AS X, 逐時觀測.溫度 AS Y FROM 逐時觀測
          WHERE (逐時觀測.時間 >= ?)
          AND (逐時觀測.時間 < DATE_ADD(?, INTERVAL 4 DAY))
          AND (逐時觀測.測站 = 'BANQIAO,板橋')
        ) UNION ALL (
          SELECT Power.updateTime AS X, Power.northUsage AS Y FROM Power
          WHERE (Power.updateTime >= DATE_ADD(?, INTERVAL 4 DAY))
          AND (Power.updateTime < DATE_ADD(?, INTERVAL '4 6' DAY_HOUR))
        )
      ) A
      ORDER BY A.X ASC"
    );
    -- EXECUTE ... USING is documented to take user variables, so the local
    -- loop variable is copied into one before being bound.
    SET @record_start = curr_time;
    PREPARE stmt FROM @query_string;
    EXECUTE stmt USING @record_start, @record_start, @record_start, @record_start;
    DEALLOCATE PREPARE stmt;
    SET curr_time = DATE_ADD(curr_time, INTERVAL 1 HOUR);
  END WHILE;
END//
DELIMITER ;
-- Try every start hour from 2016-07-03 01:00:00 through 2017-07-04 00:00:00
-- (both bounds inclusive).
SET @from_time = CAST('2016-07-03 01:00:00' AS DATETIME);
SET @to_time = CAST('2017-07-04 00:00:00' AS DATETIME);
-- Schema-qualified like the CREATE PROCEDURE, so the call does not depend
-- on `dm` being the default database of the session.
CALL dm.generate_dataset(@from_time, @to_time);
```

Combine all records into `dataset.csv`.

In [2]:
import os


def combine_records(source_dir='.', target_path='dataset.csv'):
    """Merge the per-record CSV files into a single dataset file.

    Every file in ``source_dir`` whose name starts with ``'201'`` and ends
    with ``'.csv'`` holds one record, one value per line. Each such file
    becomes one comma-separated row of ``target_path``.

    Files are processed in sorted name order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the row order of the dataset
    would change from run to run.

    Args:
        source_dir: Directory containing the per-record CSV files.
        target_path: Path of the combined file; it is overwritten.
    """
    record_names = sorted(
        name for name in os.listdir(source_dir)
        if name.startswith('201') and name.endswith('.csv')
    )
    with open(target_path, 'w') as target:
        for name in record_names:
            with open(os.path.join(source_dir, name)) as record:
                values = [line.strip() for line in record]
            target.write(','.join(values) + '\n')


# Same defaults as the original script: read the current directory and
# write dataset.csv next to the record files.
combine_records()