# Overview
Time series prediction presents its own challenges, which are different from
those of other machine-learning problems.  Like many other classes of
problems, it also has a number of special features that are common to it.

## Fetch the data:
http://s3.amazonaws.com/thedataincubator/coursedata/mldata/train.txt.gz

The columns of the data correspond to:
  - year
  - month
  - day
  - hour
  - temp
  - dew_temp
  - pressure
  - wind_angle
  - wind_speed
  - sky_code
  - rain_hour
  - rain_6hour
  - city

We will focus on using the temporal elements to predict the temperature.


In [1]:
# Sample raw records in the same whitespace-separated layout as train.txt.gz.
# Field order: year, month, day, hour, temp, dew_temp, pressure, wind_angle,
# wind_speed, sky_code, rain_hour, rain_6hour, city.
# The loading code in this notebook treats temp == -9999 as a missing value;
# presumably -9999 means "missing" in the other numeric fields too (TODO confirm).
test_data = [
    u"2000 01 01 00   -11   -72 10197   220    26     4     0     0 bos",
    u"2000 01 01 01    -6   -78 10206   230    26     2     0 -9999 bos",
    u"2000 01 01 02   -17   -78 10211   230    36     0     0 -9999 bos",
    u"2000 01 01 03   -17   -78 10214   230    36     0     0 -9999 bos",
    u"2000 01 01 04   -17   -78 10216   230    36     0     0 -9999 bos",
]

In [2]:
import gzip
import numpy as np
# Parse train.txt.gz into rows of [year, month, day, hour, temp, city].
# A temp of -9999 is treated as a missing reading and stored as None.
data_list = []
with gzip.open('train.txt.gz', 'rb') as f:
    for line in f:
        # gzip yields bytes on Python 3; decode so `city` is always text.
        if not isinstance(line, str):
            line = line.decode('ascii')
        # split() with no argument collapses the runs of padding spaces, so
        # no empty tokens need to be filtered out afterwards.
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines instead of raising IndexError
        row = [int(value) for value in fields[0:5]]
        if row[4] == -9999:
            row[4] = None
        row.append(fields[12])
        data_list.append(row)
# Later cells build the DataFrame from `new_data_list`; it holds the same
# row objects as `data_list`.
new_data_list = list(data_list)

In [3]:
import pandas as pd

# One row per observation; the labels follow the order in which each parsed
# row was assembled (four date parts, the temperature, then the city code).
column_names = ['year', 'month', 'day', 'hour', 'temp', 'city']
df = pd.DataFrame(new_data_list)
df.columns = column_names

In [4]:
# Mean temperature for every (city, month, hour) combination, kept as a
# one-column DataFrame indexed by those three keys.
group_keys = ['city', 'month', 'hour']
df_1 = df.groupby(group_keys)['temp'].mean().to_frame()
df_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp
city,month,hour,Unnamed: 3_level_1
bal,1,0,15.606469
bal,1,1,10.180108
bal,1,2,6.483871
bal,1,3,2.793011
bal,1,4,-0.454301


In [5]:
# Define the estimator

from sklearn.base import BaseEstimator, RegressorMixin
class MyEstimator(BaseEstimator, RegressorMixin):
    """Predict temperature as the training mean for (city, month, hour)."""

    def __init__(self):
        pass

    def fit(self, df):
        """Store the mean 'temp' per (city, month, hour) group of `df`.

        `df` must provide 'city', 'month', 'hour' and 'temp' columns.
        Returns self so calls can be chained.
        """
        grouped_temp = df.groupby(['city', 'month', 'hour'])['temp']
        self.average = grouped_temp.mean().to_frame()
        return self

    def predict(self, record):
        """Return the stored mean matching one raw whitespace-separated record.

        Field 12 is the city, field 1 the month and field 3 the hour.  The
        result is also kept on `self.prediction`.  Raises KeyError when the
        (city, month, hour) combination was not seen during fit.
        """
        fields = record.split()
        lookup_key = (fields[12], int(fields[1]), int(fields[3]))
        self.prediction = self.average.loc[lookup_key].values[0]
        return self.prediction

In [6]:
import dill
# Fit the mean-by-(city, month, hour) model and serialize it to ./tsq1.
myestimator = MyEstimator()
q1estimator = myestimator.fit(df)
# Pickle streams are binary, so the file must be opened with 'wb'; the `with`
# block guarantees the handle is flushed and closed even if dump() raises.
with open('tsq1', 'wb') as fh:
    # dill.dump returns None; the name is kept only for backward compatibility.
    tsq1_estimator = dill.dump(q1estimator, fh)
# Sanity check on one of the sample records.
q1estimator.predict(test_data[4])

-19.091397849462364

# Question 2. Fourier Model

In [351]:
from scipy import fftpack

# Parse train.txt.gz into rows of [city, "YYYY MM DD HH", temp].
# A temp of -9999 is treated as a missing reading and stored as None.
q2_data_list = []
with gzip.open('train.txt.gz', 'rb') as f:
    for line in f:
        # gzip yields bytes on Python 3; decode so the joined date string and
        # the city code are text.
        if not isinstance(line, str):
            line = line.decode('ascii')
        # split() with no argument collapses the runs of padding spaces.
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines instead of raising IndexError
        temp = int(fields[4])
        if temp == -9999:
            temp = None
        q2_data_list.append([fields[12], ' '.join(fields[0:4]), temp])

df_q2 = pd.DataFrame(q2_data_list, columns=['city', 'date', 'temp'])

In [352]:
# Build the feature table used by the seasonal model: city, Julian date,
# temperature, and the sine/cosine of the position within a 365.25-day year.
# The date strings are "YYYY MM DD HH"; an explicit format keeps parsing
# deterministic and avoids slow per-row format inference.
df_q2['date'] = pd.to_datetime(df_q2['date'], format='%Y %m %d %H')
series = pd.Series(df_q2['temp'].values, index=df_q2['date'].values)
temps_df = pd.DataFrame()
temps_df['city'] = df_q2['city']
temps_df['Julian'] = series.index.to_julian_date()
temps_df['temp'] = df_q2['temp']
# Angle of each observation within the yearly cycle, in radians.
year_phase = temps_df['Julian'] / 365.25 * 2 * np.pi
temps_df['sin(year)'] = np.sin(year_phase)
temps_df['cos(year)'] = np.cos(year_phase)

# Rows with a missing temperature cannot be used for fitting.
temps_df = temps_df.dropna(how='any')

In [372]:
from sklearn import linear_model
class fftestimator(BaseEstimator, RegressorMixin):
    """Seasonal temperature model for a single city.

    Regresses temperature on the sine and cosine of the position within a
    365.25-day year, using cross-validated ridge regression fitted only on
    the training rows whose 'city' equals `city`.
    """

    def __init__(self, city):
        self.city = city
        self.clf = linear_model.RidgeCV(cv=2)

    def fit(self, temps_df):
        """Fit on the rows of `temps_df` that belong to this estimator's city.

        `temps_df` must provide 'city', 'sin(year)', 'cos(year)' and 'temp'
        columns.  Returns self so calls can be chained.
        """
        city_df = temps_df.loc[temps_df['city'] == self.city]
        X_train = city_df[['sin(year)', 'cos(year)']].values
        Y_train = city_df['temp'].values
        self.q2 = self.clf.fit(X_train, Y_train)
        return self

    def predict(self, record):
        """Predict the temperature for one raw whitespace-separated record.

        Only the first four fields (year, month, day, hour) are used; the
        city field is not consulted because the estimator is city-specific.
        """
        fields = record.split()
        date_str = ' '.join(fields[0:4])
        # Go through a DatetimeIndex so the Julian date is computed by the
        # same routine that produced the training features.
        moment = pd.to_datetime([date_str], format='%Y %m %d %H')
        julian = moment.to_julian_date()[0]
        year_phase = julian / 365.25 * 2 * np.pi
        # scikit-learn expects a 2-D (n_samples, n_features) array, so the
        # single sample is wrapped in an outer list.
        features = [[np.sin(year_phase), np.cos(year_phase)]]
        return self.q2.predict(features)[0]

In [370]:
# Train one seasonal model per city and index the fitted models by city code.
city_list = ['bal', 'bos', 'chi', 'nyc', 'phi']
estimator_dict = {}
for city in city_list:
    q2estimator = fftestimator(city)
    estimator = q2estimator.fit(temps_df)
    estimator_dict[city] = estimator
# Same fitted models, in the order of `city_list`.
estimator_list = [estimator_dict[name] for name in city_list]

In [381]:
# Smoke test: route one raw record to the model trained for its city
# (field 12 of the record) and predict its temperature.
record = "2011 12  30  20  -11   -72 10197   220    26     4     0     0 nyc"
city = record.split()[12]
city_model = estimator_dict[city]
city_model.predict(record)

17.585740098490703