In [1]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import gzip
import grader

# Time Series Data: Predict Temperature
Time series prediction presents its own challenges which are different from machine-learning problems.  As with many other classes of problems, there are a number of common features in these predictions.

## A note on scoring
It **is** possible to score >1 on these questions. This indicates that you've beaten our reference model - we compare our model's score on a test set to your score on a test set. See how high you can go!

## Fetch the data:

In [3]:
!aws s3 sync s3://dataincubator-course/mldata/ . --exclude '*' --include 'train.txt.gz'

download: s3://dataincubator-course/mldata/train.txt.gz to ./train.txt.gz


The columns of the data correspond to the
  - year
  - month
  - day
  - hour
  - temp
  - dew_temp
  - pressure
  - wind_angle
  - wind_speed
  - sky_code
  - rain_hour
  - rain_6hour
  - city

This function will read the data from a file handle into a Pandas DataFrame.  Feel free to use it, or to write your own version to load it in the format you desire.

In [4]:
def load_stream(stream):
    return pd.read_table(stream, sep=' *',
                         names=['year', 'month', 'day', 'hour', 'temp',
                                'dew_temp', 'pressure', 'wind_angle', 
                                'wind_speed', 'sky_code', 'rain_hour',
                                'rain_6hour', 'city'])

In [5]:
# df = load_stream(gzip.open('train.txt.gz', 'r'))

  


The temperature is reported in tenths of a degree Celcius.  However, not all the values are valid.  Examine the data, and remove the invalid rows.

In [6]:
import gzip
with gzip.open('train.txt.gz', 'rb') as f:
    content = f.readlines()

In [7]:
import re
for line in content:
    re.sub('\n','',line)
import pandas as pd
data = pd.read_csv('train.txt.gz',sep=r"\s*",header=None)
data.columns = ['year','month','day','hour','temp','dew_temp','pressure','wind_angle','wind_speed','sky_code','rain_hour','rain_6hour','city']


  """


In [32]:
data.head(1)

Unnamed: 0,year,month,day,hour,temp,dew_temp,pressure,wind_angle,wind_speed,sky_code,rain_hour,rain_6hour,city,time
0,2000,1,1,0,-11,-72,10197,220,26,4,0,0,bos,2000-01-01


In [105]:
from datetime import datetime

data['time'] = data.apply(lambda x:datetime.strptime("{0} {1} {2} {3}:00:00".format(x['year'],x['month'], x['day'],x['hour']), "%Y %m %d %H:%M:%S"),axis=1)

In [106]:
len(data['time'])

525869

In [110]:
data.loc[1]

year                         2000
month                           1
day                             1
hour                            1
temp                           -6
dew_temp                      -78
pressure                    10206
wind_angle                    230
wind_speed                     26
sky_code                        2
rain_hour                       0
rain_6hour                  -9999
city                          bos
time          2000-01-01 01:00:00
Name: 1, dtype: object

In [10]:
df = data[['time','temp','dew_temp','pressure','wind_angle','wind_speed','sky_code','rain_hour','rain_6hour','city']]

In [33]:
df.head(1)

Unnamed: 0,time,temp,dew_temp,pressure,wind_angle,wind_speed,sky_code,rain_hour,rain_6hour,city
0,2000-01-01,-11,-72,10197,220,26,4,0,0,bos


In [124]:
df_city = data.groupby('city')#['month','day','year','hour'].mean()

# df_city.head(1)

In [126]:
len(df_city.groups)

5

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import pipeline

We will focus on using the temporal elements to predict the temperature.

## Per city model

It makes sense for each city to have it's own model.  Build a "groupby" estimator that takes an estimator factory as an argument and builds the resulting "groupby" estimator on each city.  That is, `fit` should create and fit a model per city, while the `predict` method should look up the corresponding model and perform a predict on each.  An estimator factory is something that returns an estimator each time it is called.  It could be a function or a class.

In [4]:
from sklearn import base

class GroupbyEstimator(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self, column, estimator_factory):
        # column is the value to group by; estimator_factory can be
        # called to produce estimators
    
    def fit(self, X, y):
        # Create an estimator and fit it with the portion in each group
        return self

    def predict(self, X):
        # Call the appropriate predict method for each row of X
        return ...

IndentationError: expected an indented block (<ipython-input-4-c912183c1e77>, line 9)

In [78]:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelectTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        import numpy as np
        import pandas as pd
        pass
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        import numpy as np
        import pandas as pd
        if X == 'train.txt.gz':
            data = pd.read_csv(X,sep=r"\s*",header=None)
            data.columns = ['year','month','day','hour','temp','dew_temp','pressure','wind_angle','wind_speed','sky_code','rain_hour','rain_6hour','city']
            data = data[data.temp != -9999]
            data.city = data.city.replace('bos',1)
            data.city = data.city.replace('bal',2)
            data.city = data.city.replace('chi',3)
            data.city = data.city.replace('nyc',4)
            data.city = data.city.replace('phi',5)
            return np.array(data[['month','day','hour','city']])
        else:
#             print X
            data = X.split()
            if data[12] == 'bos':
                city = 1
            elif data[12] == 'bal':
                city = 2
            elif data[12] == 'chi':
                city = 3
            elif data[12] == 'nyc':
                city = 4
            elif data[12] == 'phi':
                city=5
            return np.array([int(data[1]),int(data[2]),int(data[3]),city])

# Questions

For each question, build a model to predict the temperature in a given city at a given time.  You will be given a list of records, each a string in the same format as the lines in the training file.  Return a list of predicted temperatures, one for each incoming record.  (As you can imagine, the temperature values will be stripped out in the actual text records.)

## month_hour_model
Seasonal features are nice because they are relatively safe to extrapolate into the future. There are two ways to handle seasonality.  

The simplest (and perhaps most robust) is to have a set of indicator variables. That is, make the assumption that the temperature at any given time is a function of only the month of the year and the hour of the day, and use that to predict the temperature value.

**Question**: Should month be a continuous or categorical variable?  (Recall that [one-hot encoding](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) is useful to deal with categorical variables.)

In [121]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_extraction import DictVectorizer
month_hour_pipe = pipeline.Pipeline([
  ('transformer', ColumnSelectTransform()),
  ('estimator', KNeighborsRegressor(n_neighbors=5))
#   ('est', LinearRegression())
  ])
month_hour_pipe.fit('train.txt.gz', data[data.temp != -9999].temp)

  del sys.path[0]


Pipeline(steps=[('transformer', ColumnSelectTransform()), ('estimator', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))])

In [119]:
from sklearn.cross_validation import cross_val_score

# cv_test_error = -cross_val_score(month_hour_pipe, 'train.txt.gz', data.temp,cv=5, scoring='neg_mean_squared_error')
# cv_test_error.mean()
month_hour_pipe.score('train.txt.gz', data[data.temp != -9999].temp)

  del sys.path[0]


0.085968493454737449

In [None]:
import dill
dill.dump(month_hour_pipe, open("month_hour_pipe", 'wb'))

In [44]:
def season_factory():
    #return ... # A single estimator or a pipeline
    return month_hour_pipe

# season_model = GroupbyEstimator('city', season_factory).fit(df, df['temp'])

In [18]:
test_data = [
    u"2000 01 01 00   -11   -72 10197   220    26     4     0     0 bos",
    u"2000 01 01 01    -6   -78 10206   230    26     2     0 -9999 bos",
    u"2000 01 01 02   -17   -78 10211   230    36     0     0 -9999 bos",
    u"2000 01 01 03   -17   -78 10214   230    36     0     0 -9999 bos",
    u"2000 01 01 04   -17   -78 10216   230    36     0     0 -9999 bos",
]

In [81]:
def month_hour_model(line):
    return month_hour_pipe.predict(line)[0]

In [82]:
def month_hour_model_all(lines):
    return [month_hour_model(line) for line in lines]

In [117]:
month_hour_model_all(test_data)



[1.0,
 17.800000000000001,
 -36.600000000000001,
 -24.399999999999999,
 0.20000000000000001]

You will need to write a function that makes predictions from a list of strings.  You can either create a pipeline with a transformer and the `season_model`, or you can write a helper function to convert the lines to the format you expect.

In [43]:
y = [0] * 10
y

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [85]:
# grader.score('ts__month_hour_model', lambda x: [0] * len(x))

In [None]:
# grader.score('ts__month_hour_model', month_hour_model_all)



## fourier_model
Since we know that temperature is roughly sinusoidal, we know that a reasonable model might be

$$ y_t = y_0 \sin\left(2\pi\frac{t - t_0}{T}\right) + \epsilon $$

where $k$ and $t_0$ are parameters to be learned and $T$ is one year for seasonal variation.  While this is linear in $y_0$, it is not linear in $t_0$. However, we know from Fourier analysis, that the above is
equivalent to

$$ y_t = A \sin\left(2\pi\frac{t}{T}\right) + B \cos\left(2\pi\frac{t}{T}\right) + \epsilon $$

which is linear in $A$ and $B$.

Create a model containing sinusoidal terms on one or more time scales, and fit it to the data using a linear regression.

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.linear_model
import sklearn.metrics
# import statsmodels.api as sm
from scipy import fftpack
#create a column with "true" (future) values
period_month = 20
prediction_lag = 3 * period_month

cut_year = 2008
oil = pd.read_csv("train.txt.gz")

In [98]:
data = pd.read_csv('train.txt.gz',sep=r"\s*",header=None)
data.columns = ['year','month','day','hour','temp','dew_temp','pressure','wind_angle','wind_speed','sky_code','rain_hour','rain_6hour','city']

  """Entry point for launching an IPython kernel.


In [99]:
data.head()

Unnamed: 0,year,month,day,hour,temp,dew_temp,pressure,wind_angle,wind_speed,sky_code,rain_hour,rain_6hour,city
0,2000,1,1,0,-11,-72,10197,220,26,4,0,0,bos
1,2000,1,1,1,-6,-78,10206,230,26,2,0,-9999,bos
2,2000,1,1,2,-17,-78,10211,230,36,0,0,-9999,bos
3,2000,1,1,3,-17,-78,10214,230,36,0,0,-9999,bos
4,2000,1,1,4,-17,-78,10216,230,36,0,0,-9999,bos


In [101]:
# oil['Julian'] = oil.index.to_julian_date()
# oil = sm.add_constant(oil)
# train = oil[oil.index.year < cut_year].dropna(how="any")
# train.head()
# train = oil

In [None]:
grader.score('ts__fourier_model', month_hour_model_all)# lambda x: [0] * len(x))



*Copyright &copy; 2016 The Data Incubator.  All rights reserved.*