In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [2]:
sf_data = pd.read_csv("./raw_data/sanfrancisco.csv")

In [3]:
raw_columns = list(sf_data)

In [160]:
unused_columns = ['snow_1h', 'snow_24h', 'rain_24h', 'rain_1h', 'rain_today', 'snow_today', 'weather_icon', 'weather_id', 'condition_id', 'sea_level', 'grnd_level', 'lat', 'lon', 'city_id', 'city_name']

In [164]:
used_columns = columns = list(set(raw_columns) - set(unused_columns))

In [190]:
def cleanup(raw_data, used_columns):
    data = raw_data.loc[:, used_columns] 
    print("fill_na")
    data['rain_3h'] = data['rain_3h'].fillna(0)
    data['snow_3h'] = data['snow_3h'].fillna(0)
    print("drop_duplicates")
    data.drop_duplicates('dt', inplace=True)
    print("add_new_dada")
    data = add_new_data(data)
    
    data = data.apply(transform_datetime, axis=1)
    
    unused_columns  = ['dt_iso', 'condition', 'condition_details', 'dt_datetime']
    
    data = data.drop(unused_columns, axis=1)
    data = data.reset_index(drop=True)
    return data

def add_new_data(data):
    data['dt_datetime'] =  pd.to_datetime(data['dt_iso'], format='%Y-%m-%d %H:%M:%S +%f %Z')
    weather_description_columns = list(set(data['condition_details']))
    weather_main_columns = list(set(data['condition']))
    data = transform_categorical_data(data, weather_description_columns,weather_main_columns)

    return data

def transform_categorical_data(data, weather_description_columns, weather_main_columns):
    for column in weather_description_columns:
        data[column] = data['condition_details'] == column
        data[column] = data[column].astype(int)
        
    for column in weather_main_columns:
        data[column] = data['condition'] == column
        data[column] = data[column].astype(int)
    return data
    
def transform_datetime(current_data):
    for month in range(1, 12):
        current_data['month_{}'.format(month)] = 1 if current_data['dt_datetime'].month == month else 0

    current_data['year'] =  current_data['dt_datetime'].year
    current_data['dayofweek'] = current_data['dt_datetime'].dayofweek
    current_data['dayofyear'] = current_data['dt_datetime'].dayofyear
    current_data['hourofday'] = current_data['dt_datetime'].hour
    return current_data

In [166]:
sf_data2 = cleanup(sf_data[1:], used_columns)

fill_na
drop_duplicates
add_new_dada


In [167]:
def add_target_data(data):
    data['target_temperature'] = data['temperature'][1:].append(pd.Series([np.nan]) , ignore_index=True)
    return data

In [168]:
sf_data3 = add_target_data(sf_data2)

In [169]:
# def add_previous_datum(raw_data):
#     data = raw_data.copy()
#     diff_columns = list(set(raw_data.columns) - set(['dt', 'dt_iso', 'dt_datetime']))
#     maximum_prev = 2 *  24
#     data = data.apply(add_previous_data, args=(diff_columns, data, maximum_prev), axis=1)
#     return data

# def add_previous_data(current_data,  diff_columns, raw_data, maximum_prev):
#     index = current_data.name
#     print(index)
#     if index == 0:
#         return current_data
#     prev_right = index - 1
#     prev_left = index - maximum_prev if index - maximum_prev > 0 else 0
       
#     while prev_left <= prev_right:
#         current_data = add_diff_data(current_data, raw_data.iloc[prev_left], maximum_prev, diff_columns)
#         prev_left += 1
#     return current_data

# def add_diff_data(current_data, prev_data, maximum_prev, diff_columns):
#     diff = int(pd.Timedelta(current_data['dt_datetime'] - prev_data['dt_datetime']).seconds/ 3600)
#     if diff > 0 and diff < maximum_prev:
#         for diff_column in diff_columns:
#             column_name = '{}_{}_ago'.format(diff_column, diff)
#             current_data[column_name] = prev_data[diff_column]
#     return current_data

# # add new data by merging np array and adding dummy data 
# #[NALL, NALL, data1, data2]

In [170]:
hour_diffs =  [1,2,3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]

In [171]:
def add_diff_data(raw_data):
    data = raw_data.copy()
    diff_columns = list(set(raw_data.columns) - set(['dt', 'dt_iso', 'dt_datetime', 'target_temperature', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'year', 'dayofyear', 'dayofweek', 'hourofday', 'temperature_min', 'temperature_max'] + weather_description_columns))
    maximum_prev = 2 *  24
    for i in hour_diffs:
        for column in diff_columns:
            data['{}_{}_ago'.format(column, i)] = pd.Series(np.repeat(np.nan, i)).append(data[column][:-i] , ignore_index=True)
    return data

In [172]:
sf_data4 = add_diff_data(sf_data3)

In [173]:
def remove_diff_null_data(raw_data):
    data = raw_data[48:-1]
    return data

In [174]:
sf_data5 = remove_diff_null_data(sf_data4)

In [175]:
sf_data5['target_temperature'].tail()

922    284.82
923    283.65
924    282.17
925    286.47
926    286.47
Name: target_temperature, dtype: float64

In [176]:
sf_data5['year'].value_counts()

2012    879
Name: year, dtype: int64

In [177]:
corr_data = sf_data5.corr()

In [178]:
corr_data['target_temperature'].sort_values(ascending=False)

target_temperature    1.000000
temperature           0.953572
temperature_min       0.893503
temperature_1_ago     0.867928
temperature_max       0.842872
temperature_2_ago     0.754620
temperature_21_ago    0.730963
temperature_24_ago    0.705506
temperature_3_ago     0.618969
temperature_45_ago    0.517360
temperature_18_ago    0.465166
temperature_48_ago    0.427962
temperature_27_ago    0.388665
temperature_42_ago    0.352472
hourofday             0.345891
humidity_9_ago        0.344411
humidity_12_ago       0.304721
Clear_12_ago          0.288914
Clear_15_ago          0.263123
humidity_33_ago       0.245208
wind_speed_42_ago     0.244765
wind_degree_42_ago    0.240296
wind_speed_21_ago     0.240253
Clear_9_ago           0.237604
wind_speed_45_ago     0.229842
wind_speed_18_ago     0.220732
temperature_6_ago     0.206740
Clear_36_ago          0.191657
wind_degree_18_ago    0.188099
humidity_6_ago        0.187106
                        ...   
snow_3h_6_ago              NaN
rain_3h_

In [179]:
sf_data5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 48 to 926
Columns: 359 entries, pressure to wind_degree_48_ago
dtypes: float64(312), int64(47)
memory usage: 2.4 MB


In [180]:
y_label = sf_data5['target_temperature']
sf_data6 = sf_data5.drop(['target_temperature'], axis=1)

## Modeling

In [181]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):

        return X[self.attribute_names].values

In [195]:
#weather_main_columns = list(set(weather_main_columns) - set(['Snow', 'Smoke', 'Squall']))
category_columns = []
for i in range(1, 12):
    category_columns.append('month_{}'.format(i))
for i in hour_diffs:
    for column in set(weather_main_columns):
        category_columns.append('{}_{}_ago'.format(column, i))
category_columns += weather_main_columns
numercial_columns = list(set(sf_data6) - set(category_columns))

In [197]:
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(category_columns))
])

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(numercial_columns)),
    ('scalar', StandardScaler())
])

In [202]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
    #('pca', PCA())
    
])



In [203]:
sf_train = full_pipeline.fit_transform(sf_data6)

In [214]:
len(sf_train[0])
sf_train

array([[ -1.32057515e+00,   7.68536019e-01,  -8.56203799e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -1.32057515e+00,   7.68536019e-01,   1.46419533e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -1.32057515e+00,  -1.69009196e+00,  -1.31990849e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  1.39598360e+00,   7.68536019e-01,  -4.89824989e-01, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  1.39598360e+00,   8.56344161e-01,   5.97432265e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -1.04891927e+00,   7.68536019e-01,   3.65058902e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [218]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
sgd_clf = SGDRegressor(random_state=42, eta0=0.01, alpha=0.001, penalty='l1')
sgd_clf.partial_fit(sf_train, y_label)

SGDRegressor(alpha=0.001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l1', power_t=0.25,
       random_state=42, shuffle=True, verbose=0, warm_start=False)

In [219]:
sgd_clf.predict(sf_train[-1])



array([ 270.25062764])

In [222]:
y_label[-1:]


926    286.47
Name: target_temperature, dtype: float64