In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [32]:
# Read the raw San Francisco export and keep only its last 51 rows.
sf_data = pd.read_csv("./raw_data/sanfrancisco.csv").iloc[-51:]

In [33]:
raw_columns = list(sf_data)

In [34]:
# Raw columns that are left out when the working column list is built.
unused_columns = [
    'snow_1h', 'snow_24h', 'rain_24h', 'rain_1h', 'rain_today', 'snow_today',
    'weather_icon', 'weather_id', 'condition_id',
    'sea_level', 'grnd_level',
    'lat', 'lon', 'city_id', 'city_name',
]

In [35]:
# Keep the columns in their raw order instead of round-tripping through a set:
# string hashing is randomized per interpreter, so a set difference would hand
# back the columns in a different order after every kernel restart.
# dict.fromkeys() drops repeated names (as the set did) while preserving order.
used_columns = columns = [
    name for name in dict.fromkeys(raw_columns) if name not in unused_columns
]

In [36]:
# Weather description labels. Note that 'Sky is Clear' and 'sky is clear' are
# two distinct entries, and 'SQUALLS' is upper-case.
weather_description_columns = [
    'light intensity drizzle',
    'thunderstorm',
    'scattered clouds',
    'overcast clouds',
    'smoke',
    'mist',
    'heavy intensity drizzle',
    'moderate rain',
    'thunderstorm with heavy rain',
    'Sky is Clear',
    'very heavy rain',
    'light intensity shower rain',
    'broken clouds',
    'thunderstorm with light drizzle',
    'drizzle',
    'proximity thunderstorm with rain',
    'shower rain',
    'proximity thunderstorm with drizzle',
    'SQUALLS',
    'heavy snow',
    'haze',
    'heavy intensity rain',
    'few clouds',
    'thunderstorm with rain',
    'proximity thunderstorm',
    'light rain',
    'sky is clear',
    'light snow',
    'thunderstorm with light rain',
    'proximity shower rain',
    'fog',
]

In [37]:
# Top-level weather condition labels; used further down to name the
# categorical feature columns and their lagged variants.
weather_main_columns = [
    'Thunderstorm',
    'Squall',
    'Snow',
    'Mist',
    'Rain',
    'Smoke',
    'Drizzle',
    'Clear',
    'Clouds',
    'Haze',
    'Fog',
]

In [38]:
def cleanup(raw_data, used_columns):
    """Turn the raw weather frame into a de-duplicated, feature-engineered frame.

    Steps, in order: select ``used_columns``, fill missing ``rain_3h`` /
    ``snow_3h`` with 0, drop rows that repeat a ``dt`` value, add the
    indicator and timestamp columns via ``add_new_data``, add the calendar
    columns row by row via ``transform_datetime``, then drop the helper
    columns and reset the index. A progress message is printed before the
    first three steps.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame containing at least the columns named in ``used_columns``.
    used_columns : list
        Column labels to keep from ``raw_data``.

    Returns
    -------
    pandas.DataFrame
        The processed frame with a fresh 0..n-1 index.
    """
    frame = raw_data.loc[:, used_columns]

    print("fill_na")
    for precipitation in ('rain_3h', 'snow_3h'):
        frame[precipitation] = frame[precipitation].fillna(0)

    print("drop_duplicates")
    frame.drop_duplicates('dt', inplace=True)

    print("add_new_dada")
    frame = add_new_data(frame)
    frame = frame.apply(transform_datetime, axis=1)

    # Source columns that are no longer needed once the derived features exist.
    helper_columns = ['dt_iso', 'condition', 'condition_details', 'dt_datetime']
    return frame.drop(helper_columns, axis=1).reset_index(drop=True)

def add_new_data(data):
    """Add a parsed timestamp column and 0/1 indicator columns for the weather labels.

    ``dt_iso`` is parsed into a new ``dt_datetime`` column, then one indicator
    column is created for every distinct value found in ``condition_details``
    and in ``condition`` (via ``transform_categorical_data``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``dt_iso``, ``condition`` and ``condition_details`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``transform_categorical_data``.
    """
    # NOTE(review): '+%f' appears to absorb a numeric offset such as '+0000'
    # as fractional seconds -- confirm against the actual dt_iso format.
    data['dt_datetime'] = pd.to_datetime(data['dt_iso'], format='%Y-%m-%d %H:%M:%S +%f %Z')

    # unique() keeps first-appearance order, so the generated columns come out
    # in the same order on every run (a set of strings does not guarantee
    # that). Missing labels are skipped rather than becoming a column named NaN.
    description_labels = data['condition_details'].dropna().unique().tolist()
    main_labels = data['condition'].dropna().unique().tolist()

    return transform_categorical_data(data, description_labels, main_labels)

def transform_categorical_data(data, weather_description_columns, weather_main_columns):
    """Add one 0/1 indicator column per weather label.

    For each label in ``weather_description_columns`` a column of that name is
    set to 1 where ``condition_details`` equals the label; the same is then
    done for ``weather_main_columns`` against ``condition``.

    The columns are written into ``data`` itself, which is also returned.
    """
    encodings = (
        ('condition_details', weather_description_columns),
        ('condition', weather_main_columns),
    )
    for source, labels in encodings:
        for label in labels:
            data[label] = (data[source] == label).astype(int)
    return data
    
def transform_datetime(current_data):
    """Add calendar features derived from a row's ``dt_datetime`` value.

    Writes ``month_1`` .. ``month_11`` (1 for the matching month, else 0)
    followed by ``year``, ``dayofweek``, ``dayofyear`` and ``hourofday`` into
    ``current_data`` and returns that same object.

    NOTE(review): the month loop stops at 11, so no ``month_12`` entry is
    produced and a December timestamp gets 0 in every month flag -- confirm
    this is the intended encoding.
    """
    stamp = current_data['dt_datetime']

    for month in range(1, 12):
        current_data['month_{}'.format(month)] = int(stamp.month == month)

    calendar_parts = (
        ('year', stamp.year),
        ('dayofweek', stamp.dayofweek),
        ('dayofyear', stamp.dayofyear),
        ('hourofday', stamp.hour),
    )
    for key, value in calendar_parts:
        current_data[key] = value
    return current_data

In [39]:
sf_data2 = cleanup(sf_data, used_columns)


fill_na
drop_duplicates
add_new_dada


In [40]:
sf_data2[48:-1]

Unnamed: 0,clouds_all,pressure,temperature,wind_degree,id,temperature_min,humidity,dt,snow_3h,rain_3h,...,month_6,month_7,month_8,month_9,month_10,month_11,year,dayofweek,dayofyear,hourofday
48,1,1025,290.53,190,38414,289.15,44,1512860400,0.0,0.0,...,0,0,0,0,0,0,2017,5,343,23


In [41]:
def add_target_data(data):
    """Add ``target_temperature``: the ``temperature`` of the following row.

    The last row has no successor and therefore gets NaN.

    ``shift(-1)`` replaces the earlier ``Series.append`` construction, which
    no longer exists in pandas 2.x and only lined up with the frame when its
    index was exactly 0..n-1. Shifting is positional and keeps the frame's
    own index, so it gives the same values for such an index and stays
    correct for any other.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``temperature`` column. The new column is written into
        this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now including ``target_temperature``.
    """
    data['target_temperature'] = data['temperature'].shift(-1)
    return data

In [42]:
sf_data3 = add_target_data(sf_data2)

In [43]:
# def add_previous_datum(raw_data):
#     data = raw_data.copy()
#     diff_columns = list(set(raw_data.columns) - set(['dt', 'dt_iso', 'dt_datetime']))
#     maximum_prev = 2 *  24
#     data = data.apply(add_previous_data, args=(diff_columns, data, maximum_prev), axis=1)
#     return data

# def add_previous_data(current_data,  diff_columns, raw_data, maximum_prev):
#     index = current_data.name
#     print(index)
#     if index == 0:
#         return current_data
#     prev_right = index - 1
#     prev_left = index - maximum_prev if index - maximum_prev > 0 else 0
       
#     while prev_left <= prev_right:
#         current_data = add_diff_data(current_data, raw_data.iloc[prev_left], maximum_prev, diff_columns)
#         prev_left += 1
#     return current_data

# def add_diff_data(current_data, prev_data, maximum_prev, diff_columns):
#     diff = int(pd.Timedelta(current_data['dt_datetime'] - prev_data['dt_datetime']).seconds/ 3600)
#     if diff > 0 and diff < maximum_prev:
#         for diff_column in diff_columns:
#             column_name = '{}_{}_ago'.format(diff_column, diff)
#             current_data[column_name] = prev_data[diff_column]
#     return current_data

# # Add the lagged data by prepending NaN padding to each column's values:
# # [NaN, NaN, data1, data2]

In [44]:
# Lag offsets used for the "<column>_<n>_ago" features:
# 1 and 2, then every third value from 3 up to and including 48.
hour_diffs = [1, 2] + list(range(3, 49, 3))

In [45]:
def add_diff_data(raw_data, hours=None, excluded_columns=None):
    """Return a copy of ``raw_data`` extended with lagged copies of its columns.

    For every lag ``n`` in ``hours`` and every eligible column ``c`` a column
    ``"<c>_<n>_ago"`` is added that holds the value of ``c`` from ``n`` rows
    earlier; the first ``n`` rows of it are NaN.

    NOTE(review): lags are row offsets. They correspond to hours only if the
    rows are hourly and gap-free -- confirm for the input data.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input frame; it is not modified.
    hours : iterable of int, optional
        Lag offsets. Defaults to the notebook-level ``hour_diffs``.
    excluded_columns : iterable, optional
        Extra column names that must not be lagged. Defaults to the
        notebook-level ``weather_description_columns``.

    Returns
    -------
    pandas.DataFrame
        ``raw_data`` columns followed by the lag columns, ordered by lag and
        then by the original column order.
    """
    if hours is None:
        hours = hour_diffs
    if excluded_columns is None:
        excluded_columns = weather_description_columns

    # Identifier, target and calendar columns are never lagged.
    skipped = {
        'dt', 'dt_iso', 'dt_datetime', 'target_temperature',
        'year', 'dayofyear', 'dayofweek', 'hourofday',
        'temperature_min', 'temperature_max',
    }
    skipped.update('month_{}'.format(month) for month in range(1, 13))
    skipped.update(excluded_columns)

    # Follow the frame's own column order so the output layout is reproducible
    # (a set difference would reorder the columns from run to run).
    diff_columns = [column for column in raw_data.columns if column not in skipped]

    # shift() replaces the Series.append padding trick, which was removed in
    # pandas 2.x; it is positional and keeps the frame's index.
    lagged = [
        raw_data[column].shift(lag).rename('{}_{}_ago'.format(column, lag))
        for lag in hours
        for column in diff_columns
    ]

    # A freshly computed lag column supersedes an existing column of that name.
    new_names = {series.name for series in lagged}
    base = raw_data.drop(columns=[name for name in raw_data.columns if name in new_names])

    # One concat instead of hundreds of single-column inserts avoids a
    # fragmented frame.
    return pd.concat([base] + lagged, axis=1)

In [46]:
sf_data4 = add_diff_data(sf_data3)

In [47]:
sf_data3

Unnamed: 0,clouds_all,pressure,temperature,wind_degree,id,temperature_min,humidity,dt,snow_3h,rain_3h,...,month_7,month_8,month_9,month_10,month_11,year,dayofweek,dayofyear,hourofday,target_temperature
0,1,1026,290.69,320,38365,288.15,28,1512687600,0.0,0.0,...,0,0,0,0,0,2017,3,341,23,290.66
1,1,1027,290.66,300,38366,289.15,55,1512691200,0.0,0.0,...,0,0,0,0,0,2017,4,342,0,289.59
2,1,1027,289.59,50,38367,287.15,67,1512694800,0.0,0.0,...,0,0,0,0,0,2017,4,342,1,286.18
3,40,1027,286.18,60,38368,282.15,71,1512698400,0.0,0.0,...,0,0,0,0,0,2017,4,342,2,284.4
4,90,1027,284.4,60,38369,280.15,84,1512702000,0.0,0.0,...,0,0,0,0,0,2017,4,342,3,283.76
5,90,1027,283.76,60,38370,279.15,84,1512705600,0.0,0.0,...,0,0,0,0,0,2017,4,342,4,282.81
6,90,1028,282.81,79,38371,279.15,46,1512709200,0.0,0.0,...,0,0,0,0,0,2017,4,342,5,282.25
7,90,1028,282.25,340,38372,279.15,49,1512712800,0.0,0.0,...,0,0,0,0,0,2017,4,342,6,280.89
8,1,1030,280.89,79,38373,276.15,70,1512716400,0.0,0.0,...,0,0,0,0,0,2017,4,342,7,280.06
9,90,1029,280.06,50,38374,276.15,52,1512720000,0.0,0.0,...,0,0,0,0,0,2017,4,342,8,278.99


In [48]:
def remove_diff_null_data(raw_data, max_lag=48, horizon=1):
    """Drop the rows that cannot have complete lag features or a target.

    The first ``max_lag`` rows are removed (their lag columns contain NaN
    padding) together with the last ``horizon`` rows (their target is NaN).
    With the defaults this selects the same rows as ``raw_data[48:-1]``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame to trim; it is not modified.
    max_lag : int, optional
        Number of leading rows to discard. Defaults to 48.
    horizon : int, optional
        Number of trailing rows to discard. Defaults to 1.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the remaining rows, with their original index
        labels. Empty when the frame is too short.
    """
    # Compute the end position explicitly: a literal "-horizon" slice bound
    # would select nothing when horizon is 0.
    stop = max(len(raw_data) - horizon, 0)
    return raw_data.iloc[max_lag:stop].copy()

In [49]:
sf_data5 = remove_diff_null_data(sf_data4)

In [50]:
sf_data5['target_temperature'].tail()

48    290.15
Name: target_temperature, dtype: float64

In [51]:
sf_data5['year'].value_counts()

2017    1
Name: year, dtype: int64

In [52]:
corr_data = sf_data5.corr()

In [53]:
corr_data['target_temperature'].sort_values(ascending=False)

clouds_all           NaN
pressure             NaN
temperature          NaN
wind_degree          NaN
id                   NaN
temperature_min      NaN
humidity             NaN
dt                   NaN
snow_3h              NaN
rain_3h              NaN
temperature_max      NaN
wind_speed           NaN
sky is clear         NaN
mist                 NaN
broken clouds        NaN
scattered clouds     NaN
haze                 NaN
overcast clouds      NaN
fog                  NaN
Clouds               NaN
Haze                 NaN
Mist                 NaN
Clear                NaN
Fog                  NaN
month_1              NaN
month_2              NaN
month_3              NaN
month_4              NaN
month_5              NaN
month_6              NaN
                      ..
id_42_ago            NaN
wind_speed_42_ago    NaN
Clouds_45_ago        NaN
Haze_45_ago          NaN
clouds_all_45_ago    NaN
pressure_45_ago      NaN
Mist_45_ago          NaN
temperature_45_ago   NaN
wind_degree_45_ago   NaN


In [54]:
sf_data5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 48 to 48
Columns: 292 entries, clouds_all to wind_speed_48_ago
dtypes: float64(258), int64(34)
memory usage: 2.4 KB


In [65]:
# Separate the regression target from the feature columns.
target_column = 'target_temperature'
y_label = sf_data5[target_column]
sf_data6 = sf_data5.drop(columns=[target_column])

In [69]:
np.any(np.isfinite(sf_data6))

48    0
dtype: int64

## Modeling

In [57]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns and returns their values."""

    def __init__(self, attribute_names):
        """Remember which column labels ``transform`` should select."""
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``.values`` of the ``attribute_names`` columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [58]:
#weather_main_columns = list(set(weather_main_columns) - set(['Snow', 'Smoke', 'Squall']))
# Build both feature-name lists in a reproducible order. Iterating a set of
# strings (or taking a set difference) yields a different order after every
# kernel restart, which would silently reorder the columns of the training
# matrix between runs.
category_columns = ['month_{}'.format(month) for month in range(1, 12)]
category_columns += [
    '{}_{}_ago'.format(condition, lag)
    for lag in hour_diffs
    # dict.fromkeys() de-duplicates like set() did, but keeps list order.
    for condition in dict.fromkeys(weather_main_columns)
]
category_columns += weather_main_columns

# Every remaining column of the feature frame is treated as numerical; the
# frame's own column order is preserved.
category_lookup = set(category_columns)
numercial_columns = [name for name in sf_data6 if name not in category_lookup]

In [59]:
# Expected category columns that are absent from the feature frame.
backfilled_columns = list(set(category_columns) - set(sf_data6))
# Add each missing column filled with 0 so that every name in
# category_columns can be selected from sf_data6. This mutates sf_data6.
for column in backfilled_columns:
    sf_data6[column] = 0

In [60]:
# Categorical branch: select the indicator columns and pass them through as-is.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(category_columns)),
])

# Numerical branch: select the remaining columns, then standardize them.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numercial_columns)),
    ('scalar', StandardScaler()),
])

In [61]:
from sklearn.pipeline import FeatureUnion

# Concatenate the outputs of the numerical branch, the categorical branch and
# a PCA step.
# NOTE(review): as a FeatureUnion member, PCA is fitted on the same input that
# is handed to the union rather than on the output of the two branches above
# -- confirm this is intended.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    ('pca', PCA()),
])



In [62]:
sf_data6

Unnamed: 0,clouds_all,pressure,temperature,wind_degree,id,temperature_min,humidity,dt,snow_3h,rain_3h,...,Rain_2_ago,Squall_6_ago,Smoke_3_ago,Squall_12_ago,Rain_1_ago,Smoke_1_ago,Rain_24_ago,Drizzle,Thunderstorm_12_ago,Snow_42_ago
48,1,1025,290.53,190,38414,289.15,44,1512860400,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
sf_train = full_pipeline.fit_transform(sf_data6)

  explained_variance_ratio_ = explained_variance_ / total_var


In [64]:
len(sf_train[0])
sf_train

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
# Linear regressor fitted by stochastic gradient descent with an L1 penalty;
# random_state is fixed so repeated runs start from the same seed.
sgd_clf = SGDRegressor(random_state=42, eta0=0.01, alpha=0.001, penalty='l1')
# partial_fit performs an incremental update on the samples given here
# (the transformed feature matrix and its target values).
sgd_clf.partial_fit(sf_train, y_label)

In [None]:
# predict() expects a 2-D (n_samples, n_features) array. Slicing with [-1:]
# keeps the last row as a 1-row matrix, whereas sf_train[-1] is a 1-D vector
# that scikit-learn rejects.
sgd_clf.predict(sf_train[-1:])

In [None]:
y_label[-1:]
