In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [2]:
sf_data = pd.read_csv("./raw_data/sanfrancisco.csv")

In [3]:
raw_columns = list(sf_data)

In [4]:
unused_columns = ['snow_1h', 'snow_24h', 'rain_24h', 'rain_1h', 'snow_3h', 'rain_today', 'snow_today', 'weather_icon', 'weather_id', 'sea_level', 'grnd_level', 'lat', 'lon', 'city_id', 'city_name']

In [5]:
used_columns = columns = list(set(raw_columns) - set(unused_columns))

In [6]:
sf_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38415 entries, 0 to 38414
Data columns (total 28 columns):
dt                     38415 non-null int64
dt_iso                 38415 non-null object
city_id                38415 non-null int64
city_name              0 non-null float64
lat                    0 non-null float64
lon                    0 non-null float64
temp                   38415 non-null float64
temp_min               38415 non-null float64
temp_max               38415 non-null float64
pressure               38415 non-null int64
sea_level              0 non-null float64
grnd_level             0 non-null float64
humidity               38415 non-null int64
wind_speed             38415 non-null int64
wind_deg               38415 non-null int64
rain_1h                1866 non-null float64
rain_3h                1355 non-null float64
rain_24h               97 non-null float64
rain_today             137 non-null float64
snow_1h                3 non-null float64
snow_3h        

In [7]:
def cleanup(raw_data, used_columns):
    data = raw_data.loc[:, used_columns] 
    print("fill_na")
    data['rain_3h'] = data['rain_3h'].fillna(0)
    print("drop_duplicates")
    data.drop_duplicates('dt', inplace=True)
    print("add_new_dada")
    data = add_new_data(data)
    
    unused_columns  = ['dt_iso', 'weather_main', 'weather_description']
    
    data = data.drop(unused_columns, axis=1)
    data = data.reset_index(drop=True)
    return data

def add_new_data(data):
    data['dt_datetime'] =  pd.to_datetime(data['dt_iso'], format='%Y-%m-%d %H:%M:%S +%f %Z')
    weather_description_columns = list(set(data['weather_description']))
    weather_main_columns = list(set(data['weather_main']))
    data = transform_categorical_data(data, weather_description_columns,weather_main_columns)

    return data

def transform_categorical_data(data, weather_description_columns, weather_main_columns):
    for column in weather_description_columns:
        data[column] = data['weather_description'] == column
        data[column] = data[column].astype(int)
        
    for column in weather_main_columns:
        data[column] = data['weather_main'] == column
        data[column] = data[column].astype(int)
    return data
    

In [8]:
sf_data2 = cleanup(sf_data, used_columns)


fill_na
drop_duplicates
add_new_dada


In [9]:
sf_data2.head(10)

Unnamed: 0,wind_deg,temp_max,humidity,temp,pressure,clouds_all,temp_min,wind_speed,rain_3h,dt,...,Thunderstorm,Rain,Squall,Smoke,Drizzle,Snow,Haze,Clear,Clouds,Mist
0,150,301.15,88,289.48,1009,0,282.15,2,0.0,1349096400,...,0,0,0,0,1,0,0,0,0,0
1,0,301.15,63,289.13,1015,1,282.15,0,0.0,1349186400,...,0,0,0,0,0,0,0,1,0,0
2,0,302.15,51,290.73,1016,1,283.15,0,0.0,1349190000,...,0,0,0,0,0,0,0,1,0,0
3,190,307.15,56,293.02,1011,0,287.15,4,0.0,1349193600,...,0,0,0,0,0,0,1,0,0,0
4,140,307.15,94,296.18,1001,0,290.15,3,0.0,1349197200,...,0,0,0,0,0,0,1,0,0,0
5,230,305.93,70,299.66,1009,40,293.71,4,0.0,1349200800,...,0,0,0,0,0,0,0,0,1,0
6,170,308.15,83,300.03,1000,90,293.15,6,0.0,1349204400,...,0,0,0,0,0,0,0,0,1,0
7,250,308.71,75,301.07,1007,0,293.15,3,0.0,1349208000,...,0,0,0,0,0,0,1,0,0,0
8,260,308.71,75,302.29,1007,75,295.15,3,0.0,1349211600,...,0,0,0,0,0,0,0,0,1,0
9,300,309.26,18,304.7,1013,1,296.15,6,0.0,1349215200,...,0,0,0,0,0,0,0,1,0,0


In [10]:
def add_previous_datum(raw_data):
    data = raw_data.copy()
    diff_columns = list(set(raw_data.columns) - set(['dt', 'dt_iso', 'dt_datetime']))
    maximum_prev = 2 *  24
    data = data.apply(add_previous_data, args=(diff_columns, data, maximum_prev), axis=1)
    return data

def add_previous_data(current_data,  diff_columns, raw_data, maximum_prev):
    index = current_data.name
    print(index)
    if index == 0:
        return current_data
    prev_right = index - 1
    prev_left = index - maximum_prev if index - maximum_prev > 0 else 0
       
    while prev_left <= prev_right:
        current_data = add_diff_data(current_data, raw_data.iloc[prev_left], maximum_prev, diff_columns)
        prev_left += 1
    return current_data

def add_diff_data(current_data, prev_data, maximum_prev, diff_columns):
    diff = int(pd.Timedelta(current_data['dt_datetime'] - prev_data['dt_datetime']).seconds/ 3600)
    if diff > 0 and diff < maximum_prev:
        for diff_column in diff_columns:
            column_name = '{}_{}_ago'.format(diff_column, diff)
            current_data[column_name] = prev_data[diff_column]
    return current_data

# add new data by merging np array and adding dummy data 
#[NALL, NALL, data1, data2]


In [11]:
pd.Series(np.repeat(np.nan, 4)).append(sf_data2['dt_datetime'][:0], ignore_index=True)


0   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

In [None]:
def add_diff_data2(raw_data):
    data = raw_data.copy()
    diff_columns = list(set(raw_data.columns) - set(['dt', 'dt_iso', 'dt_datetime']))
    maximum_prev = 2 *  24
    for i in range(1, maximum_prev):
        for column in diff_columns:
            print('{}_{}'.format(column, i))
            data['{}_{}_ago'.format(column, i)] = pd.Series(np.repeat(np.nan, i)).append(data[column][:-i] , ignore_index=True)
    return data    

In [None]:
sf_data3 = add_diff_data2(sf_data2)

mist_1
proximity shower rain_1
light rain_1
Squall_1
Smoke_1
heavy intensity drizzle_1
proximity thunderstorm_1
broken clouds_1
few clouds_1
Snow_1
Clear_1
temp_min_1
thunderstorm with light rain_1
moderate rain_1
SQUALLS_1
wind_deg_1
temp_max_1
heavy intensity rain_1
proximity thunderstorm with drizzle_1
Thunderstorm_1
fog_1
temp_1
shower rain_1
pressure_1
thunderstorm with light drizzle_1
proximity thunderstorm with rain_1
rain_3h_1
haze_1
drizzle_1
Fog_1
humidity_1
thunderstorm with heavy rain_1
light snow_1
thunderstorm with rain_1
light intensity shower rain_1
light intensity drizzle_1
Drizzle_1
smoke_1
very heavy rain_1
Haze_1
clouds_all_1
wind_speed_1
Sky is Clear_1
scattered clouds_1
Mist_1
overcast clouds_1
Rain_1
thunderstorm_1
sky is clear_1
heavy snow_1
Clouds_1
mist_2
proximity shower rain_2
light rain_2
Squall_2
Smoke_2
heavy intensity drizzle_2
proximity thunderstorm_2
broken clouds_2
few clouds_2
Snow_2
Clear_2
temp_min_2
thunderstorm with light rain_2
moderate rain_2
S

broken clouds_12
few clouds_12
Snow_12
Clear_12
temp_min_12
thunderstorm with light rain_12
moderate rain_12
SQUALLS_12
wind_deg_12
temp_max_12
heavy intensity rain_12
proximity thunderstorm with drizzle_12
Thunderstorm_12
fog_12
temp_12
shower rain_12
pressure_12
thunderstorm with light drizzle_12
proximity thunderstorm with rain_12
rain_3h_12
haze_12
drizzle_12
Fog_12
humidity_12
thunderstorm with heavy rain_12
light snow_12
thunderstorm with rain_12
light intensity shower rain_12
light intensity drizzle_12
Drizzle_12
smoke_12
very heavy rain_12
Haze_12
clouds_all_12
wind_speed_12
Sky is Clear_12
scattered clouds_12
Mist_12
overcast clouds_12
Rain_12
thunderstorm_12
sky is clear_12
heavy snow_12
Clouds_12
mist_13
proximity shower rain_13
light rain_13
Squall_13
Smoke_13
heavy intensity drizzle_13
proximity thunderstorm_13
broken clouds_13
few clouds_13
Snow_13
Clear_13
temp_min_13
thunderstorm with light rain_13
moderate rain_13
SQUALLS_13
wind_deg_13
temp_max_13
heavy intensity rain

broken clouds_22
few clouds_22
Snow_22
Clear_22
temp_min_22
thunderstorm with light rain_22
moderate rain_22
SQUALLS_22
wind_deg_22
temp_max_22
heavy intensity rain_22
proximity thunderstorm with drizzle_22
Thunderstorm_22
fog_22
temp_22
shower rain_22
pressure_22
thunderstorm with light drizzle_22
proximity thunderstorm with rain_22
rain_3h_22
haze_22
drizzle_22
Fog_22
humidity_22
thunderstorm with heavy rain_22
light snow_22
thunderstorm with rain_22
light intensity shower rain_22
light intensity drizzle_22
Drizzle_22
smoke_22
very heavy rain_22
Haze_22
clouds_all_22
wind_speed_22
Sky is Clear_22
scattered clouds_22
Mist_22
overcast clouds_22
Rain_22
thunderstorm_22
sky is clear_22
heavy snow_22
Clouds_22
mist_23
proximity shower rain_23
light rain_23
Squall_23
Smoke_23
heavy intensity drizzle_23
proximity thunderstorm_23
broken clouds_23
few clouds_23
Snow_23
Clear_23
temp_min_23
thunderstorm with light rain_23
moderate rain_23
SQUALLS_23
wind_deg_23
temp_max_23
heavy intensity rain

Rain_32
thunderstorm_32
sky is clear_32
heavy snow_32
Clouds_32
mist_33
proximity shower rain_33
light rain_33
Squall_33
Smoke_33
heavy intensity drizzle_33
proximity thunderstorm_33
broken clouds_33
few clouds_33
Snow_33
Clear_33
temp_min_33
thunderstorm with light rain_33
moderate rain_33
SQUALLS_33
wind_deg_33
temp_max_33
heavy intensity rain_33
proximity thunderstorm with drizzle_33
Thunderstorm_33
fog_33
temp_33
shower rain_33
pressure_33
thunderstorm with light drizzle_33
proximity thunderstorm with rain_33
rain_3h_33
haze_33
drizzle_33
Fog_33
humidity_33
thunderstorm with heavy rain_33
light snow_33
thunderstorm with rain_33
light intensity shower rain_33
light intensity drizzle_33
Drizzle_33
smoke_33
very heavy rain_33
Haze_33
clouds_all_33
wind_speed_33
Sky is Clear_33
scattered clouds_33
Mist_33
overcast clouds_33
Rain_33
thunderstorm_33
sky is clear_33
heavy snow_33
Clouds_33
mist_34
proximity shower rain_34
light rain_34
Squall_34
Smoke_34
heavy intensity drizzle_34
proximi

In [None]:
def add_datetime(raw_data):
    data = raw_data.copy()
    data = data.apply(transform_datetime)
    data = data.drop(['dt_datetime'], axis=1)
    return data
    
def transform_datetime(current_data):
    for month in range(1, 12):
        current_data['month_{}'.format(month)] = 1 if current_data['dt_datetime'].month == month else 0
       
    current_data['year'] =  current_data['dt_datetime'].year
    current_data['dayofweek'] = current_data['dt_datetime'].dayofweek
    current_data['dayofyear'] = current_data['dt_datetime'].dayofyear
    current_data['hourofday'] = current_data['dt_datetime'].hour
    return current_data

In [None]:
def add_target_datum(raw_data):
    data = raw_data.copy()
    data = data.apply(args=(data))
    return data

def add_target_data(current_data, raw_data):
    index = current_data.name
    last_index = raw_data.index[-1]
    if index == last_index:
        return current_data
    
    current_data['target_temperature'] = raw_data.iloc[index+1]['temparature']
    return current_data

In [None]:
sf_data2.index[-1]