In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# Directory holding the CSV files, relative to the notebook.
root = 'data/'

# Every timestamp column that gets parsed uses the same explicit format.
train_df = pd.read_csv(root + 'train.csv')
weather_train_df = pd.read_csv(root + 'weather_train.csv')

train_df["timestamp"] = pd.to_datetime(
    train_df["timestamp"], format='%Y-%m-%d %H:%M:%S'
)
weather_train_df["timestamp"] = pd.to_datetime(
    weather_train_df["timestamp"], format='%Y-%m-%d %H:%M:%S'
)

# Lookup / auxiliary tables. test.csv is not loaded in this notebook, and the
# timestamp column of weather_test.csv is left unparsed.
building_meta_df = pd.read_csv(root + 'building_metadata.csv')
sample_submission = pd.read_csv(root + 'sample_submission.csv')
weather_test_df = pd.read_csv(root + 'weather_test.csv')

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds them.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are narrowed to float16 or float32 when their range allows it, otherwise
    stored as float64. Columns whose dtype is not one of the listed numeric
    dtypes (objects, datetimes, ...) are left untouched.

    Note: narrowing floats to float16/float32 is chosen on range alone, so it
    can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered from narrowest to widest; the first fit wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN min/max (empty or all-NaN column) fail every comparison and
            # therefore fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte baseline.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Shrink the test-period weather table, then show the fraction of missing
# values in each of its columns.
weather_test_df = reduce_mem_usage(weather_test_df)
weather_test_df.isna().mean()

Mem. usage decreased to  6.08 Mb (68.1% reduction)


site_id               0.000000
timestamp             0.000000
air_temperature       0.000375
cloud_coverage        0.506588
dew_temperature       0.001179
precip_depth_1_hr     0.344781
sea_level_pressure    0.076702
wind_direction        0.044618
wind_speed            0.001659
dtype: float64

In [5]:
# Downcast the three training-side tables, in the same order as before
# (each call prints its own memory report).
train_df, weather_train_df, building_meta_df = [
    reduce_mem_usage(frame)
    for frame in (train_df, weather_train_df, building_meta_df)
]

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [6]:
# Show the column names of the meter-reading table.
train_df.columns.values

array(['building_id', 'meter', 'timestamp', 'meter_reading'], dtype=object)

In [7]:
# Show the column names of the building metadata table.
building_meta_df.columns.values

array(['site_id', 'building_id', 'primary_use', 'square_feet',
       'year_built', 'floor_count'], dtype=object)

In [8]:
# Fraction of missing values in each column of the training-period weather table.
weather_train_df.isna().mean()

site_id               0.000000
timestamp             0.000000
air_temperature       0.000393
cloud_coverage        0.494895
dew_temperature       0.000808
precip_depth_1_hr     0.359791
sea_level_pressure    0.075966
wind_direction        0.044844
wind_speed            0.002175
dtype: float64

In [9]:
# Attach the building metadata to every reading, then keep only the meter-0
# readings of a single building (id 390, an office according to its metadata).
b_data = train_df.merge(building_meta_df, on='building_id')
b_0 = b_data[(b_data['building_id'] == 390) & (b_data['meter'] == 0)]
print("rows:", len(b_0))
b_0.head()

rows: 8782


Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
4988920,390,0,2016-01-01 00:00:00,56.84,3,Office,34400,,
4988921,390,0,2016-01-01 01:00:00,58.549999,3,Office,34400,,
4988922,390,0,2016-01-01 02:00:00,58.290001,3,Office,34400,,
4988923,390,0,2016-01-01 03:00:00,58.810001,3,Office,34400,,
4988924,390,0,2016-01-01 04:00:00,59.099998,3,Office,34400,,


In [10]:
# Join the selected building's readings with the weather observed at the same
# site and timestamp (inner join: readings without a weather row are dropped).
b = (
    b_0
    .drop(columns=['meter', 'primary_use', 'year_built', 'floor_count'])
    .merge(weather_train_df, on=['site_id', 'timestamp'])
)
b['timestamp'] = pd.to_datetime(b["timestamp"], format='%Y-%m-%d %H:%M:%S')
print("records:", len(b))

# Of the weather columns only the two temperatures are kept; rows with any
# remaining missing value are then discarded.
b = (
    b
    .drop(columns=['cloud_coverage', 'precip_depth_1_hr', 'sea_level_pressure',
                   'wind_direction', 'wind_speed'])
    .dropna()
)
print("records after dropping NaN:", len(b))

# Derived columns:
#   Y        - meter reading divided by the building's square_feet
#   workhour - True when the hour is within 5.5 h of noon (i.e. 07:00-17:00)
#   workday  - True for weekday indices 0-4 (Monday-Friday)
# NOTE(review): square_feet is not checked for zero before the division.
b = b.assign(
    Y=b['meter_reading'] / b['square_feet'],
    workhour=(b['timestamp'].dt.hour - 12).abs() < 5.5,
    workday=b['timestamp'].dt.weekday < 5,
)

print("rows in dataset:", len(b))
b.head()

records: 8778
records after dropping NaN: 8772
rows in dataset: 8772


Unnamed: 0,building_id,timestamp,meter_reading,site_id,square_feet,air_temperature,dew_temperature,Y,workhour,workday
0,390,2016-01-01 00:00:00,56.84,3,34400,10.0,2.199219,0.001652,False,True
1,390,2016-01-01 01:00:00,58.549999,3,34400,9.398438,2.800781,0.001702,False,True
2,390,2016-01-01 02:00:00,58.290001,3,34400,8.898438,2.199219,0.001694,False,True
3,390,2016-01-01 03:00:00,58.810001,3,34400,7.800781,1.099609,0.00171,False,True
4,390,2016-01-01 04:00:00,59.099998,3,34400,7.800781,0.600098,0.001718,False,True


In [11]:
# NOTE: zero meter readings look abnormal; they are not filtered out here.

# Keep only the building id, the two temperatures, the target Y and the two
# flags, and store the id and the boolean flags as integers.
bld = (
    b
    .drop(columns=['meter_reading', 'site_id', 'timestamp', 'square_feet'])
    .astype({'building_id': int, 'workhour': int, 'workday': int})
)
bld.head()



Unnamed: 0,building_id,air_temperature,dew_temperature,Y,workhour,workday
0,390,10.0,2.199219,0.001652,0,1
1,390,9.398438,2.800781,0.001702,0,1
2,390,8.898438,2.199219,0.001694,0,1
3,390,7.800781,1.099609,0.00171,0,1
4,390,7.800781,0.600098,0.001718,0,1


In [None]:
# The model table as a plain NumPy array (one row per record).
bld.to_numpy()

In [None]:
def b_id_to_vec(b_id, n_buildings=1449):
    """Return a one-hot vector of length ``n_buildings`` for a building id.

    Parameters
    ----------
    b_id : int or float
        Building id; it is truncated with ``int()`` before use.
    n_buildings : int, default 1449
        Length of the returned vector, i.e. the number of distinct ids.

    Returns
    -------
    numpy.ndarray
        Float vector of zeros with a single 1 at position ``int(b_id)``.

    Raises
    ------
    IndexError
        If the id is outside ``[0, n_buildings)``. Negative ids are rejected
        explicitly; plain NumPy indexing would silently count from the end.
    """
    idx = int(b_id)
    if not 0 <= idx < n_buildings:
        raise IndexError(
            "building id {} is out of range for {} buildings".format(idx, n_buildings)
        )
    vec = np.zeros(n_buildings)
    vec[idx] = 1
    return vec


def to_XY(matrix, n_buildings=1449):
    """Split rows of ``(b, t1, t2, y, h, d)`` into a feature matrix and a target.

    Each feature row is the one-hot encoding of ``b`` followed by
    ``t1 / 50``, ``t2 / 50``, ``h`` and ``d``; the target is ``y``.

    Parameters
    ----------
    matrix : sequence of 6-element rows
        Presumably the rows of ``bld.values`` (building id, air temperature,
        dew temperature, Y, workhour, workday) - confirm against the caller.
    n_buildings : int, default 1449
        Width of the one-hot part; the feature matrix has ``n_buildings + 4``
        columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` with shapes ``(len(matrix), n_buildings + 4)`` and
        ``(len(matrix),)``.
    """
    # NOTE(review): t2 is scaled by 50 here, while the notebook's direct
    # feature scaling divides the second column by 10 - confirm which is meant.
    n_rows = len(matrix)
    X = np.zeros((n_rows, n_buildings + 4))
    Y = np.zeros(n_rows)
    for i in range(n_rows):
        b, t1, t2, y, h, d = matrix[i]
        # Fill the row in two slices instead of allocating an hstack per row.
        X[i, :n_buildings] = b_id_to_vec(b, n_buildings)
        X[i, n_buildings:] = (t1 / 50, t2 / 50, h, d)
        Y[i] = y
    return (X, Y)
    

In [16]:
# Target: the Y column (fourth column of bld), as a float64 array.
Y = bld['Y'].to_numpy(dtype=np.float64)

# Features: everything except the building id and the target, i.e.
# air_temperature, dew_temperature, workhour, workday - each divided by its
# own scale factor.
b_x = bld.drop(columns=['building_id', 'Y'])
X = b_x.to_numpy() / np.array([50, 10, 1, 1])

In [17]:
# Small fully connected network: 4 inputs -> 6 ReLU units -> 1 ReLU output.
# NOTE(review): a ReLU output can only emit non-negative values - confirm
# that this is intended rather than a linear output unit.
model = Sequential([
    Dense(6, input_dim=4, activation="relu"),
    Dense(1, activation="relu"),
])

Instructions for updating:
Colocations handled automatically by placer.


In [18]:
# Train on mean squared error with Adam. The target is continuous, so the
# progress metric is mean absolute error; 'accuracy' is a classification
# metric and carries no information for a regression target.
model.compile(loss="mean_squared_error", optimizer='adam', metrics=['mae'])

In [None]:
# Fit on the full feature matrix; the returned object is kept in `history`.
history = model.fit(
    x=X,
    y=Y,
    batch_size=64,
    epochs=100,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100


In [None]:
# Predict with the network trained in this notebook. The name `nn` that was
# used here before is never defined in the notebook, so the cell failed on a
# fresh kernel.
g = model.predict(X)

# The first feature column is air_temperature divided by 50.
plt.scatter(X[:, 0], np.ravel(g))
plt.xlabel("air_temperature / 50")
plt.ylabel("predicted Y (meter_reading / square_feet)")
plt.title("Model prediction vs. scaled air temperature")
plt.show()

In [None]:
# Inspect the learned weights and biases of the network trained in this
# notebook (`nn` is never defined here, so `nn.coefs_` raised NameError).
model.get_weights()