In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [None]:
# Directory that holds the input CSV files (relative to the working directory).
root = 'data/'

# Tables that are read without any further processing in this cell.
building_meta_df = pd.read_csv(f'{root}building_metadata.csv')
sample_submission = pd.read_csv(f'{root}sample_submission.csv')
weather_test_df = pd.read_csv(f'{root}weather_test.csv')

# Training meter readings: parse the textual timestamp column into datetimes
# using the explicit layout.
train_df = pd.read_csv(f'{root}train.csv')
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

# Training-period weather: same timestamp handling as the readings.
weather_train_df = pd.read_csv(f'{root}weather_train.csv')
weather_train_df["timestamp"] = pd.to_datetime(weather_train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

# NOTE: test.csv is not loaded in this cell, and the timestamp column of
# weather_test_df is left exactly as read from the file.

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place (columns are reassigned) and also returned,
    so both ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.
    use_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits
        in float16 are cast to float16. float16 keeps only about three
        significant decimal digits, so pass ``False`` to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits is used.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # If min/max are NaN (e.g. an all-NaN column) no comparison is
            # true and the column ends up as float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink the weather_test frame, then show the share of missing values per column.
weather_test_df = reduce_mem_usage(weather_test_df)
weather_test_df.isna().sum().div(len(weather_test_df))

In [None]:
# Downcast the three training-side frames, in the same order as before
# (readings, weather, building metadata).
train_df, weather_train_df, building_meta_df = (
    reduce_mem_usage(frame)
    for frame in (train_df, weather_train_df, building_meta_df)
)

In [None]:
# Show the column labels of the training frame as an array.
train_df.columns.values

In [None]:
# Show the column labels of the building metadata frame as an array.
building_meta_df.columns.values

In [None]:
# Fraction of missing values in each column of the training weather frame.
weather_train_df.isna().sum().div(len(weather_train_df))

In [None]:
# Attach building metadata to every meter reading, then keep only the rows
# measured by meter 0. No filter on primary_use is applied here, so buildings
# of every use type remain in b_0.
b_data = train_df.merge(building_meta_df, on='building_id')
b_0 = b_data[b_data['meter'] == 0]
print("rows:", len(b_0))
b_0.head()

In [None]:
# Join the meter-0 readings with the weather observed at the same site and
# hour, then derive the regression target and two calendar flags.

# Drop the metadata columns that are not used below, then match weather rows
# on (site_id, timestamp).
b = (
    b_0.drop(columns=['meter', 'primary_use', 'year_built', 'floor_count'])
       .merge(weather_train_df, on=['site_id', 'timestamp'])
)
b['timestamp'] = pd.to_datetime(b["timestamp"], format='%Y-%m-%d %H:%M:%S')
print("records:", len(b))

# Remove the weather columns that are not used as features before discarding
# incomplete rows, so that gaps in those columns do not cost us records.
b = b.drop(
    columns=['cloud_coverage', 'precip_depth_1_hr', 'sea_level_pressure',
             'wind_direction', 'wind_speed']
).dropna()
print("records after dropping NaN:", len(b))

# Y        : meter reading per square foot. Rows with square_feet == 0 are not
#            filtered out, so the division is unguarded for such rows.
# workhour : True when |hour - 12| < 5.5, i.e. for hours 7 through 17.
# workday  : True for weekday numbers 0-4 (Monday to Friday).
b = b.assign(
    Y=lambda frame: frame['meter_reading'] / frame['square_feet'],
    workhour=lambda frame: np.abs(frame['timestamp'].dt.hour - 12) < 5.5,
    workday=lambda frame: frame['timestamp'].dt.weekday < 5,
)

print("rows in dataset:", len(b))
b.head()

In [None]:
# Observation: zero values of meter_reading look abnormal.

# Keep only the columns fed to the model and turn the id and the two boolean
# flags into plain integers.
# NOTE(review): to_XY unpacks six values per row, so bld is presumably
# expected to have exactly six columns in that order — confirm against the CSV schema.
bld = (
    b.drop(columns=['meter_reading', 'site_id', 'timestamp', 'square_feet'])
     .astype({'building_id': int, 'workhour': int, 'workday': int})
)
bld.head()



In [None]:
# Show bld as a plain array; a slice of this array is what gets passed to to_XY.
bld.values

In [None]:
def b_id_to_vec(b_id, n_buildings=1449):
    """Return a one-hot vector for a building id.

    Parameters
    ----------
    b_id : int or float
        Building id; it is truncated with ``int()`` before use.
    n_buildings : int, default 1449
        Length of the returned vector (number of distinct building slots).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_buildings,)`` that is 0 everywhere except
        for a 1 at position ``int(b_id)``.

    Raises
    ------
    IndexError
        If ``int(b_id)`` is outside ``[0, n_buildings)``. Negative ids are
        rejected explicitly; otherwise they would wrap around and mark a
        slot counted from the end of the vector.
    """
    idx = int(b_id)
    if not 0 <= idx < n_buildings:
        raise IndexError(
            "building id {} is out of range for {} buildings".format(idx, n_buildings)
        )
    vec = np.zeros(n_buildings)
    vec[idx] = 1
    return vec

def to_XY(matrix, n_buildings=1449):
    """Split a 6-column matrix into a feature matrix ``X`` and a target vector ``Y``.

    Each input row is read as ``(b, t1, t2, y, h, d)``. The produced feature
    row is the one-hot encoding of ``int(b)`` over ``n_buildings`` slots,
    followed by ``t1 / 50``, ``t2 / 50``, ``h`` and ``d``; ``y`` goes to ``Y``.

    The encoding is done with array operations, so no per-row one-hot vector
    is allocated.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, 6)
        Numeric rows in the column order described above.
    n_buildings : int, default 1449
        Width of the one-hot part; ``X`` has ``n_buildings + 4`` columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` with shapes ``(n_rows, n_buildings + 4)`` and ``(n_rows,)``.

    Raises
    ------
    ValueError
        If the rows do not have exactly six values, or an id is not finite.
    IndexError
        If a building id falls outside ``[0, n_buildings)``.
    """
    rows = np.asarray(matrix, dtype=float)
    n_rows = len(rows)
    X = np.zeros((n_rows, n_buildings + 4))
    Y = np.zeros(n_rows)
    if n_rows == 0:
        # Nothing to encode; return correctly shaped empty arrays.
        return (X, Y)
    if rows.ndim != 2 or rows.shape[1] != 6:
        raise ValueError(
            "expected rows of 6 values, got array of shape {}".format(rows.shape)
        )
    if not np.isfinite(rows[:, 0]).all():
        raise ValueError("building ids must be finite numbers")

    ids = rows[:, 0].astype(int)
    # Out-of-range ids would otherwise land in (or wrap into) other columns.
    if ((ids < 0) | (ids >= n_buildings)).any():
        raise IndexError(
            "building id out of range for {} buildings".format(n_buildings)
        )

    X[np.arange(n_rows), ids] = 1        # one-hot building id
    X[:, n_buildings] = rows[:, 1] / 50      # t1, scaled by 1/50
    X[:, n_buildings + 1] = rows[:, 2] / 50  # t2, scaled by 1/50
    X[:, n_buildings + 2] = rows[:, 4]       # h
    X[:, n_buildings + 3] = rows[:, 5]       # d
    Y[:] = rows[:, 3]
    return (X, Y)
    

In [None]:
# Encode only the first 100 rows of bld into a feature matrix X and target vector Y.
X,Y = to_XY(bld.values[:100,:])

In [None]:

# Display the current X (the matrix returned by to_XY, assuming the cells are run in order).
X

In [None]:
# Preview the first rows of the merged frame b.
b.head()

In [None]:
# Restrict the merged frame to the rows of building 15 and summarise its columns.
b_15 = b.loc[b['building_id'] == 15]
b_15.describe()

In [None]:
# Feature matrix for building 15: two temperatures plus the two calendar flags.
# The selected columns mix float and boolean dtypes, for which `.values` can
# fall back to an object-dtype array; cast explicitly so X is a float matrix.
X = b_15[['air_temperature', 'dew_temperature', 'workhour', 'workday']].values.astype(float)

# Target: Y multiplied by 1000.
y = 1000*b_15['Y'].values

# Target against air temperature (first feature column).
plt.scatter(X[:,0],y)
plt.xlabel('air_temperature')
plt.ylabel('1000 * Y')
plt.show()

In [None]:
# Small multilayer perceptron with a single hidden layer of 6 units.
# random_state is fixed so that weight initialisation, and therefore the
# fitted model and its score, are reproducible across runs.
# NOTE(review): per the scikit-learn documentation, learning_rate='adaptive'
# is only used when solver='sgd'; with the default solver it has no effect.
nn = MLPRegressor(hidden_layer_sizes=(6,), learning_rate_init=0.02, alpha=0.05,
                 learning_rate='adaptive', random_state=0)
nn.fit(X,y)

# Score on the same data the model was fitted on (a training score, not a
# held-out estimate).
nn.score(X,y)

In [None]:
# Predict on the same feature matrix the network was fitted on and plot the
# predictions against the first feature column of X.
g = nn.predict(X)
plt.scatter(X[:,0],g)
plt.show()

In [None]:
# Inspect the fitted network's weight matrices (scikit-learn exposes them as the coefs_ attribute).
nn.coefs_

In [None]:
import keras