# Prep

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Directory holding the three input CSV files. Set to '.' to read them from
# the notebook's working directory instead.
DATA_DIR = 'data'

# Meter readings, per-building metadata and per-site weather observations.
df = pd.read_csv(f'{DATA_DIR}/train.csv')
building_metadata = pd.read_csv(f'{DATA_DIR}/building_metadata.csv')
weather_df = pd.read_csv(f'{DATA_DIR}/weather_train.csv')

In [None]:
# Left-join the building metadata onto every reading (key: building_id), then
# left-join the weather rows that share the reading's site_id and timestamp.
training_joined = pd.merge(df, building_metadata, on='building_id', how='left')
final_df = pd.merge(
    training_joined,
    weather_df,
    on=['site_id', 'timestamp'],
    how='left',
)

# 70/30 random split; the fixed random_state keeps the split reproducible.
train, test = train_test_split(final_df, random_state=21, test_size=0.3)

In [None]:
# Time
# Parse the timestamp column once, store it back, and derive the calendar
# features (hour of day, month, calendar date) from the parsed values.
parsed_timestamp = pd.to_datetime(train['timestamp'])
train['timestamp'] = parsed_timestamp
train['hour'] = parsed_timestamp.dt.hour
train['month'] = parsed_timestamp.dt.month
train['date'] = parsed_timestamp.dt.date

In [None]:
# NaN filling: copy selected weather columns with missing values replaced by
# a fixed sentinel, so gaps show up as a distinct level when plotted.
# Each entry maps new column -> (source column, sentinel value).
# NOTE(review): a sentinel only stands out if real readings never take that
# value - confirm this holds for air_temperature (-10) in particular.
sentinel_fills = {
    'cloud_filled': ('cloud_coverage', -10),
    'rain_filled': ('precip_depth_1_hr', -100),
    'temp_filled': ('air_temperature', -10),
    'wind_filled': ('wind_speed', -10),
}

for filled_column, (source_column, sentinel) in sentinel_fills.items():
    train[filled_column] = train[source_column].fillna(sentinel)

# NaN count

In [None]:
# Printed label -> weather column whose missing values are counted.
nan_labels = {
    'Temp NaN: ': 'air_temperature',
    'Cloud NaN: ': 'cloud_coverage',
    'Dew NaN: ': 'dew_temperature',
    'Rain NaN: ': 'precip_depth_1_hr',
    'Pressure NaN: ': 'sea_level_pressure',
    'Wind D NaN: ': 'wind_direction',
    'Wind S NaN: ': 'wind_speed',
}

# Count the missing values of every listed column for every site in a single
# grouped pass. The result has one row per site_id present in `train`
# (sorted by site_id) and one column per weather measurement.
nan_counts = (
    train[list(nan_labels.values())]
    .isna()
    .groupby(train['site_id'])
    .sum()
)

for site, site_counts in nan_counts.iterrows():
    print('site ' + str(site))
    for label, column in nan_labels.items():
        print(label, site_counts[column])
    print()  # blank line between sites

# Comparing plots by site

In [None]:
# Temp: meter reading against air temperature, one panel per site and one
# line per meter.
x_column = 'air_temperature'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # Nothing to draw for this site; the titled empty panel shows the gap.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("Meter vs temp per site-id", fontsize=44)
plt.show()



In [None]:
# Cloud coverage: meter reading against cloud coverage, one panel per site
# and one line per meter.
x_column = 'cloud_coverage'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # This site has no cloud coverage values; the titled empty panel
        # shows which site is missing.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("Meter vs Cloud per site-id", fontsize=44)
plt.show()


In [None]:
# Dew temperature: meter reading against dew temperature, one panel per site
# and one line per meter.
x_column = 'dew_temperature'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # Nothing to draw for this site; the titled empty panel shows the gap.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("dew_temperature vs meter per site-id", fontsize=44)
plt.show()

In [None]:
# Rain: meter reading against hourly precipitation depth, one panel per site
# and one line per meter.
x_column = 'precip_depth_1_hr'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # This site has no precipitation values; the titled empty panel
        # shows which site is missing.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("rain vs meter per site-id", fontsize=44)
plt.show()

In [None]:
# Pressure: meter reading against sea level pressure, one panel per site and
# one line per meter.
x_column = 'sea_level_pressure'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # This site has no pressure values; the titled empty panel shows
        # which site is missing.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("pressure vs meter per site-id", fontsize=44)
plt.show()

In [None]:
# Wind direction: meter reading against wind direction, one panel per site
# and one line per meter.
x_column = 'wind_direction'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # Nothing to draw for this site; the titled empty panel shows the gap.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("wind_direction vs meter per site-id", fontsize=44)
plt.show()

In [None]:
# Wind speed: meter reading against wind speed, one panel per site and one
# line per meter.
x_column = 'wind_speed'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(50, 10*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # Nothing to draw for this site; the titled empty panel shows the gap.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("wind_speed vs meter per site-id", fontsize=44)
plt.show()

In [None]:
# Meter reading: meter reading against calendar date, one panel per site and
# one line per meter.
x_column = 'date'

# Sort the site ids so the panels appear in a stable, readable order rather
# than in the order the sites happen to occur in the shuffled rows.
site_ids = sorted(train['site_id'].unique())
num_plots = len(site_ids)
num_cols = 2
# Size the grid from the data; ceiling division gives every site a panel
# even when the number of sites is odd.
num_rows = max(1, -(-num_plots // num_cols))

# Apply the theme before the figure is created so that every panel is built
# under the same settings.
sns.set_theme(font_scale=1.5)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(25, 5*num_rows), squeeze=False)
# squeeze=False always returns a 2-D array, so flattening works for any grid.
axes = axes.flatten()

for ax, site in zip(axes, site_ids):
    data_use = train[train['site_id'] == site]
    ax.set_title(f'{site}', fontsize=25)

    if data_use[x_column].isnull().all():
        # Nothing to draw for this site; the titled empty panel shows the gap.
        continue

    sns.lineplot(x=x_column, y='meter_reading', hue='meter', data=data_use, ax=ax)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)

# Hide grid cells that have no site assigned to them.
for ax in axes[num_plots:]:
    ax.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure title
fig.suptitle("Time vs Meter_reading per site-id", fontsize=44)
plt.show()

# Comparing plots by meter

In [None]:
# One frame per meter code (0-3): the rows of `train` whose `meter` column
# equals that code. The per-meter plotting cells read m0..m3.
m0, m1, m2, m3 = (
    train[train['meter'] == meter_code] for meter_code in (0, 1, 2, 3)
)

In [None]:
# Temp: meter reading against air temperature, one panel per meter (m0..m3)
# and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='air_temperature', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Air Temperature')

plt.tight_layout()
plt.show()

In [None]:
# Cloud: meter reading against cloud coverage, one panel per meter (m0..m3)
# and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='cloud_coverage', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Cloud coverage')

plt.tight_layout()
plt.show()

In [None]:
# dew_temperature: meter reading against dew temperature, one panel per
# meter (m0..m3) and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='dew_temperature', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Dew temperature')

plt.tight_layout()
plt.show()

In [None]:
# precip_depth_1_hr: meter reading against hourly precipitation depth, one
# panel per meter (m0..m3) and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='precip_depth_1_hr', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Precipitation')

plt.tight_layout()
plt.show()

In [None]:
# Pressure: meter reading against sea level pressure, one panel per meter
# (m0..m3) and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='sea_level_pressure', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Sea level pressure')

plt.tight_layout()
plt.show()

In [None]:
# Wind direction: meter reading against wind direction, one panel per meter
# (m0..m3) and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='wind_direction', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Wind direction')

plt.tight_layout()
plt.show()

In [None]:
# Wind speed: meter reading against wind speed, one panel per meter (m0..m3)
# and one line per site.
meter_frames = (m0, m1, m2, m3)
# A single row of panels: `axes` is a 1-D array with one Axes per meter.
fig, axes = plt.subplots(1, len(meter_frames), figsize=(50, 10))

for meter_id, (ax, meter_df) in enumerate(zip(axes, meter_frames)):
    sns.lineplot(x='wind_speed', y='meter_reading', hue='site_id', data=meter_df, ax=ax)
    ax.set_title(f'Distribution Meter {meter_id} Reading vs Wind speed')

plt.tight_layout()
plt.show()

In [None]:
# Weather columns whose pairwise correlations are drawn as a heatmap.
values = [
    'air_temperature',
    'dew_temperature',
    'precip_depth_1_hr',
    'cloud_coverage',
    'wind_speed',
    'wind_direction',
    'sea_level_pressure',
]
weather_data = train.loc[:, values]
weather_corr = weather_data.corr()
sns.heatmap(weather_corr)

In [None]:
# Corr matrix: one correlation table per site.
# Remove the columns that are excluded from the correlation tables.
matrix = train.drop(columns=['building_id', 'meter', 'timestamp',
                             'primary_use', 'date'])

# groupby visits every site_id that is present, in sorted order, so the
# number of sites is not hard-coded. site_id is constant within a group, so
# its correlations would all be NaN; it is left out of each table.
# NOTE(review): `matrix` still contains any sentinel-filled `*_filled`
# columns added to `train`; confirm they are meant to be correlated.
for site, site_rows in matrix.groupby('site_id'):
    print('SITE ' + str(site))
    print(site_rows.drop(columns='site_id').corr(), '\n')