# Reduce Memory Footprint

In [1]:
import gdown
ashrae_url = "https://drive.google.com/uc?id=1UWf5PYpIPv5TJnOE3eVZOekBiryBWobu"
output = "ashrae.zip"
gdown.download(ashrae_url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1UWf5PYpIPv5TJnOE3eVZOekBiryBWobu
To: /content/ashrae.zip
100%|██████████| 397M/397M [00:02<00:00, 164MB/s]


'ashrae.zip'

In [2]:
# Unzip ASHRAE data:
!unzip ashrae.zip
!rm -rf ashrae.zip

Archive:  ashrae.zip
  inflating: building_metadata.csv   
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: weather_test.csv        
  inflating: weather_train.csv       


In [3]:
# Load in training data:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
# Find the current data footprint of train dataframe:
train.info()
print()

# Reduce datatype precision to lower memory:
#print(np.max(train["building_id"])) # maximum number is 1448 so we can fit it in int32
#print(np.max(train["meter"])) # to uint8 (range from 1 to 3 lol)
#print(np.max(train["meter_reading"])) # leave as float64
train["building_id"] = train["building_id"].astype(np.int32)
train["meter"] = train["meter"].astype(np.uint8)

# new memory footprint:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   building_id    int64  
 1   meter          int64  
 2   timestamp      object 
 3   meter_reading  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 616.9+ MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   building_id    int32  
 1   meter          uint8  
 2   timestamp      object 
 3   meter_reading  float64
dtypes: float64(1), int32(1), object(1), uint8(1)
memory usage: 404.9+ MB


In [4]:
# Load in testing data:
test = pd.read_csv('test.csv')
# Find the current data footprint of train dataframe:
test.info()
print()

# Reduce datatype precision to lower memory:
#print(np.max(test["row_id"])) # maximum number is 41 mil so we can fit it in int32
#print(np.max(test["building_id"])) # max number is 1448 so int32
#print(np.max(test["meter"])) # to uint8 (range from 1 to 3 lol)
test["row_id"] = test["row_id"].astype(np.int32)
test["building_id"] = test["building_id"].astype(np.int32)
test["meter"] = test["meter"].astype(np.uint8)

# new memory footprint:
test.info() # reduced 1.2 GB to 676 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   row_id       int64 
 1   building_id  int64 
 2   meter        int64 
 3   timestamp    object
dtypes: int64(3), object(1)
memory usage: 1.2+ GB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   row_id       int32 
 1   building_id  int32 
 2   meter        uint8 
 3   timestamp    object
dtypes: int32(2), object(1), uint8(1)
memory usage: 676.0+ MB


In [5]:
weather_train = pd.read_csv('weather_train.csv')
# Find the current data footprint of train dataframe:
weather_train.info()
# no point in reducing 9.6 MB memory footprint...
print()
556
weather_test = pd.read_csv('weather_test.csv')
weather_test.info() # 19 MB memory
print()

meta = pd.read_csv('building_metadata.csv')
meta.info() # KB...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             139773 non-null  int64  
 1   timestamp           139773 non-null  object 
 2   air_temperature     139718 non-null  float64
 3   cloud_coverage      70600 non-null   float64
 4   dew_temperature     139660 non-null  float64
 5   precip_depth_1_hr   89484 non-null   float64
 6   sea_level_pressure  129155 non-null  float64
 7   wind_direction      133505 non-null  float64
 8   wind_speed          139469 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 9.6+ MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277243 entries, 0 to 277242
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             277243 non-null  int64  
 1   timestamp   

# Visualization

In [None]:
# TODO...
train

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.000
1,1,0,2016-01-01 00:00:00,0.000
2,2,0,2016-01-01 00:00:00,0.000
3,3,0,2016-01-01 00:00:00,0.000
4,4,0,2016-01-01 00:00:00,0.000
...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750
20216096,1445,0,2016-12-31 23:00:00,4.825
20216097,1446,0,2016-12-31 23:00:00,0.000
20216098,1447,0,2016-12-31 23:00:00,159.575


In [None]:
weather_train

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6
...,...,...,...,...,...,...,...,...,...
139768,15,2016-12-31 19:00:00,3.0,,-8.0,,,180.0,5.7
139769,15,2016-12-31 20:00:00,2.8,2.0,-8.9,,1007.4,180.0,7.7
139770,15,2016-12-31 21:00:00,2.8,,-7.2,,1007.5,180.0,5.1
139771,15,2016-12-31 22:00:00,2.2,,-6.7,,1008.0,170.0,4.6


# Load Pre-Processed Data (Cleaned Up Data)

In [6]:
import gdown
import pandas as pd
import numpy as np

ashrae_url = "https://drive.google.com/uc?id=1UlIpiR3Y6XiSWLhJGuaogrFgYsJns2FN"
output = "cleaned_ashrae.csv"
gdown.download(ashrae_url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1UlIpiR3Y6XiSWLhJGuaogrFgYsJns2FN
To: /content/cleaned_ashrae.csv
100%|██████████| 2.76G/2.76G [00:35<00:00, 77.3MB/s]


'cleaned_ashrae.csv'

In [7]:
clean_train = pd.read_csv('cleaned_ashrae.csv') # with ALL 14 features

# reduce memory:
clean_train['timestamp'] = clean_train['timestamp'].astype(np.int64)
clean_train['meter'] = clean_train['meter'].astype(np.uint8)
clean_train['site_id'] = clean_train['site_id'].astype(np.int64)
clean_train['primary_use'] = clean_train['primary_use'].astype(np.int64)

clean_train = clean_train.drop(columns='floor_count')
clean_train = clean_train.drop(columns='year_built')
clean_train # NOTE: meter_reading is the target variable so is the 14th feature

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,meter_reading
0,0.0,0,1451606400,0,0,7432.0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,0.000
1,1.0,0,1451606400,0,0,2720.0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,0.000
2,2.0,0,1451606400,0,0,5376.0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,0.000
3,3.0,0,1451606400,0,0,23685.0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,0.000
4,4.0,0,1451606400,0,0,116607.0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216095,1444.0,0,1483225200,15,1,19619.0,1.7,-1.0,-5.6,-1.0,1008.500000,180.0,8.8,8.750
20216096,1445.0,0,1483225200,15,0,4298.0,1.7,-1.0,-5.6,-1.0,1008.500000,180.0,8.8,4.825
20216097,1446.0,0,1483225200,15,1,11265.0,1.7,-1.0,-5.6,-1.0,1008.500000,180.0,8.8,0.000
20216098,1447.0,0,1483225200,15,4,29775.0,1.7,-1.0,-5.6,-1.0,1008.500000,180.0,8.8,159.575


In [8]:
clean_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   building_id         float64
 1   meter               uint8  
 2   timestamp           int64  
 3   site_id             int64  
 4   primary_use         int64  
 5   square_feet         float64
 6   air_temperature     float64
 7   cloud_coverage      float64
 8   dew_temperature     float64
 9   precip_depth_1_hr   float64
 10  sea_level_pressure  float64
 11  wind_direction      float64
 12  wind_speed          float64
 13  meter_reading       float64
dtypes: float64(10), int64(3), uint8(1)
memory usage: 2.0 GB


In [9]:
#Taking random 10% sample of the data as test sample
sample_train=clean_train.sample(frac=0.10)

In [10]:
#The new shape of the data
sample_train.shape

(2021610, 14)

In [14]:
#Spliting into features and target, so we remove the target
X = sample_train.drop('meter_reading', axis = 1)
Y = sample_train['meter_reading']

In [12]:
X.shape

(2021610, 13)

In [15]:
Y.shape

(2021610,)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [17]:
#Standardizing the features we using standard scalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [18]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 20, random_state =30)
model.fit(X_train_scaled,Y_train)

RandomForestRegressor(n_estimators=20, random_state=30)

In [22]:
Y_predict = model.predict(X_test_scaled)

In [23]:
X_test_scaled.shape

(606483, 13)

In [26]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [27]:
mse = mean_squared_error(Y_test,Y_predict)
mbe = mean_absolute_error(Y_test,Y_predict)

In [28]:
mse

3823619050.922644

In [29]:
mbe

733.8462913679524

In [82]:

def rmsle_metric(Y_test,Y_predict):
  #Convert the pandas series to array
  Y_test_array = Y_test.to_numpy()
  n = len(Y_test_array)
  value_list = []
  for i in range(n):
    pri = Y_predict[i]
    act = Y_test_array[i]
    log_pred = np.log(pri + 1)
    log_act = np.log(act + 1)
    value = (log_pred - log_act)**2
    value_list.append(value)

  msle = sum(value_list)/n
  rmsle = np.sqrt(msle)
  return rmsle


In [83]:
rmsle_value = rmsle_metric(Y_test,Y_predict)
rmsle_value

0.898940671525392