In [1]:
# Core numerical / dataframe stack used throughout the notebook.
import numpy as np
import pandas as pd
# NOTE(review): `matplotlib.pylab` is the legacy bulk namespace; `matplotlib.pyplot`
# is the conventional module behind the `plt` alias — confirm before switching.
import matplotlib.pylab as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder

# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")
import gc, math

# Show all columns when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the competition CSV files; every path below is FOLDER + file name.
FOLDER = "../input/"

# Meter readings (train) and the rows to predict (test).
train_df, test_df = (
    pd.read_csv(FOLDER + csv_name) for csv_name in ("train.csv", "test.csv")
)

# Static per-building metadata.
building_df = pd.read_csv(FOLDER + "building_metadata.csv")

# Weather observations matching the train and test periods.
train_weather_df, test_weather_df = (
    pd.read_csv(FOLDER + csv_name) for csv_name in ("weather_train.csv", "weather_test.csv")
)

In [3]:
# Report the (rows, columns) shape of the freshly loaded train and test frames.
print("train/test shape is:", train_df.shape, test_df.shape)

train/test shape is: (20216100, 4) (41697600, 4)


In [6]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [5]:
# Preview the first 30 rows of the test frame.
# NOTE(review): the saved output lists only 10 rows — presumably truncated on export; confirm.
test_df.head(30)

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00
5,5,5,0,2017-01-01 00:00:00
6,6,6,0,2017-01-01 00:00:00
7,7,7,0,2017-01-01 00:00:00
8,8,7,1,2017-01-01 00:00:00
9,9,8,0,2017-01-01 00:00:00


In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    For every column whose dtype is one of int16/int32/int64/float16/float32/float64
    the column's min and max are compared against the representable range of
    successively wider dtypes, and the column is cast to the first one that
    contains both values (bounds inclusive, so e.g. a max of 127 still fits int8).

    Only the *range* is checked. Casting floats to float16/float32 can round
    values, so callers that need exact values must verify the result themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on this same object.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: values sitting exactly on a limit are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or min/max is NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte baseline cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Parse the raw timestamp strings once so the `.dt` accessor works on every frame.
for df in [train_df, test_df, train_weather_df, test_weather_df]:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

# Derive calendar features on the meter-reading frames.
for df in [train_df, test_df]:
    ts = df['timestamp'].dt

    df['DT_M'] = ts.month.astype(np.int8)
    # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0; use the
    # ISO calendar week where available and fall back on older installations.
    week_of_year = ts.isocalendar().week if hasattr(ts, 'isocalendar') else ts.weekofyear
    df['DT_W'] = week_of_year.astype(np.int8)
    df['DT_D'] = ts.dayofyear.astype(np.int16)

    df['DT_hour'] = ts.hour.astype(np.int8)
    df['DT_day_week'] = ts.dayofweek.astype(np.int8)
    day_of_month = ts.day
    df['DT_day_month'] = day_of_month.astype(np.int8)
    # Week index within the month, i.e. ceil(day / 7): days 1-7 -> 1, 8-14 -> 2, ...
    # Integer arithmetic keeps this vectorised instead of a per-row Python call.
    df['DT_week_month'] = ((day_of_month + 6) // 7).astype(np.int8)

In [5]:
# Print the (rows, columns) shape of each frame, grouped by label.
shape_report = [
    ("train/test shape is:", (train_df, test_df)),
    ("weather train/test shape is:", (train_weather_df, test_weather_df)),
    ("building info shape is:", (building_df,)),
]
for label, frames in shape_report:
    print(label, *(frame.shape for frame in frames))

train/test shape is: (20216100, 11) (41697600, 11)
weather train/test shape is: (139773, 9) (277243, 9)
building info shape is: (1449, 6)


In [6]:
# Count distinct buildings and meter types in each split.
# `dropna=False` keeps the count identical to len(Series.unique()), which also counts NaN.
for frame, split_name in [(train_df, "training"), (test_df, "testing")]:
    print("there are", frame["building_id"].nunique(dropna=False), "unique buildings in", split_name)
    print("there are", frame["meter"].nunique(dropna=False), "unique meters in", split_name)


there are 1449 unique buildings in training
there are 4 unique meters in training
there are 1449 unique buildings in testing
there are 4 unique meters in tesing


In [7]:
# Preview the first rows of the training frame, including any DT_* columns added so far.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,DT_M,DT_W,DT_D,DT_hour,DT_day_week,DT_day_month,DT_week_month
0,0,0,2016-01-01,0.0,1,53,1,0,4,1,1
1,1,0,2016-01-01,0.0,1,53,1,0,4,1,1
2,2,0,2016-01-01,0.0,1,53,1,0,4,1,1
3,3,0,2016-01-01,0.0,1,53,1,0,4,1,1
4,4,0,2016-01-01,0.0,1,53,1,0,4,1,1


In [8]:
# Preview the first rows of the building metadata frame.
building_df.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [9]:
# Preview the first rows of the training-period weather frame.
train_weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [10]:
# building_df['primary_use'] = building_df['primary_use'].astype('category')

In [11]:
# Replace missing values with a placeholder, then store each column as a compact integer:
# column -> (fill value for NaN, target dtype).
building_fill_spec = {
    'floor_count': (0, np.int8),
    'year_built': (-999, np.int16),
}
for column_name, (fill_value, target_dtype) in building_fill_spec.items():
    building_df[column_name] = building_df[column_name].fillna(fill_value).astype(target_dtype)

# le = LabelEncoder()
# building_df['primary_use'] = building_df['primary_use'].astype(str)
# building_df['primary_use'] = le.fit_transform(building_df['primary_use']).astype(np.int8)

In [12]:
# Dtypes for which an arithmetic before/after comparison is skipped.
do_not_convert = ['category','datetime64[ns]','object']
for df in [train_df, test_df, building_df, train_weather_df, test_weather_df]:
    # Keep an untouched copy so lossy downcasts can be detected and reverted.
    original = df.copy()
    # reduce_mem_usage reassigns columns on the frame it is given and returns that frame.
    df = reduce_mem_usage(df)

    for col in list(df):
        if df[col].dtype.name in do_not_convert:
            continue
        # Compare on the largest absolute difference: a plain sum of signed
        # differences can cancel to zero and hide a lossy cast. NaN positions are
        # skipped by max(), and an all-NaN result compares False.
        if (df[col] - original[col]).abs().max() > 0:
            df[col] = original[col]
            print('Bad transformation', col)

Mem. usage decreased to 443.43 Mb (42.5% reduction)
Bad transformation meter_reading
Mem. usage decreased to 914.62 Mb (42.5% reduction)
Mem. usage decreased to  0.02 Mb (48.5% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Bad transformation air_temperature
Bad transformation dew_temperature
Bad transformation sea_level_pressure
Bad transformation wind_speed
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Bad transformation air_temperature
Bad transformation dew_temperature
Bad transformation sea_level_pressure
Bad transformation wind_speed


In [13]:
# Persist each processed frame as a pickle inside FOLDER, in the same order as before.
pickle_targets = [
    (train_df, 'train.pkl'),
    (test_df, 'test.pkl'),
    (building_df, 'building_metadata.pkl'),
    (train_weather_df, 'weather_train.pkl'),
    (test_weather_df, 'weather_test.pkl'),
]
for frame, pickle_name in pickle_targets:
    frame.to_pickle(FOLDER+pickle_name)