In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
# Silence every warning category for the rest of the session
# (this also hides deprecation notices from pandas/seaborn).
warnings.simplefilter('ignore')
# Default resolution for rendered figures.
matplotlib.rcParams['figure.dpi'] = 100
# Apply seaborn's default plot styling to all subsequent matplotlib figures.
sns.set()
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
'''Function to reduce the DF size'''
# source: https://www.kaggle.com/kernels/scriptcontent/3684066/download

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    For every column:

    * ``object`` columns are converted to ``category``;
    * signed / unsigned integer columns are converted to the narrowest
      integer type of the same signedness that holds the column's min and max;
    * floating-point columns are converted to the narrowest of
      float16 / float32 / float64 whose range holds the column's min and max;
    * every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes, ...) is left untouched.

    A column is never widened: only candidates strictly smaller than the
    current dtype are considered.

    Note that the float downcast is range-based only, so float16 / float32
    columns keep fewer significant digits than the original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, tz-aware datetimes, nullable ints, ...)
        # are not plain numpy dtypes; min/max or a numpy cast may not be valid
        # for them, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            # Keep signedness: an unsigned column stays unsigned.
            if np.issubdtype(col_type, np.unsignedinteger):
                candidates = unsigned_types
            else:
                candidates = signed_types
            limits = np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates = float_types
            limits = np.finfo
        else:
            # bool, datetime64, timedelta64, complex: nothing sensible to downcast to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Stop as soon as the candidate is no smaller than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # For an empty or all-NaN column min/max are NaN, every comparison
            # is False, and the column is left unchanged.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the percentage is always a number.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
'''Function to distribution plot'''
def distplot(variable, color):
    """Draw the distribution of ``variable`` on a new 18x10 inch figure.

    Parameters
    ----------
    variable : pandas.Series
        Values to plot. Its ``name`` attribute is used for the x-axis label
        and the title.
    color : matplotlib color
        Colour passed through to ``sns.distplot``.
    """
    font_size = 16
    title_size = 20
    fig, ax = plt.subplots(1, 1, figsize=(18, 10))
    # Draw on the axes created above explicitly instead of relying on
    # pyplot's implicit "current axes".
    sns.distplot(variable, color = color, ax = ax)
    ax.set_xlabel('%s' %variable.name, fontsize = font_size)
    ax.set_ylabel('Count ', fontsize = font_size)
    # Axes.set_xticks / set_yticks expect tick positions and fail when given
    # only a font size; tick_params changes the tick-label size without
    # touching the tick locations.
    ax.tick_params(axis = 'both', labelsize = font_size)
    ax.set_title(' Distribution of '+'%s' %variable.name, fontsize = title_size)

In [2]:
# Directory that holds the dataset CSV files.
data_dir = '/home/luoyuhao/Datasets/kaggle/energy'

# Building metadata load is currently disabled:
# building = pd.read_csv(data_dir + '/building_metadata.csv')

# Read the weather table for the training period.
weather_train_path = data_dir + '/weather_train.csv'
weather_train = pd.read_csv(weather_train_path)


In [6]:
# Saving some memory
# Narrower dtype for each weather column, to save memory.
d_types = dict(
    site_id=np.int8,
    air_temperature=np.float32,
    cloud_coverage=np.float16,
    dew_temperature=np.float32,
    precip_depth_1_hr=np.float16,
    sea_level_pressure=np.float32,
    wind_direction=np.float16,
    wind_speed=np.float32,
)

# Cast column by column, in the order listed above.
for feature, narrow_type in d_types.items():
    weather_train[feature] = weather_train[feature].astype(narrow_type)

# Parse the timestamp strings into datetime values.
weather_train["timestamp"] = pd.to_datetime(weather_train["timestamp"])

# Release the wider arrays that were just replaced, then preview the result.
gc.collect();
display(weather_train.head())

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [7]:
# Rows whose air temperature reading is missing.
air_temp_missing = weather_train["air_temperature"].isna()
air_temp_non = weather_train.loc[air_temp_missing]
display(air_temp_non)


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
368,0,2016-01-16 08:00:00,,,,0.0,,180.0,1.5
369,0,2016-01-16 09:00:00,,2.0,,0.0,,190.0,1.5
370,0,2016-01-16 10:00:00,,2.0,,0.0,,190.0,1.5
15763,1,2016-10-18 13:00:00,,,,,,290.0,6.2
22349,2,2016-07-19 02:00:00,,,,86.0,,,
27393,3,2016-02-14 07:00:00,,0.0,,0.0,,,
30910,3,2016-07-09 20:00:00,,,,38.0,,,
30911,3,2016-07-09 21:00:00,,,,8.0,,,
30914,3,2016-07-10 00:00:00,,4.0,,,,,
50108,5,2016-09-17 00:00:00,,,,,,310.0,4.6


In [72]:
# Positions of the rows with a missing air temperature.
idx = np.where(weather_train["air_temperature"].isna())
print(weather_train.shape)

# One row per weather column, starting with its dtype.
summary = pd.DataFrame(weather_train.dtypes, columns=['dtypes'])

# Per-column statistics, added in the order they should appear in the table.
# (a median column was tried here and is currently disabled)
column_stats = {
    'count': weather_train.count(),
    'missing': weather_train.isnull().sum(),
    'unique': weather_train.nunique(),
    'min': weather_train.min(),
    'max': weather_train.max(),
}
for stat_name, stat_values in column_stats.items():
    summary[stat_name] = stat_values.values

summary

(139773, 9)


Unnamed: 0,dtypes,count,missing,unique,min,max
site_id,int8,139773,0,16,0,15
timestamp,datetime64[ns],139773,0,8784,2016-01-01 00:00:00,2016-12-31 23:00:00
air_temperature,float32,139718,55,619,-28.9,47.2
cloud_coverage,float16,70600,69173,10,0,9
dew_temperature,float32,139660,113,522,-35,26.1
precip_depth_1_hr,float16,89484,50289,128,-1,343
sea_level_pressure,float32,129155,10618,709,968.2,1045.5
wind_direction,float16,133505,6268,43,0,360
wind_speed,float32,139469,304,58,0,19


In [84]:
# Calendar components derived from the weather timestamps.
# `hour` and `day` are uint8 numpy arrays; the others are pandas Series.
hour = np.uint8(weather_train['timestamp'].dt.hour)
day = np.uint8(weather_train['timestamp'].dt.day)
weekday = weather_train['timestamp'].dt.weekday
# `Series.dt.weekday_name` no longer exists in current pandas releases;
# `Series.dt.day_name()` returns the same day names ("Monday", ...).
weekday_name = weather_train['timestamp'].dt.day_name()
month = weather_train['timestamp'].dt.month


In [89]:
# Replace a missing-count of 0 with the sentinel 999.
# Assign the result back to the column: calling replace(..., inplace=True) on
# the `summary['missing']` selection is a chained in-place update, which is not
# guaranteed to modify `summary` itself.
summary['missing'] = summary['missing'].replace({0: 999})

# Print the names of the two selected columns.
col = summary.loc[:, ['count', 'missing']]
for c in col.columns:
    print(c)

count
missing


In [31]:
def weather_process(dataframe,key):
    """Fill missing values of column ``key`` with the previous valid value.

    Each missing entry takes the value of the nearest earlier row (in the
    frame's current row order) whose ``key`` value is not missing. Missing
    values with no valid value before them stay missing. Only the ``key``
    column is modified; the frame is updated in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column to fill.
    key : str
        Name of the column to fill.

    Returns
    -------
    None
    """
    # NOTE(review): the fill runs over the whole column in row order, so a
    # value can be carried across any grouping the rows have (for example
    # from one site to the next) -- confirm this is intended.
    na_idx = np.flatnonzero(dataframe[key].isna().values)
    # Report how many entries are missing, as a shape tuple.
    print(na_idx.shape)
    if na_idx.size == 0:
        return

    # Forward fill works on positions, handles runs of consecutive missing
    # values, and does not depend on the index being a default RangeIndex.
    dataframe[key] = dataframe[key].ffill()

In [None]:
## BASELINE
## Feature Proc For Training Data

In [None]:
## Numeric feature processing
## year: store as an offset from 1900 so it fits in an unsigned 8-bit integer
# np.uint8 takes no `inplace` argument, so it is not passed.
# NOTE(review): if `year_built` contains missing values, the cast to uint8 is
# not well defined for them -- confirm whether they should be imputed first.
train['year_built'] = np.uint8(train['year_built'] - 1900)

## time
train['hour'] = np.uint8(train['timestamp'].dt.hour)
train['day'] = np.uint8(train['timestamp'].dt.day)
train['weekday'] = train['timestamp'].dt.weekday
train['month'] = train['timestamp'].dt.month

## For now, simply fill missing values with the column median
## (applies to both discrete and continuous numeric columns).
# Only the column names are needed, so no copy of the data is made.
to_impute_by_median = ['floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
                       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
for col in to_impute_by_median:
    # Assign back to the column: fillna(..., inplace=True) on the `train[col]`
    # selection is a chained in-place update and may leave `train` unchanged.
    train[col] = train[col].fillna(train[col].median())

In [None]:
## Categorical feature processing
from sklearn.preprocessing import LabelEncoder

# Learn the set of labels in `primary_use`, then replace each label
# with its integer code.
le = LabelEncoder()
le.fit(train['primary_use'])
train['primary_use'] = le.transform(train['primary_use'])

In [None]:
'''Setting train, test and target for model'''
# Target column for the model.
target = train_processed.loc[:, 'meter_reading']
# Model inputs: the processed frames without the target / row identifier.
train = train_processed.drop(columns=['meter_reading'])
test = test_processed.drop(columns=['row_id'])

In [None]:
'''Merging datasets'''
# Attach the building table to every row via `building_id`.
train = pd.merge(train, building, how='left', on='building_id')
test = pd.merge(test, building, how='left', on='building_id')

# Attach the weather table matching each row's site and timestamp.
weather_keys = ['site_id', 'timestamp']
train = pd.merge(train, weather_train, how='left', on=weather_keys)
test = pd.merge(test, weather_test, how='left', on=weather_keys)

# The source tables are no longer needed once merged.
del building, weather_train, weather_test

In [None]:
# Downcast the columns of `train` with the reduce_mem_usage helper defined
# earlier in this notebook, and rebind the name to the returned frame.
train = reduce_mem_usage(train)