In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

In [2]:
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

test_df['primary_use'] = le.fit_transform(test_df['primary_use']).astype(np.int8)

In [4]:
#change meter_reading value for site_0 train data according to Sohier Dane from Kaggle Team
#https://www.kaggle.com/c/ashrae-energy-prediction/discussion/119261
train_df[train_df["site_id"]==0].meter_reading = train_df[train_df["site_id"]==0].meter_reading * 0.2931

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [5]:
test_df["month"] = test_df["timestamp"].apply(lambda x: int(x[5:7]))
test_df["hour"] = test_df["timestamp"].apply(lambda x: int(x[11:13]))

In [6]:
#handling missing values
train_df['floor_count'] = train_df['floor_count'].fillna(-999).astype(np.int16)
test_df['floor_count'] = test_df['floor_count'].fillna(-999).astype(np.int16)

train_df['year_built'] = train_df['year_built'].fillna(-999).astype(np.int16)
test_df['year_built'] = test_df['year_built'].fillna(-999).astype(np.int16)

train_df['age'] = train_df['age'].fillna(-999).astype(np.int16)
test_df['age'] = test_df['age'].fillna(-999).astype(np.int16)

train_df['cloud_coverage'] = train_df['cloud_coverage'].fillna(-999).astype(np.int16)
test_df['cloud_coverage'] = test_df['cloud_coverage'].fillna(-999).astype(np.int16) 

In [7]:
#del train_df["timestamp"], test_df["timestamp"]
categoricals = ["site_id", "building_id", "primary_use",  "meter",  "month", "hour", "day_of_week"]
drop_cols = ["sea_level_pressure", "wind_speed","wind_direction"]

numericals = ["square_feet", "year_built", "air_temperature", "cloud_coverage",
              "dew_temperature", 'precip_depth_1_hr', 'floor_count', 'beaufort_scale']

feat_cols = categoricals + numericals
target = np.log1p(train_df["meter_reading"])

del train_df["meter_reading"] 

train_df = train_df.drop(drop_cols, axis = 1)

In [8]:
#K-fold
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'subsample_freq': 1,
            'learning_rate': 0.3,
            'bagging_freq': 5,
            'num_leaves': 330,
            'feature_fraction': 0.9,
            'lambda_l1': 1,  
            'lambda_l2': 1
            }

folds = 5
seed = 666
shuffle = False
kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed)



Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.719392	valid_1's rmse: 0.949006
[100]	training's rmse: 0.670712	valid_1's rmse: 0.942172
[150]	training's rmse: 0.644746	valid_1's rmse: 0.94043
[200]	training's rmse: 0.626315	valid_1's rmse: 0.938185
[250]	training's rmse: 0.613943	valid_1's rmse: 0.937272
Early stopping, best iteration is:
[229]	training's rmse: 0.618802	valid_1's rmse: 0.937102
Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.701863	valid_1's rmse: 1.30742
[100]	training's rmse: 0.653135	valid_1's rmse: 1.3053
[150]	training's rmse: 0.628593	valid_1's rmse: 1.31069
Early stopping, best iteration is:
[124]	training's rmse: 0.642299	valid_1's rmse: 1.30418
Training until validation scores don't improve for 50 rounds.
[50]	training's rmse: 0.700169	valid_1's rmse: 1.02942
Early stopping, best iteration is:
[28]	training's rmse: 0.742366	valid_1's rmse: 1.01955
Training until validation scores don'

In [None]:
#Random forest


In [None]:
#lightgbm
lgbm = []
for train_index, val_index in kf.split(train_df[feat_cols], train_df['building_id']):
    train_X = train_df[feat_cols].iloc[train_index]
    val_X = train_df[feat_cols].iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(val_X, val_y, categorical_feature=categoricals)
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=(lgb_train, lgb_eval),
                early_stopping_rounds=50,
                verbose_eval = 50)
    lgbm.append(gbm)

In [9]:
del train_df #, train_X, val_X, lgb_train, lgb_eval, train_y, val_y, target
gc.collect()


340

In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 21 columns):
row_id                int32
building_id           int16
meter                 uint8
timestamp             object
site_id               uint8
primary_use           int8
square_feet           int32
year_built            int16
floor_count           int16
air_temperature       float16
cloud_coverage        int16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        uint8
wind_speed            float16
year                  int64
age                   int16
beaufort_scale        uint8
month                 int64
hour                  int64
dtypes: float16(5), int16(5), int32(2), int64(3), int8(1), object(1), uint8(4)
memory usage: 2.5+ GB


In [11]:
test_df["day_of_week"] = pd.to_datetime(test_df["timestamp"]).apply(lambda x: x.weekday())
del test_df["timestamp"]

In [None]:
# Reduce memory size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
test_df = reduce_mem_usage(test_df)

In [12]:
test_df = test_df[feat_cols]

In [13]:
#lgbm testing
i=0
result=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_df.shape[0]/50000)))):
    result.append(np.expm1(sum([lgbm.predict(test_df.iloc[i:i+step_size]) for model in lgbm])/folds))
    i+=step_size

100%|██████████| 834/834 [12:07<00:00,  1.08it/s]


In [14]:
result = np.concatenate(result)

In [15]:
sample_submission = pd.DataFrame()

In [16]:
#submission
sample_submission['meter_reading'] = result
sample_submission.loc[sample_submission['meter_reading']<0, 'meter_reading'] = 0
#sample_submission.to_csv('submission.csv', index=False)

In [17]:
sample_submission.head(10)

Unnamed: 0,meter_reading
0,1.322874
1,0.843214
2,0.3525
3,2.09704
4,1.523182
5,0.294039
6,1.146169
7,2.597727
8,124.692094
9,1.534807


In [18]:
sample_submission["row_id"] = range(0, len(sample_submission.meter_reading),1)

In [19]:
sample_submission = sample_submission[['row_id','meter_reading']]

In [20]:
len(sample_submission.row_id)

41697600

In [21]:
sample_submission.to_csv('submission.csv', index=False)

In [3]:
import pandas as pd

sample = pd.read_csv('submission.csv')

In [4]:
sample.head(10)

Unnamed: 0,row_id,meter_reading
0,0,1.322874
1,1,0.843214
2,2,0.3525
3,3,2.09704
4,4,1.523182
5,5,0.294039
6,6,1.146169
7,7,2.597727
8,8,124.692094
9,9,1.534807
