In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
import lightgbm as lgb

In [2]:
train_data=lgb.Dataset('train/train.csv')

In [3]:
params = {
    'num_leaves': 127,
    'objective': 'regression',
    'min_data_in_leaf': 100,
    'learning_rate': 0.015,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'metric': 'l2',
    'nthread': 32,
    'lambda_l1':0.10,
    'lambda_l2':0.10,
    'seed':1123,
    'header':True,
    'label':0
}


In [4]:
lgb_cb = lgb.cv(
    params,
    train_data,
    num_boost_round=10000,
    early_stopping_rounds=50,
    verbose_eval=50,    
)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[50]	cv_agg's l2: 4.03256 + 1.11507
[100]	cv_agg's l2: 3.69836 + 1.03841
[150]	cv_agg's l2: 3.51306 + 0.979809
[200]	cv_agg's l2: 3.39547 + 0.940566
[250]	cv_agg's l2: 3.30716 + 0.903774
[300]	cv_agg's l2: 3.24006 + 0.87337
[350]	cv_agg's l2: 3.18936 + 0.850402
[400]	cv_agg's l2: 3.1514 + 0.834578
[450]	cv_agg's l2: 3.12001 + 0.820348
[500]	cv_agg's l2: 3.0952 + 0.811641
[550]	cv_agg's l2: 3.0763 + 0.802665
[600]	cv_agg's l2: 3.06034 + 0.794587
[650]	cv_agg's l2: 3.04661 + 0.786678
[700]	cv_agg's l2: 3.03395 + 0.781297
[750]	cv_agg's l2: 3.02353 + 0.775839
[800]	cv_agg's l2: 3.01376 + 0.770007
[850]	cv_agg's l2: 3.00409 + 0.762937
[900]	cv_agg's l2: 2.99614 + 0.757055
[950]	cv_agg's l2: 2.98882 + 0.752988
[1000]	cv_agg's l2: 2.98246 + 0.748301
[1050]	cv_agg's l2: 2.97517 + 0.742293
[1100]	cv_agg's l2: 2.96793 + 0.736328
[1150]	cv_agg's l2: 2.9624 + 0.732803
[1200]	cv_agg's l2: 2.95598 + 0.726755
[1250]	cv_agg's l2: 2.95035 + 0.721473
[1300]	cv_agg's l2: 2.94534 + 0.717779
[1350]	cv_agg

In [5]:
nround = lgb_cb['l2-mean'].index(np.min(lgb_cb['l2-mean']))

In [6]:
nround

4436

In [7]:
bst=lgb.train(
    params,
    train_data,
    num_boost_round=nround,
    verbose_eval=50
)

In [8]:
test_data=pd.read_csv('train/test.csv')

In [9]:
test_data.head()

Unnamed: 0,test_id,create_hour,text,code,temperature,feels_like,pressure,humidity,visibility,wind_direction,...,petrol_station_y,supermarket_y,uptown_y,subway_station_y,bus_station_y,coffee_House_y,restaurant_y,atm_y,office_building_y,hotel_y
0,0,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,...,2.0,79.0,266.0,7.0,36.0,95.0,898.0,113.0,133.0,151.0
1,1,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,...,2.0,61.0,152.0,4.0,34.0,115.0,900.0,114.0,167.0,117.0
2,2,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,...,2.0,47.0,244.0,6.0,35.0,157.0,898.0,107.0,177.0,186.0
3,3,17,1.0,4.0,31.0,31.0,1001.0,64.0,13.4,4.0,...,2.0,61.0,152.0,4.0,34.0,115.0,900.0,114.0,167.0,117.0
4,4,17,1.0,4.0,31.0,31.0,1001.0,64.0,13.4,4.0,...,2.0,40.0,324.0,7.0,55.0,42.0,693.0,164.0,174.0,132.0


In [10]:
test_id=test_data['test_id']
test_data.drop(['test_id'],axis=1,inplace=True)

In [11]:
test_data.head()

Unnamed: 0,create_hour,text,code,temperature,feels_like,pressure,humidity,visibility,wind_direction,wind_direction_degree,...,petrol_station_y,supermarket_y,uptown_y,subway_station_y,bus_station_y,coffee_House_y,restaurant_y,atm_y,office_building_y,hotel_y
0,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,129.0,...,2.0,79.0,266.0,7.0,36.0,95.0,898.0,113.0,133.0,151.0
1,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,129.0,...,2.0,61.0,152.0,4.0,34.0,115.0,900.0,114.0,167.0,117.0
2,21,1.0,4.0,29.0,29.0,1003.0,70.0,10.0,4.0,129.0,...,2.0,47.0,244.0,6.0,35.0,157.0,898.0,107.0,177.0,186.0
3,17,1.0,4.0,31.0,31.0,1001.0,64.0,13.4,4.0,124.0,...,2.0,61.0,152.0,4.0,34.0,115.0,900.0,114.0,167.0,117.0
4,17,1.0,4.0,31.0,31.0,1001.0,64.0,13.4,4.0,124.0,...,2.0,40.0,324.0,7.0,55.0,42.0,693.0,164.0,174.0,132.0


In [12]:
ypred = bst.predict(test_data)

In [13]:
ypred

array([ 18.34927644,  10.84051368,   8.17009857, ...,   1.33679928,
         1.46260104,   1.16174553])

In [16]:
submission = pd.DataFrame({
        "test_id": test_id,
    })

In [18]:
submission['count']=ypred

In [19]:
submission.head()

Unnamed: 0,test_id,count
0,0,18.349276
1,1,10.840514
2,2,8.170099
3,3,9.257586
4,4,8.130158


In [20]:
submission.to_csv('data/submission/sub_1.csv',index=False)