In [1]:
import pandas as pd, numpy as np

In [2]:
seed = 1
np.random.seed(seed)

## Load data

In [3]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
train_df.shape, test_df.shape

((4209, 378), (4209, 377))

In [4]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


# preprocessing

In [5]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# train_input = train_df.drop("ID") First, experiment with keeping id
train_input = train_df.copy()
test_input = test_df.copy()

In [14]:
num_train =len( train_input)
num_train

4209

In [15]:
full_input = pd.concat([train_input, test_input])
full_input.shape

(8418, 378)

In [18]:
categorical_cols = full_input.select_dtypes(include=[np.object]).columns.tolist()
categorical_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [19]:
for c in categorical_cols:
    full_input[c] = LabelEncoder().fit_transform(full_input[c].values)

In [21]:
full_input.head()

Unnamed: 0,ID,X0,X1,X10,X100,X101,X102,X103,X104,X105,...,X91,X92,X93,X94,X95,X96,X97,X98,X99,y
0,0,37,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,130.81
1,6,37,21,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,88.53
2,7,24,24,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,76.26
3,9,24,21,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,80.62
4,13,24,23,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,78.02


In [22]:
train_input = full_input[:num_train]
test_input = full_input[num_train:]
test_input.drop('y', axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
train_input.shape, test_input.shape

((4209, 378), (4209, 377))

## model

In [23]:
import lightgbm as lgb

In [75]:
train_data = lgb.Dataset(train_input.drop('y', axis=1).values, 
                         label=train_input['y'].values,
                         feature_name=train_input.drop('y', axis=1).columns.tolist(), 
                         categorical_feature=categorical_cols + ['ID'])

## cv evaluation

In [26]:
from sklearn.metrics import r2_score

In [31]:
def eval_r2_score(preds, train_data):
    is_higher_better = True
    labels = train_data.get_label()
    return 'r2', r2_score(labels, preds), is_higher_better

In [60]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

In [61]:
num_round = 1000

In [63]:
cv_results = lgb.cv(params, train_data, num_round, nfold=5, early_stopping_rounds=50, 
       feval=eval_r2_score, seed=seed, verbose_eval=20, shuffle=True)

[20]	cv_agg's l2: 81.8638 + 7.73807	cv_agg's r2: 0.490331 + 0.0307801
[40]	cv_agg's l2: 71.4859 + 8.20636	cv_agg's r2: 0.555182 + 0.0361683
[60]	cv_agg's l2: 69.9293 + 8.40974	cv_agg's r2: 0.564924 + 0.0378535
[80]	cv_agg's l2: 69.551 + 8.54738	cv_agg's r2: 0.567305 + 0.0388336
[100]	cv_agg's l2: 69.4495 + 8.47671	cv_agg's r2: 0.567912 + 0.0385232
[120]	cv_agg's l2: 69.4354 + 8.47343	cv_agg's r2: 0.56801 + 0.0383954
[140]	cv_agg's l2: 69.5994 + 8.48109	cv_agg's r2: 0.566993 + 0.0383779


In [68]:
gbm = lgb.train(params, train_data, num_boost_round=num_round, verbose_eval=10)

In [77]:
kaggle_preds = gbm.predict(test_input.values)
kaggle_preds.shape

(4209,)

In [79]:
test_df['y'] = kaggle_preds
test_df[['ID', 'y']].head()

Unnamed: 0,ID,y
0,1,79.069446
1,2,95.031323
2,3,78.06537
3,4,78.209953
4,5,115.343554


In [80]:
test_df[['ID', 'y']].to_csv("../data/submission_lightgbm_benchmark.csv", index=False)