In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing
# import seaborn as sns
%matplobtlib inline
import xgboost as xgb

### Read the data and check the shapes

In [138]:
# Load both competition files in one step: the training set (features + target)
# first, then the test set (features only).
train_df, x_test_df = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')

# Sanity-check the row/column counts of each frame.
print('training data shape: ', train_df.shape)
print('test data shape: ', x_test_df.shape)

('training data shape: ', (30471, 292))
('test data shape: ', (7662, 291))


In [139]:
# Split the regression target from the feature columns.
# `.values` replaces the deprecated `as_matrix()` and returns the same ndarray.
y_train = train_df['price_doc'].values
x_train_df = train_df.drop('price_doc', axis=1)
print('x_train_df shape: ',x_train_df.shape)
print('x_test_df shape: ',x_test_df.shape)

# One-hot encode train and test together so that both end up with the same
# dummy columns (a category seen in only one of the sets still gets a column).
# `pd.concat` replaces the deprecated `DataFrame.append`; the row order
# (train rows first, then test rows) is unchanged.
x_all_df = pd.concat([x_train_df, x_test_df])
string_columns = x_train_df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the string (object dtype) columns.
x_all_onehot_df = pd.get_dummies(x_all_df, columns=string_columns)
print('shape after onehot encoding: ', x_all_onehot_df.shape)

# Split back into train and test by position: the first n_train rows are the
# training rows, the remainder are the test rows.
n_train = x_train_df.shape[0]
x_train_onehot = x_all_onehot_df.iloc[:n_train].values
x_test_onehot = x_all_onehot_df.iloc[n_train:].values

('x_train_df shape: ', (30471, 291))
('x_test_df shape: ', (7662, 291))
('shape after onehot encoding: ', (38133, 1887))


### Train the model

In [140]:
# Baseline model: XGBRegressor constructed with no arguments (library defaults),
# fitted on the one-hot encoded training matrix against the price target.
model = xgb.XGBRegressor()
# Last expression of the cell, so the value returned by fit() is echoed as the cell output.
model.fit(x_train_onehot, y_train)

XGBRegressor(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, nthread=-1, objective='reg:linear', seed=0,
       silent=True, subsample=1)

In [141]:
# Predict on the one-hot encoded test matrix.
# Note: despite its name, y_test holds the model's predictions, not ground-truth labels.
y_test = model.predict(x_test_onehot)

### Submission 1
So we are submitting without any feature engineering or parameter optimization for XGBoost. Some columns may also have been wrongly classified as string columns and one-hot encoded.

In [142]:
# Build the submission file: one row per test record, with its id and the predicted price.
# The ids come from x_test_df, the test frame loaded above; the name `x_test`
# used previously is never defined in this notebook and raises NameError on a fresh kernel.
submission = pd.DataFrame({
    'id': x_test_df['id'],
    'price_doc': y_test
})
submission.to_csv('./data/submission_v0.12.csv',index=False)

#Ranked 836 and score was 0.32577

### Submission 2

In [169]:
# Reload the raw data so this section starts from a clean state.
train_df = pd.read_csv('./data/train.csv')
x_test_df = pd.read_csv('./data/test.csv')

print('training data shape: ', train_df.shape)
print('test data shape: ', x_test_df.shape)

# Split the regression target from the feature columns.
# `.values` replaces the deprecated `as_matrix()` and returns the same ndarray.
y_train = train_df['price_doc'].values
x_train_df = train_df.drop('price_doc', axis=1)
print('x_train_df shape: ',x_train_df.shape)
print('x_test_df shape: ',x_test_df.shape)

# Train and test are stacked (train rows first) so they can be one-hot encoded
# together and end up with matching dummy columns.
# `pd.concat` replaces the deprecated `DataFrame.append`.
x_all_df = pd.concat([x_train_df, x_test_df])
string_columns = x_train_df.select_dtypes(include=['object']).columns.tolist()

# Let us check what the string columns are.
string_columns

('training data shape: ', (30471, 292))
('test data shape: ', (7662, 291))
('x_train_df shape: ', (30471, 291))
('x_test_df shape: ', (7662, 291))


['timestamp',
 'product_type',
 'sub_area',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology']

The timestamp column is being identified as a string. Let's fix this.

In [170]:
# Create new calendar features from the timestamp string, then drop the raw
# column so it is no longer one-hot encoded as a categorical.
# Note: this cell is not re-runnable on its own. Once 'timestamp' has been
# dropped from x_all_df, the previous cell must be re-run first.
x_dt = pd.to_datetime(x_all_df['timestamp'],format='%Y-%m-%d')
x_all_df['year'] = x_dt.dt.year
x_all_df['month'] = x_dt.dt.month
x_all_df['day'] = x_dt.dt.day
x_all_df['quarter'] = x_dt.dt.quarter
x_all_df = x_all_df.drop('timestamp', axis=1)

# Recompute the string columns now that the timestamp is gone.
string_columns = x_all_df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the string (object dtype) columns.
x_all_onehot_df = pd.get_dummies(x_all_df, columns=string_columns)
print('shape after onehot encoding: ', x_all_onehot_df.shape)

# Split back into train and test by position: the first n_train rows are the
# training rows, the remainder are the test rows.
# `.values` replaces the deprecated `as_matrix()` and returns the same ndarray.
n_train = x_train_df.shape[0]
x_train_onehot = x_all_onehot_df.iloc[:n_train].values
x_test_onehot = x_all_onehot_df.iloc[n_train:].values

# Train: XGBRegressor with library defaults, as in submission 1.
model = xgb.XGBRegressor()
model.fit(x_train_onehot, y_train)

# Predict (y_test holds predictions for the test set, not ground-truth labels).
y_test = model.predict(x_test_onehot)

# Build the submission file. The ids come from x_test_df; the name `x_test`
# used previously is never defined in this notebook and raises NameError on a fresh kernel.
submission = pd.DataFrame({
    'id': x_test_df['id'],
    'price_doc': y_test
})
submission.to_csv('./data/submission_v0.2.csv',index=False)

# Result is 0.33636, which is worse than submission 1.


('shape after onehot encoding: ', (38133, 456))
