In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
# make_scorer is imported from the public sklearn.metrics namespace: the
# private sklearn.metrics.scorer module was deprecated and later removed.
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import label_binarize

In [20]:
# Load the raw training and test tables from the CSV files under data/.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [21]:
from preprocess import preprocess_train, preprocess, FEATURES, CATEGORICAL_FEATURES, TEST_FEATURES, CATEGORICAL_TEST_FEATURES_IDX

In [22]:
# Build the feature matrix X and the target y from the raw training frame.
# NOTE(review): the keyword is spelled `categotical_features`; presumably this
# matches the parameter name in preprocess.preprocess_train -- confirm before
# correcting the spelling here.
X, y = preprocess_train(train, categotical_features=CATEGORICAL_FEATURES, features=TEST_FEATURES)

In [23]:
# Shuffle the rows of X and y together. The fixed seed keeps the permutation,
# and every score computed from it, reproducible across kernel restarts.
X, y = shuffle(X, y, random_state=42)

In [24]:
# Display the shape of the target array as a quick sanity check.
y.shape

(8716,)

In [25]:
# Hold-out split with a fixed seed so the evaluation is reproducible.
# NOTE(review): test_size=0.75 leaves only 25% of the rows for training --
# confirm this is intended and not a swapped train/test fraction.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42)

## Feature preprocessing

In [45]:
def encode_categorical(X, cat_feat, verbose=True):
    '''
    One-hot encodes the categorical columns of X and stacks the encoded
    blocks in front of the remaining, untouched columns.

    Parameters
    ----------
    X: numpy.ndarray
        2-D feature matrix
    cat_feat: list of int
        Column indices of the categorical features
    verbose: bool, optional
        If True (default), print the shape of every encoded block

    Returns
    -------
    tweaked_X: numpy.ndarray
        Encoded categorical blocks (in the order given by cat_feat)
        followed by the non-categorical columns in their original order.
        With an empty cat_feat this is simply a copy of X.

    '''

    # Boolean mask selecting every column that is NOT categorical.
    # The builtin `bool` is used because the `np.bool` alias was deprecated
    # and later removed from NumPy.
    rest = np.ones(X.shape[1], dtype=bool)
    rest[cat_feat] = False

    X_rest = X[:, rest]

    # Encoded blocks, one per categorical column
    blocks = []

    for col_idx in cat_feat:
        column = X[:, col_idx]
        # Classes are the distinct values of the column cast to int
        # (assumes integer-coded categories -- TODO confirm).
        classes = np.unique(column).astype(int)
        # `classes` is passed by keyword: newer scikit-learn releases reject
        # it as a positional argument, older ones accept both forms.
        encoded = label_binarize(column, classes=classes)

        if verbose:
            # Single-argument call form prints the same text on Python 2 and 3
            print(encoded.shape)

        blocks.append(encoded)

    blocks.append(X_rest)

    return np.hstack(blocks)

In [47]:
# One-hot encode the categorical columns of the full training matrix;
# the call prints the shape of each encoded block.
encoded_X = encode_categorical(X, CATEGORICAL_TEST_FEATURES_IDX)

(8716, 3)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)
(8716, 1)


In [48]:
# Compare the feature count before and after encoding. The single formatted
# string prints identically under Python 2 and Python 3.
print("{} {}".format(X.shape, encoded_X.shape))

(8716, 44) (8716, 46)


## Linear Regression (Lasso)

#### CrossVal mean scores
* catboost_d10: 224.01548277848525
* catboost_d16: TODO (score not recorded)

In [54]:
# Training simple model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [55]:
# Lasso regressor created with scikit-learn's default settings.
lr = Lasso()

In [56]:
def _rmse(y_true, y_pred):
    """Square root of the mean squared error between targets and predictions."""
    return mean_squared_error(y_true, y_pred) ** .5


# Wrap the metric as a scorer and evaluate the model with 5-fold cross-validation.
scorer = make_scorer(_rmse)
scores = cross_val_score(lr, encoded_X, y, cv=5, scoring=scorer, verbose=1)

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.8s finished


In [57]:
# Average of the per-fold scores (root of the MSE on each fold).
np.mean(scores)

340.449211318938

In [36]:
# NOTE(review): `cbr` is not defined in any cell visible in this notebook
# (presumably a CatBoost regressor from a removed cell) -- define it above this
# cell so the notebook survives Restart & Run All.
y_pred = cbr.fit(X_train, y_train, cat_features=CATEGORICAL_TEST_FEATURES_IDX).predict(X_test)
# The value printed is the square root of the MSE, so it is labelled as RMSE.
print("Root mean squared error: {}".format(mean_squared_error(y_test, y_pred)**.5))

Mean squared error: 284.83019248


In [13]:
# Make submission
# Refit the model on the full (X, y) set before predicting on the test data.
# NOTE(review): `cbr` is not defined in any visible cell -- confirm where it is created.
cbr.fit(X, y, cat_features=CATEGORICAL_TEST_FEATURES_IDX)

<catboost.core._CatBoostBase at 0x7f8a7f594a50>

In [14]:
# Run the raw test frame through preprocess().
# NOTE(review): this rebinds `test`, so re-running the cell feeds the already
# preprocessed frame back into preprocess().
test = preprocess(test, CATEGORICAL_FEATURES)

In [15]:
# Feature matrix for the test set.
# NOTE(review): this rebinds `X`, which held the training features until now;
# earlier cells that read `X` will see test data if they are re-run afterwards.
X = test[TEST_FEATURES].values

In [16]:
# Predict the target for every row of the test feature matrix.
predictions = cbr.predict(X)

In [17]:
# Inspect the raw predictions.
# NOTE(review): the recorded output contains negative values -- confirm that
# negative predictions are valid for this target.
predictions

array([ 879.78794459, 1165.70020084,  279.39788828, ...,  154.47198918,
        -62.34725293,   36.55400176])

In [18]:
def make_submission(ids, predictions):
    '''
    Builds the submission table: the ids next to their predicted values.

    Parameters
    ----------
    ids: pandas.Series
        Row identifiers; its name becomes the first column name
    predictions: array-like
        One predicted value per id, in the same order as ids

    Returns
    -------
    df: pandas.DataFrame
        Frame with the ids column followed by a 'value' column

    Raises
    ------
    ValueError
        If predictions and ids differ in length

    '''
    # pd.concat(axis=1) aligns on the index, so the predictions are given the
    # index of `ids` explicitly. Without it, ids carrying a non-default index
    # would be paired with the wrong rows and padded with NaN.
    # np.asarray drops any index `predictions` may carry, so pairing is positional.
    values = pd.Series(np.asarray(predictions), index=ids.index, name='value')
    return pd.concat([ids, values], axis=1)

In [19]:
# Keep a reference to the id column of the test frame.
# NOTE(review): `l` is not read by any later visible cell and is easy to
# confuse with the digit 1 -- confirm whether it is still needed.
l = test.id

In [20]:
# Put ids and predictions side by side. The predictions are given the index of
# test.id because pd.concat(axis=1) aligns on the index; a non-default index
# on the test frame would otherwise yield misaligned, NaN-padded rows.
df = pd.concat([test.id, pd.Series(np.asarray(predictions), index=test.id.index)], axis=1)

In [21]:
# Rename the unnamed predictions column (labelled 0) to 'value'.
df = df.rename(columns={0: 'value'})

In [22]:
# Write the submission file, leaving out the DataFrame index.
df.to_csv("catboost_d10.csv", index=False)