In [1]:
# Imports

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate as cv
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
# RMSLE Metric formula

NUM_FOLDS = 5

def rmsle_cv(model, X, y):
    kf = KFold(NUM_FOLDS, shuffle=True, random_state=0).get_n_splits(X)
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [3]:
# Read in the data

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

y = train.loc[:,'target'].values
train.drop(['ID','target'], axis=1, inplace=True)

# Quick/Dirty Regression Model (Baseline)

In [None]:
# # Linear Regression Model function

# lr = LinearRegression()
# rmsle_lr = rmsle_cv(lr, train, y)
# print(np.mean(rmsle_lr), np.std(rmsle_lr))
# # 1.7E15, 2.7E15

The quick dirty regression model got a root mean squared error of 1.7E15. Not only that but the algorithm was spitting out outrageous numbers suchs as negative prices or 10e10 prices. After doing a little research about this situation, two solutions was discovered. Decreasing the dimenstions or using a different model.

# Scaling Features

In [4]:
# Scaling the features

sc = StandardScaler()
y = np.log1p(y)

# Feature Selection

## PCA

Before we perform PCA, we will remove some correlated features. Research showed that it provides a better PCA transformation when correlated features are removed. The reason is because if two variables are correlated, then their variance will be twice as high, which will consider it to be a higher component then it really is. [link: https://stats.stackexchange.com/questions/50537/should-one-remove-highly-correlated-variables-before-doing-pca] 

In [5]:
# Create correlation matrix
corr_matrix = train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [6]:
# Dropping correlating columns

train.drop(train[to_drop], axis=1, inplace=True)

In [7]:
# Scaling and PCA data on correlated data

train_std_corr = sc.fit_transform(train)
# pca_corr = PCA(n_components=None)
# pca_corr.fit(train_std_corr)
# var_exp_corr = pca_corr.explained_variance_ratio_

In [None]:
# # Plotting the variances

# cum_var_exp = np.cumsum(var_exp_corr)
# plt.bar(range(1,len(var_exp_corr)+1), var_exp_corr, alpha=0.5, align='center', label='individual explained variance')
# plt.step(range(1,len(var_exp_corr)+1), cum_var_exp, where='mid', label='cumulative explained variance')
# plt.ylabel('Explained variance ratio')
# plt.xlabel('Principal components')
# plt.legend(loc='best')
# plt.show()

In [8]:
# Running PCA with 1000 components and transforming data

pca_corr = PCA(n_components=1000)
train_pca_corr = pca_corr.fit_transform(train_std_corr)

# Models

## Linear Regression

In [None]:
# # Running a Quick Linear Regression on the transformed data

# lr = LinearRegression()
# rmsle_lr = rmsle_cv(lr, train_pca_corr, y)
# print(np.mean(rmsle_lr), np.std(rmsle_lr))
# # RMLSE: 3.8 STD: 0.13

## Ridge Regression

In [None]:
# # Tuning alpha for Ridge Regression

# alpha = [3, 2, 1.5, 1, .75]

# ridge_scores = []
# for i in alpha:
#     ridge = Ridge(alpha=i,normalize=True)
#     rmsle_ridge = rmsle_cv(ridge, train_pca_corr, y)
#     ridge_scores.append((np.mean(rmsle_ridge), np.std(rmsle_ridge)))
# ridge_scores

In [None]:
# # Setting the parameters for the Ridge Regression and running the model

# ridge = Ridge(alpha=1, normalize=True)
# rmsle_ridge = rmsle_cv(ridge, train_pca_corr, y)
# print(np.mean(rmsle_ridge), np.std(rmsle_ridge))
# # Mean: 1.693, STD: 0.041

## Random Forest

In [None]:
# # Create the parameter grid based on the results of random search 
# param_grid = {'n_estimators': [200, 300],
#               'min_samples_leaf': [4, 5],
#               'min_samples_split': [6, 8]}

# # Create a based model
# rf = RandomForestRegressor()

# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = -1, verbose = 2)

# # Fitting the training data
# grid_search.fit(train_pca_corr, y)

# # Retrieving the best parameters
# grid_search.best_params_
# # min_samples_leaf: 5, min_samples_split:6, n_estimators=200

In [None]:
# rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=5, min_samples_split=6)
# rmsle_rf = rmsle_cv(rf, train_pca_corr, y)
# np.mean(rmsle_rf), np.std(rmsle_rf)
# # Mean: 1.522, STD: 0.0391

In [None]:
# temp = pd.DataFrame(train_pca_corr)
# non_target_column = list(temp.columns)

# from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=5, min_samples_split=6)
# rf.fit(temp,y)

# %matplotlib inline
# #do code to support model
# #"data" is the X dataframe and model is the SKlearn object

# feats = {}
# for feature, importance in zip(non_target_column, rf.feature_importances_):
#     feats[feature] = importance #add the name/value pair 

# import operator
# sorted_feats = sorted(feats.items(), key=operator.itemgetter(1), reverse=True)

## Ensembling

In [9]:
# Function that ensembles models (credit goes to Shivang: https://www.kaggle.com/lightsalsa/ensemble-of-lgbm-and-xgb)

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]  
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        predictions = np.column_stack([ model.predict(X) for model in self.models_ ])
        return np.mean(predictions, axis=1)

In [10]:
# Running an Ensemble

# Setting up models
rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=5, min_samples_split=6)
ridge = Ridge(alpha=1, normalize=True)

# Running the ensemble
averaged_models = AveragingModels(models = (rf, ridge))

# Printing the results of ensemble
score = rmsle_cv(averaged_models, train_pca_corr, y)
print("averaged score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

In [11]:
averaged_models = AveragingModels(models = (rf, ridge))

In [12]:
score = rmsle_cv(averaged_models, train_pca_corr, y)
print("averaged score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

averaged score: 1.5722 (0.0434)



# Preparing for Submission

In [None]:
# test_id = test['ID']
# test.drop('ID', axis=1, inplace=True)
# test.drop(test[to_drop], axis=1, inplace=True)
# test_std = sc.transform(test)
# test_pca = pca_corr.transform(test_std)

In [None]:
# sub = pd.DataFrame()
# sub['ID'] = test_id
# results = np.expm1(model.predict(test_pca).astype('float128'))
# sub['target'] = results
# sub['target'].clip(lower=0, inplace=True)
# sub.to_csv('submission.csv',index=False)

In [None]:
# test_id = test['ID']
# test.drop('ID', axis=1, inplace=True)
# test.drop(test[to_drop], axis=1, inplace=True)
# test_std = sc.transform(test)
# test_pca = pca.transform(test_std)
# sub = pd.DataFrame()
# sub['ID'] = test_id
# sub['target'] = np.expm1(alg.predict(test_pca))
# sub.to_csv('submission.csv',index=False)

# Deep Learning

In [None]:
# DL Imports

import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.layers import Input, Dropout, Dense
from keras.layers.normalization import BatchNormalization
from keras.models import Model

In [None]:
# RMSLE function for DL

def rmsle(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 

In [None]:
input_layer = Input(shape=(X_train.shape[1],))
x = input_layer
x = Dense(512, activation='linear')(x)
x = Dropout(0.1)(x)
x = Dense(512, activation='linear')(x)
x = Dropout(0.1)(x)
x = Dense(64, activation='linear')(x)
x = Dense(1, activation='linear')(x)
dl = Model(inputs=input_layer, outputs=x, name='model1')
dl.compile(optimizer=Adam(lr=0.0001), loss='mean_squared_error', metrics=['mae', rmsle])

batch_size = 16
epochs = 100

history = dl.fit(X_train, y_train, 
          epochs=epochs,
          batch_size=batch_size,
          validation_data=(X_test, y_test))

In [None]:
import matplotlib.pyplot as plt
mae = history.history['mean_absolute_error']
val_mae = history.history['val_mean_absolute_error']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(mae) + 1)
plt.plot(epochs, mae, 'bo', label='Training acc')
plt.plot(epochs, val_mae, 'b', label='Validation acc')
plt.title('Training and validation MAE')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Applying cross validation to a deep learning algorithm

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cvscores = []
for train, test in kfold.split(train_pca_corr, y):
  # create model
    input_layer = Input(shape=(train_pca_corr[train].shape[1],))
    x = input_layer
    x = Dense(512, activation='linear')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='linear')(x)
    x = Dropout(0.1)(x)
    x = Dense(64, activation='linear')(x)
    x = Dense(1, activation='linear')(x)
    model = Model(inputs=input_layer, outputs=x, name='model1')
    # Compile model
    model.compile(loss=rmsle, optimizer='adam', metrics=[rmsle])
    # Fit the model
    model.fit(train_pca_corr[train], y[train], epochs=100, batch_size=16, verbose=0)
    # evaluate the model
    scores = model.evaluate(train_pca_corr[test], y[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]))
    cvscores.append(scores[1])
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))