In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#RMSLE
import timeit
import math
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [3]:
# Make predictions fit for Kaggle
# No negatives
def fit_for_kaggle(y):
    for i in range(len(y)):
        y[i] = max(0,y[i])
# Only int
    y = y.astype(int)
    return y

In [4]:
# Exports in Kaggle format
def Export_for_Kaggle(y_pred, path):
# path is the name of the file "x.csv"
    data_out = pd.DataFrame(y_pred, columns = ['Prediction'])
    data_out.index.name = 'Id'
    data_out.to_csv(path, sep = ",")
    
    return

In [5]:
#import all data
feature_data = pd.read_csv('kaggle_data/features.txt', header=None, sep="  ", names=['feature_names', 'feature_description'], engine='python')
list_feature_names = list(feature_data['feature_names'])
train_data = pd.read_csv('kaggle_data/train.csv', header=None, sep=" ", names=list_feature_names)
test_data=pd.read_csv('kaggle_data/test-val.csv',header=None,sep=" ",names=list_feature_names)
target_data = pd.read_csv('kaggle_data/train-targets.csv', sep=",")

In [6]:
#Data engineering 

# 'weekday' encoding
weekday_data = pd.get_dummies(train_data['weekday'],prefix='weekday',drop_first=True)

# 'category' encoding
category_data = pd.get_dummies(train_data['category'],prefix='category',drop_first=True)

#fusion that shit
other_data = train_data.drop(['weekday','category'],axis=1)
training_data = pd.concat([category_data,weekday_data,other_data],axis=1)

In [7]:
#lets do the same with the test data
weekday_data_test=pd.get_dummies(test_data['weekday'],prefix='weekday',drop_first=True)
category_data_test=pd.get_dummies(test_data['category'],prefix='category',drop_first=True)
other_data_test=test_data.drop(['weekday','category'],axis=1)
testing_data=pd.concat([category_data_test,weekday_data_test,other_data_test],axis=1)

#see it
#testing_data.head(5)


In [8]:
#Data standardization
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(training_data)
X_reg = scaler.transform(training_data)
y_reg = target_data['Prediction'].values

#scale that test data
X_test = scaler.transform(testing_data)

#visualize X_reg
#print X_reg

In [9]:
# Making the kfold
from sklearn import model_selection
kf = model_selection.KFold(n_splits = 10)
k_folds = kf.split(X_reg, y_reg)

In [10]:
# Setting up the Linear regression
from sklearn import linear_model
lin_reg = linear_model.LinearRegression()

In [11]:
# Predictions by Cross validation
y_pred = model_selection.cross_val_predict(lin_reg, X_reg, y_reg, cv=k_folds)
# num shares can't be negative, so 0 instead
fit_for_kaggle(y_pred)

array([2867, 2237, 4349, ..., 3674, 6793, 1897])

In [12]:
# Let's see the error
from sklearn import metrics
print("Mean squared error: %.3f" % metrics.mean_squared_error(y_reg, y_pred))

Mean squared error: 60429191.508


In [13]:
#RMSLE 
rmsle(y_reg, y_pred)

1.2177093989935257

In [14]:
# Prediction on Test set
lin_reg.fit(X_reg,y_reg)
y_test_reg = lin_reg.predict(X_test)
fit_for_kaggle(y_test_reg)
print(y_test_reg)

[ 3159.6075969   4992.74842606  1855.4661532  ...,  2680.83242045
  1919.90795272  3575.68864659]


In [15]:
#Expoeting
Export_for_Kaggle(y_test_reg, "solution_reg.csv")

In [19]:
#RIDGE
kf2 = model_selection.KFold(n_splits=5)
kf2.get_n_splits(X_reg)
folds_regr = [(tr, te) for (tr, te) in kf2.split(X_reg)]
param_grid = {'alpha': np.logspace(-3, 3, 6)}
#regr_ridge_opt = model_selection.GridSearchCV(linear_model.Ridge(), param_grid, cv=folds_regr, scoring='neg_mean_squared_log_error')
regr_ridge_opt = model_selection.GridSearchCV(linear_model.Ridge(), param_grid)
regr_ridge_opt.fit(X_reg, y_reg)
ypred_ridge_opt = regr_ridge_opt.predict(X_reg)
fit_for_kaggle(ypred_ridge_opt)
rmsle(y_reg, ypred_ridge_opt)
#print('param=', regr_ridge_opt.best_params_, 'RMSE=', np.sqrt(-1*regr_ridge_opt.best_score_))

1.055437278734848

In [20]:
#Ridge for test
y_test_ridge = regr_ridge_opt.predict(X_test)
y_test_ridge = fit_for_kaggle(y_test_ridge)
Export_for_Kaggle(y_test_ridge, "solution_ridge.csv")

In [22]:
# Lasso
regr_lasso = model_selection.GridSearchCV(linear_model.Lasso(), param_grid, cv=folds_regr)
regr_lasso.fit(X_reg, y_reg)
ypred_lasso = regr_lasso.predict(X_test)
ypred_lasso = fit_for_kaggle(ypred_lasso)
Export_for_Kaggle(ypred_lasso, "solution_lasso.csv")

In [32]:
#tSNE
from sklearn import manifold
#let's make a copy of X_reg
X_reg_1 = X_reg
#let's declare it
t_SNE = manifold.TSNE(n_components = 3)
X_reg_tsne = t_SNE.fit_transform(X_reg_1, y_reg)
X_test_tsne = t_SNE.fit_transform(X_test)

KeyboardInterrupt: 

In [71]:
#PCA
from sklearn import decomposition
ACP = decomposition.PCA(n_components = 40)
ACP.fit(X_reg)
X_test_acp = ACP.transform(X_test)
X_reg_acp = ACP.transform(X_reg)


In [28]:
#KNN
#n_neighbors = 5
#from sklearn import neighbors
#KNN = neighbors.KNeighborsRegressor(n_neighbors = n_neighbors)
#KNN.fit(X_reg_tsne, y_reg)
#y_reg_knn = KNN.predict(X_reg_tsne)
#y_reg_knn = fit_for_kaggle(y_reg_knn)
#print rmsle(y_reg, y_reg_knn)

0.868354510229


In [30]:
#KNN for test
#y_test_knn = KNN.predict(X_test_tsne)
#y_test_knn = fit_for_kaggle(y_test_knn)
#Export_for_Kaggle(y_test_knn, "solution_knn-tsne.csv")


In [79]:
#KNN with acp
KNN = neighbors.KNeighborsRegressor(n_neighbors = 3)
KNN.fit(X_reg_acp, y_reg)
y_reg_knn_acp = KNN.predict(X_reg_acp)
y_reg_knn_acp = fit_for_kaggle(y_reg_knn_acp)
print rmsle(y_reg, y_reg_knn_acp)
y_test_knn_acp = KNN.predict(X_test_acp)
y_test_knn_acp = fit_for_kaggle(y_test_knn_acp)
Export_for_Kaggle(y_test_knn_acp, "solution_knn-acp.csv")

0.725257608539
