In [None]:
!pip install vecstack

from vecstack import stacking
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19863 sha256=13f7dca180e9e1264b7f48e78496af45b45a4bf8d838465db05db5da04831f89
  Stored in directory: /root/.cache/pip/wheels/b8/d8/51/3cf39adf22c522b0a91dc2208db4e9de4d2d9d171683596220
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0
Mounted at /gdrive
/gdrive


In [None]:
pd.set_option('display.max_columns',None)#displaying long list of columns
pd.set_option('display.max_rows', None)#displaying long list of rows
pd.set_option('display.width', 1000)#width of window

In [None]:
trainfile = r'/gdrive/My Drive/CIS_508/train_rev.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS_508/test_rev.csv'
testData = pd.read_csv(testfile)  #creates a dataframe


print(trainData.shape)
print(testData.shape)



(137, 43)
(100000, 42)


In [None]:
#Extract training and test data
Ytrain = trainData["revenue"]
Xtrain = trainData.drop(["revenue"], axis=1) #extracting training data without the target column
Xtest = testData.copy() #extracting training data without the target column
print(Xtrain.shape)
print(Xtest.shape)

(137, 42)
(100000, 42)


In [None]:
categoricalFeatures = ['Open Date','City','City Group','Type']

In [None]:
# OneHotEncoding on Train (fit & transform)
# OneHotEncoding is to be done on Categorical variables.
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
Xcat = pd.DataFrame(ohe.fit_transform(Xtrain[categoricalFeatures]),columns=ohe.get_feature_names_out(),index=Xtrain.index)
Xtrain = pd.concat([Xtrain,Xcat],axis=1)
Xtrain.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtrain.sample(5)

Unnamed: 0,Id,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24,P25,P26,P27,P28,P29,P30,P31,P32,P33,P34,P35,P36,P37,Open Date_01/03/2014,Open Date_01/07/2000,Open Date_01/09/2010,Open Date_01/17/2009,Open Date_01/18/2011,Open Date_01/22/2007,Open Date_01/25/2010,Open Date_01/25/2014,Open Date_01/26/2009,Open Date_01/28/1998,Open Date_02/02/2012,Open Date_02/07/2012,Open Date_02/08/2007,Open Date_02/09/2011,Open Date_02/11/2008,Open Date_02/12/2010,Open Date_02/14/2008,Open Date_02/23/2010,Open Date_02/28/2013,Open Date_03/01/2011,Open Date_03/09/2013,Open Date_03/11/2011,Open Date_03/14/1998,Open Date_03/15/2008,Open Date_03/16/2010,Open Date_03/16/2013,Open Date_03/21/2012,Open Date_03/28/2009,Open Date_03/29/2013,Open Date_03/30/2012,Open Date_04/01/2008,Open Date_04/10/1997,Open Date_04/11/2012,Open Date_04/21/2012,Open Date_04/23/2013,Open Date_05/01/2010,Open Date_05/01/2011,Open Date_05/01/2012,Open Date_05/04/2012,Open Date_05/08/1996,Open Date_05/09/2008,Open Date_05/09/2009,Open Date_05/11/2009,Open Date_05/22/2012,Open Date_05/30/2008,Open Date_06/01/2009,Open Date_06/03/2009,Open Date_06/05/1999,Open Date_06/08/2012,Open Date_06/15/2008,Open Date_06/20/2007,Open Date_06/21/2008,Open Date_06/21/2011,Open Date_06/25/2008,Open Date_07/01/2006,Open Date_07/08/2006,Open Date_07/09/2012,Open Date_07/10/2013,Open Date_07/13/1998,Open Date_07/17/1999,Open Date_07/20/2008,Open Date_07/26/2011,Open Date_08/05/2009,Open Date_08/05/2013,Open Date_08/08/2011,Open Date_08/08/2013,Open Date_08/09/2013,Open Date_08/10/2012,Open Date_08/12/1998,Open Date_08/12/2008,Open Date_08/16/2011,Open Date_08/18/2005,Open Date_08/18/2011,Open Date_08/23/2010,Open Date_08/25/2007,Open Date_08/25/2010,Open Date_08/25/2011,Open Date_08/28/2010,Open Date_08/30/2011,Open Date_09/01/2009,Open Date_09/01/2010,Open Date_09/07/2007,Open Date_09/09/2012,Open Date_09/11/2004,Open Date_09/20/2009,Open Date_09/21/2007,Open Date_09/24/2009,Open Date_09/26/2012,Open Date_09/27/2011,Open Date_09/29/2012,Open Date_10/04/2011,Open Date_10/09/1999,Open Date_10/09/2009,Open Date_10/11/2010,Open Date_10/12/2006,Open Date_10/13/2004,Open Date_10/13/2006,Open Date_10/14/2011,Open Date_10/14/2012,Open Date_10/15/2005,Open Date_10/16/2009,Open Date_10/16/2010,Open Date_10/25/2013,Open Date_10/29/2010,Open Date_10/29/2011,Open Date_11/01/2002,Open Date_11/05/2011,Open Date_11/06/2002,Open Date_11/08/2009,Open Date_11/08/2011,Open Date_11/12/2013,Open Date_11/13/2004,Open Date_11/15/2010,Open Date_11/16/2011,Open Date_11/25/2008,Open Date_11/27/2006,Open Date_11/27/2011,Open Date_12/01/2007,Open Date_12/01/2008,Open Date_12/01/2011,Open Date_12/06/2006,Open Date_12/06/2008,Open Date_12/06/2011,Open Date_12/09/2006,Open Date_12/16/2005,Open Date_12/18/1999,Open Date_12/21/2011,Open Date_12/21/2013,Open Date_12/23/2009,Open Date_12/23/2011,Open Date_12/25/2009,Open Date_12/27/2005,Open Date_12/29/2011,Open Date_12/31/2012,City_Adana,City_Afyonkarahisar,City_Amasya,City_Ankara,City_Antalya,City_Aydın,City_Balıkesir,City_Bolu,City_Bursa,City_Denizli,City_Diyarbakır,City_Edirne,City_Elazığ,City_Eskişehir,City_Gaziantep,City_Isparta,City_Karabük,City_Kastamonu,City_Kayseri,City_Kocaeli,City_Konya,City_Kütahya,City_Kırklareli,City_Muğla,City_Osmaniye,City_Sakarya,City_Samsun,City_Tekirdağ,City_Tokat,City_Trabzon,City_Uşak,City_İstanbul,City_İzmir,City_Şanlıurfa,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL
77,77,3,5.0,3.0,5.0,3,4,5,4,4,4,5,4,4.0,0,0,0,0,0,5,5,1,3,1,0,0,0.0,0.0,3.0,2.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
108,108,1,2.0,5.0,4.0,1,2,1,5,4,4,2,4,3.0,0,0,0,0,0,1,1,1,1,1,0,0,0.0,0.0,2.0,3.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
129,129,4,5.0,5.0,4.0,2,2,5,4,5,5,3,5,5.0,0,0,0,0,0,5,5,4,5,5,0,0,0.0,0.0,4.0,1.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7,7,4,5.0,4.0,5.0,2,3,5,4,4,4,4,3,4.0,0,0,0,0,0,3,5,2,4,2,0,0,0.0,0.0,3.0,2.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
78,78,9,7.5,6.0,7.5,8,6,10,10,8,8,4,6,6.0,3,2,9,3,12,25,12,6,2,10,4,4,5.0,2.5,7.5,2.5,20,9,20,4,18,12,12,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# OneHotEncoding on Test (only transform)
# OneHotEncoding is to be done on Categorical variables.
Xcat = pd.DataFrame(ohe.transform(Xtest[categoricalFeatures]),columns=ohe.get_feature_names_out(),index=Xtest.index)
Xtest = pd.concat([Xtest,Xcat],axis=1)
Xtest.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtest.sample(5)

Unnamed: 0,Id,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24,P25,P26,P27,P28,P29,P30,P31,P32,P33,P34,P35,P36,P37,Open Date_01/03/2014,Open Date_01/07/2000,Open Date_01/09/2010,Open Date_01/17/2009,Open Date_01/18/2011,Open Date_01/22/2007,Open Date_01/25/2010,Open Date_01/25/2014,Open Date_01/26/2009,Open Date_01/28/1998,Open Date_02/02/2012,Open Date_02/07/2012,Open Date_02/08/2007,Open Date_02/09/2011,Open Date_02/11/2008,Open Date_02/12/2010,Open Date_02/14/2008,Open Date_02/23/2010,Open Date_02/28/2013,Open Date_03/01/2011,Open Date_03/09/2013,Open Date_03/11/2011,Open Date_03/14/1998,Open Date_03/15/2008,Open Date_03/16/2010,Open Date_03/16/2013,Open Date_03/21/2012,Open Date_03/28/2009,Open Date_03/29/2013,Open Date_03/30/2012,Open Date_04/01/2008,Open Date_04/10/1997,Open Date_04/11/2012,Open Date_04/21/2012,Open Date_04/23/2013,Open Date_05/01/2010,Open Date_05/01/2011,Open Date_05/01/2012,Open Date_05/04/2012,Open Date_05/08/1996,Open Date_05/09/2008,Open Date_05/09/2009,Open Date_05/11/2009,Open Date_05/22/2012,Open Date_05/30/2008,Open Date_06/01/2009,Open Date_06/03/2009,Open Date_06/05/1999,Open Date_06/08/2012,Open Date_06/15/2008,Open Date_06/20/2007,Open Date_06/21/2008,Open Date_06/21/2011,Open Date_06/25/2008,Open Date_07/01/2006,Open Date_07/08/2006,Open Date_07/09/2012,Open Date_07/10/2013,Open Date_07/13/1998,Open Date_07/17/1999,Open Date_07/20/2008,Open Date_07/26/2011,Open Date_08/05/2009,Open Date_08/05/2013,Open Date_08/08/2011,Open Date_08/08/2013,Open Date_08/09/2013,Open Date_08/10/2012,Open Date_08/12/1998,Open Date_08/12/2008,Open Date_08/16/2011,Open Date_08/18/2005,Open Date_08/18/2011,Open Date_08/23/2010,Open Date_08/25/2007,Open Date_08/25/2010,Open Date_08/25/2011,Open Date_08/28/2010,Open Date_08/30/2011,Open Date_09/01/2009,Open Date_09/01/2010,Open Date_09/07/2007,Open Date_09/09/2012,Open Date_09/11/2004,Open Date_09/20/2009,Open Date_09/21/2007,Open Date_09/24/2009,Open Date_09/26/2012,Open Date_09/27/2011,Open Date_09/29/2012,Open Date_10/04/2011,Open Date_10/09/1999,Open Date_10/09/2009,Open Date_10/11/2010,Open Date_10/12/2006,Open Date_10/13/2004,Open Date_10/13/2006,Open Date_10/14/2011,Open Date_10/14/2012,Open Date_10/15/2005,Open Date_10/16/2009,Open Date_10/16/2010,Open Date_10/25/2013,Open Date_10/29/2010,Open Date_10/29/2011,Open Date_11/01/2002,Open Date_11/05/2011,Open Date_11/06/2002,Open Date_11/08/2009,Open Date_11/08/2011,Open Date_11/12/2013,Open Date_11/13/2004,Open Date_11/15/2010,Open Date_11/16/2011,Open Date_11/25/2008,Open Date_11/27/2006,Open Date_11/27/2011,Open Date_12/01/2007,Open Date_12/01/2008,Open Date_12/01/2011,Open Date_12/06/2006,Open Date_12/06/2008,Open Date_12/06/2011,Open Date_12/09/2006,Open Date_12/16/2005,Open Date_12/18/1999,Open Date_12/21/2011,Open Date_12/21/2013,Open Date_12/23/2009,Open Date_12/23/2011,Open Date_12/25/2009,Open Date_12/27/2005,Open Date_12/29/2011,Open Date_12/31/2012,City_Adana,City_Afyonkarahisar,City_Amasya,City_Ankara,City_Antalya,City_Aydın,City_Balıkesir,City_Bolu,City_Bursa,City_Denizli,City_Diyarbakır,City_Edirne,City_Elazığ,City_Eskişehir,City_Gaziantep,City_Isparta,City_Karabük,City_Kastamonu,City_Kayseri,City_Kocaeli,City_Konya,City_Kütahya,City_Kırklareli,City_Muğla,City_Osmaniye,City_Sakarya,City_Samsun,City_Tekirdağ,City_Tokat,City_Trabzon,City_Uşak,City_İstanbul,City_İzmir,City_Şanlıurfa,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL
71984,71984,3,3.0,4.0,4.0,2,3,5,4,4,5,1,4,5.0,0,0,0,0,0,2,5,2,1,2,0,0,0.0,0.0,1.0,3.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3661,3661,5,5.0,3.0,4.0,2,2,5,5,5,5,4,4,5.0,0,0,4,0,0,3,5,1,3,2,0,0,0.0,4.0,3.0,2.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
13703,13703,2,3.0,4.0,4.0,2,2,5,5,5,5,3,5,5.0,2,0,0,4,0,1,1,2,4,1,2,0,1.0,0.0,1.0,3.0,5,0,0,2,0,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4269,4269,4,4.0,3.0,4.0,1,2,5,5,5,5,2,5,5.0,0,0,0,0,0,5,1,2,1,1,0,0,0.0,0.0,2.0,3.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
18811,18811,3,4.0,4.0,3.0,1,3,5,5,5,4,2,5,4.0,3,2,2,4,3,2,5,3,2,1,4,5,5.0,4.0,4.0,3.0,5,5,5,5,4,0,4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [None]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          137 non-null    int64  
 1   Open Date   137 non-null    object 
 2   City        137 non-null    object 
 3   City Group  137 non-null    object 
 4   Type        137 non-null    object 
 5   P1          137 non-null    int64  
 6   P2          137 non-null    float64
 7   P3          137 non-null    float64
 8   P4          137 non-null    float64
 9   P5          137 non-null    int64  
 10  P6          137 non-null    int64  
 11  P7          137 non-null    int64  
 12  P8          137 non-null    int64  
 13  P9          137 non-null    int64  
 14  P10         137 non-null    int64  
 15  P11         137 non-null    int64  
 16  P12         137 non-null    int64  
 17  P13         137 non-null    float64
 18  P14         137 non-null    int64  
 19  P15         137 non-null    i

In [None]:
print(Xtrain.shape)
print(Xtest.shape)

(137, 211)
(100000, 211)


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(Xtrain, Ytrain, test_size=0.7, random_state=1)
clf = DecisionTreeRegressor()
clf.fit(X_train, Y_train)
clf_predict_Train=clf.predict(X_train)

#clf.feature_importances_
mean_squared_error(Y_train,clf_predict_Train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(Y_train,clf_predict_Train)))
clf_predict_Test=clf.predict(X_test)
mean_squared_error(Y_test,clf_predict_Test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(Y_test,clf_predict_Test)))

RMSE (training) for Decision Tree:  0.000000
RMSE (Test Data) for Decision Tree:8673299346556.073242


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

# Hyperparameter tuning for decision tree classifier
parameters = {'min_samples_split': range(10, 50, 10), 'max_depth': range(1, 20, 2)}
clf_random = RandomizedSearchCV(DecisionTreeClassifier(random_state=1), parameters, n_iter=5)
clf_random.fit(Xtrain, Ytrain)
grid_parm = clf_random.best_params_
print(grid_parm)

# Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
clf = DecisionTreeClassifier(**grid_parm)
clf.fit(X_train, Y_train)
clf_predict = clf.predict(X_test)

# Obtain accuracy, confusion matrix, classification report, and AUC values for the result above.
print("Accuracy Score (testing) after hyperparameter tuning for Decision Tree: {:.6f}".format(clf.score(X_test, Y_test)))
print("Confusion Matrix after hyperparameter tuning for Decision Tree")
print(confusion_matrix(Y_test, clf_predict))
print("=== Classification Report ===")
print(classification_report(Y_test, clf_predict))

# Get cross-validation report
clf_cv_score = cross_val_score(clf, Xtrain, Ytrain, cv=5, scoring="roc_auc")
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Decision Tree: ", clf_cv_score.mean())

ValueError: ignored

In [None]:
#Save predictions
df_DT=pd.DataFrame()
df_DT['revenue']=clf_predict_Test
export_csv = df_DT.to_csv(r'/gdrive/My Drive/CIS_508/revenue_dt.csv')
df_DT.tail()

Unnamed: 0,revenue
91,5444227.0
92,1619683.0
93,3351383.0
94,6412623.0
95,4250553.0


In [None]:
#Random Forest Regressor==============================================================================
#=================================================================================================

rfc = RandomForestRegressor()
rfc.fit(X_train, Y_train)
rfc_predict_Train=rfc.predict(X_train)

mean_squared_error(Y_train,rfc_predict_Train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(Y_train,rfc_predict_Train)))
rfc_predict_Test=rfc.predict(X_test)
mean_squared_error(Y_test,rfc_predict_Test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(Y_test,rfc_predict_Test)))



RMSE (training) for Decision Tree:314343159904.605774
RMSE (Test Data) for Decision Tree:8364027659845.656250


In [None]:
#Save predictions
df_rfc=pd.DataFrame()
df_rfc['revenue']=rfc_predict_Test
export_csv = df_rfc.to_csv(r'/gdrive/My Drive/CIS_508/RF_Test.csv')
df_rfc.tail()

Unnamed: 0,revenue
91,4250396.09
92,3183805.56
93,3038739.8
94,6242686.88
95,4529271.33


In [None]:
#Gradient Boosting Regressor================================================================================

abc =GradientBoostingRegressor()
abc.fit(X_train, Y_train)
abc_predict_Train=abc.predict(X_train)

mean_squared_error(Y_train,abc_predict_Train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(Y_train,abc_predict_Train)))
abc_predict_Test=rfc.predict(X_test)
mean_squared_error(Y_test,abc_predict_Test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(Y_test,abc_predict_Test)))




RMSE (training) for Decision Tree:11710005010.038824
RMSE (Test Data) for Decision Tree:8364027659845.656250


In [None]:
#Save predictions
df_abc=pd.DataFrame()
df_abc['SalePrice']=abc_predict_Test
export_csv = df_abc.to_csv(r'/gdrive/My Drive/CIS_508/GB_Test.csv')
df_abc.tail()

Unnamed: 0,SalePrice
91,4250396.09
92,3183805.56
93,3038739.8
94,6242686.88
95,4529271.33


In [None]:
#STACKING MODELS =====================================================================
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

models = [ GradientBoostingRegressor(), RandomForestRegressor(), DecisionTreeRegressor() ]

S_Train, S_Test = stacking(models,
                           X_train, Y_train, X_test,
                           regression=True,

                           mode='oof_pred_bag',

                           needs_proba=False,

                           save_dir=None,

                           shuffle=True,

                           n_folds=4,

                           random_state=0,

                           verbose=2)



___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [GradientBoostingRegressor]
    fold  0:  [1624565.84171260]
    fold  1:  [1781945.72472926]
    fold  2:  [1699979.38630779]
    fold  3:  [1386047.72854547]
    ----
    MEAN:     [1623134.67032378] + [147765.29327736]
    FULL:     [1623134.67032378]

model  1:     [RandomForestRegressor]
    fold  0:  [1470128.36411765]
    fold  1:  [1927939.60882353]
    fold  2:  [1912497.86764706]
    fold  3:  [1267347.28823529]
    ----
    MEAN:     [1644478.28220588] + [284960.74397122]
    FULL:     [1644478.28220588]

model  2:     [DecisionTreeRegressor]
    fold  0:  [2194240.47058824]
    fold  1:  [2133684.52941176]
    fold  2:  [1885716.29411765]
    fold  3:  [1213572.00000

In [None]:
#STACKING - CONTRUCT A GRADIENT BOOSTING MODEL==============================
model = GradientBoostingRegressor()

model = model.fit(S_Train, Y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)



In [None]:
mean_squared_error(Y_train,y_pred_train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(Y_train,y_pred_train)))
mean_squared_error(Y_test,y_pred_test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(Y_test,y_pred_test)))

RMSE (training) for Decision Tree:203277593632.605225
RMSE (Test Data) for Decision Tree:7695874306671.175781


In [None]:
#Save predictions
df_stacking=pd.DataFrame()
result = pd.concat([testData['Id'], pd.DataFrame(y_pred_test, columns=['SalePrice'])], axis=1)
export_csv = result.to_csv(r'/gdrive/My Drive/CIS_508/Stacking_Test.csv')