In [13]:
import sys
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')
# Importing the dataset
df = pd.read_csv("./Data/carsPolovni.csv")

In [14]:
# Always good to set a seed for reproducibility
SEED = 7
np.random.seed(SEED)

Skewness is a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.

Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails, or outliers. Data sets with low kurtosis tend to have light tails, or lack of outliers. A uniform distribution would be the extreme case

In [15]:
from scipy.stats import skew
from scipy.stats import kurtosis
def plotBarCat(df,feature,target):
    
    
    
    x0 = df[df[target]==0][feature]
    x1 = df[df[target]==1][feature]

    trace1 = go.Histogram(
        x=x0,
        opacity=0.75
    )
    trace2 = go.Histogram(
        x=x1,
        opacity=0.75
    )

    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay',
                      title=feature,
                       yaxis=dict(title='Count'
        ))
    fig = go.Figure(data=data, layout=layout)

    #py.iplot(fig, filename='overlaid histogram')
    
    def DescribeFloatSkewKurt(df,target):
        """
            A fundamental task in many statistical analyses is to characterize
            the location and variability of a data set. A further
            characterization of the data includes skewness and kurtosis.
            Skewness is a measure of symmetry, or more precisely, the lack
            of symmetry. A distribution, or data set, is symmetric if it
            looks the same to the left and right of the center point.
            Kurtosis is a measure of whether the data are heavy-tailed
            or light-tailed relative to a normal distribution. That is,
            data sets with high kurtosis tend to have heavy tails, or
            outliers. Data sets with low kurtosis tend to have light
            tails, or lack of outliers. A uniform distribution would
            be the extreme case
        """
        print('-*-'*25)
        print("{0} mean : ".format(target), np.mean(df[target]))
        print("{0} var  : ".format(target), np.var(df[target]))
        print("{0} skew : ".format(target), skew(df[target]))
        print("{0} kurt : ".format(target), kurtosis(df[target]))
        print('-*-'*25)
    
    DescribeFloatSkewKurt(df,target)

In [16]:
df_name = df.columns

In [17]:
plotBarCat(df,df_name[0],'Cena')

-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Cena mean :  4594.449004763525
Cena var  :  12532001.141921349
Cena skew :  1.734032038622555
Cena kurt :  2.96128726661415
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


In [18]:
def OutLiersBox(df,nameOfFeature):
    
    trace0 = go.Box(
        y = df[nameOfFeature],
        name = "All Points",
        jitter = 0.3,
        pointpos = -1.8,
        boxpoints = 'all',
        marker = dict(
            color = 'rgb(7,40,89)'),
        line = dict(
            color = 'rgb(7,40,89)')
    )

    trace1 = go.Box(
        y = df[nameOfFeature],
        name = "Only Whiskers",
        boxpoints = False,
        marker = dict(
            color = 'rgb(9,56,125)'),
        line = dict(
            color = 'rgb(9,56,125)')
    )

    trace2 = go.Box(
        y = df[nameOfFeature],
        name = "Suspected Outliers",
        boxpoints = 'suspectedoutliers',
        marker = dict(
            color = 'rgb(8,81,156)',
            outliercolor = 'rgba(219, 64, 82, 0.6)',
            line = dict(
                outliercolor = 'rgba(219, 64, 82, 0.6)',
                outlierwidth = 2)),
        line = dict(
            color = 'rgb(8,81,156)')
    )

    trace3 = go.Box(
        y = df[nameOfFeature],
        name = "Whiskers and Outliers",
        boxpoints = 'outliers',
        marker = dict(
            color = 'rgb(107,174,214)'),
        line = dict(
            color = 'rgb(107,174,214)')
    )

    data = [trace0,trace1,trace2,trace3]

    layout = go.Layout(
        title = "{} Outliers".format(nameOfFeature)
    )

    fig = go.Figure(data=data,layout=layout)
    py.iplot(fig, filename = "Outliers")

In [19]:
#OutLiersBox(df,df_name[5])

In [20]:
#OutLiersBox(df,df_name[6])

In [21]:
#OutLiersBox(df,df_name[8])

In [22]:
from pandas import set_option
#from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [11]:
df = pd.get_dummies(df)

In [12]:
X =  df.drop(columns='Cena')
Y = df[df_name[1]]
X_train, X_test, y_train, y_test =train_test_split(X,Y,
                                                   test_size=0.25,
                                                   random_state=0)

In [13]:
X

Unnamed: 0,Godiste,Kilometraza,Kubikaza,Snaga,Brend_ALFA ROMEO,Brend_AUDI,Brend_BMW,Brend_CITROEN,Brend_FIAT,Brend_FORD,...,Model_STILO,Model_SUPERB,Model_TIGUAN,Model_TOUAREG,Model_TOURAN,Model_V40,Model_WAGON R+,Model_XSARA,Model_XSARA PICASSO,Model_ZAFIRA
0,2007,215000,1.9,120,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2006,222000,1.9,150,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2004,178000,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2004,156906,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2002,272000,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23507,2015,129000,2.0,150,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23508,2017,108000,2.0,190,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23509,2015,129000,2.0,190,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23510,2012,180000,2.0,170,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [14]:
Y

0         2150
1         2850
2         1850
3         1700
4         1700
         ...  
23507    13490
23508    19900
23509    15000
23510     9000
23511     6650
Name: Cena, Length: 23512, dtype: int64

In [15]:
# Spot-Check Algorithms
def GetBasedModel():
    basedModels = []
    basedModels.append(('XG'   , XGBRegressor()))
    basedModels.append(('LGBM'  , LGBMRegressor()))
    basedModels.append(('ETR'  , ExtraTreesRegressor()))
    basedModels.append(('RFR' , RandomForestRegressor()))
    basedModels.append(('ABR'   , AdaBoostRegressor()))
    basedModels.append(('GBR'  , GradientBoostingRegressor()))

    
    return basedModels

In [16]:
def BasedLine2(X_train, y_train,models, scoring):
    
    # Test options and evaluation metric
    num_folds = 10

    results = []
    names = []
    for name, model in models:
        kfold = StratifiedKFold(n_splits=num_folds, shuffle=True)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        
    return names, results

In [23]:
class PlotBoxR(object):
    
    
    def __Trace(self,nameOfFeature,value): 
    
        trace = go.Box(
            y=value,
            name = nameOfFeature,
            marker = dict(
                color = 'rgb(0, 128, 128)',
            )
        )
        return trace

    def PlotResult(self,names,results):
        
        data = []

        for i in range(len(names)):
            data.append(self.__Trace(names[i],results[i]))


        py.iplot(data)

In [18]:
models = GetBasedModel()

In [19]:
names,results = BasedLine2(X_train, y_train,models, 'neg_mean_squared_error')

XG: -652426.985571 (35870.152775)
LGBM: -646190.550203 (60394.882235)
ETR: -818736.043826 (46406.541426)
RFR: -718174.456513 (42282.169846)
ABR: -2486636.572775 (103387.763336)
GBR: -817269.517773 (40346.585224)


In [22]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [23]:
names_r,results_r = BasedLine2(X_train, y_train,models, 'neg_mean_absolute_error')

XG: -539.406513 (10.131464)
LGBM: -546.657055 (8.760990)
ETR: -612.556805 (13.116961)
RFR: -571.067183 (15.933351)
ABR: -1324.429545 (58.385977)
GBR: -624.381101 (12.795323)


In [24]:
def ScoreDataFrame(names,results):
    def floatingDecimals(f_val, dec=3):
        prc = "{:."+str(dec)+"f}" 
    
        return float(prc.format(f_val))

    scores = []
    for r in results:
        scores.append(floatingDecimals(r.mean(),4))

    scoreDataFrame = pd.DataFrame({'Model':names, 'Score': scores})
    return scoreDataFrame

In [25]:
basedLineScore = ScoreDataFrame(names_r,results_r)
basedLineScore

Unnamed: 0,Model,Score
0,XG,-539.4065
1,LGBM,-546.6571
2,ETR,-612.5568
3,RFR,-571.0672
4,ABR,-1324.4295
5,GBR,-624.3811


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


def GetScaledModel(nameOfScaler):
    
    if nameOfScaler == 'standard':
        scaler = StandardScaler()
    elif nameOfScaler =='minmax':
        scaler = MinMaxScaler()

    pipelines = []
    pipelines.append((nameOfScaler+'XG'  , Pipeline([('Scaler', scaler),('XG'  , XGBRegressor())])))
    pipelines.append((nameOfScaler+'LGBM' , Pipeline([('Scaler', scaler),('LGBM' , LGBMRegressor())])))
    pipelines.append((nameOfScaler+'ETR' , Pipeline([('Scaler', scaler),('ETR' , ExtraTreesRegressor())])))
    pipelines.append((nameOfScaler+'RFR', Pipeline([('Scaler', scaler),('RFR', RandomForestRegressor())])))
    pipelines.append((nameOfScaler+'ABR'  , Pipeline([('Scaler', scaler),('ABR'  , AdaBoostRegressor())])))
    pipelines.append((nameOfScaler+'GBR' , Pipeline([('Scaler', scaler),('GBR' , GradientBoostingRegressor())])))
    


    return pipelines 

In [27]:
models = GetScaledModel('standard')
names,results = BasedLine2(X_train, y_train,models, 'neg_mean_absolute_error')
PlotBoxR().PlotResult(names,results)


standardXG: -542.568493 (16.966903)
standardLGBM: -546.009603 (9.235461)
standardETR: -612.532282 (7.854193)
standardRFR: -572.008035 (9.377788)
standardABR: -1308.392131 (45.485922)
standardGBR: -624.757101 (15.277783)


In [28]:
scaledScoreStandard = ScoreDataFrame(names,results)
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard], axis=1)
compareModels

Unnamed: 0,Model,Score,Model.1,Score.1
0,XG,-539.4065,standardXG,-542.5685
1,LGBM,-546.6571,standardLGBM,-546.0096
2,ETR,-612.5568,standardETR,-612.5323
3,RFR,-571.0672,standardRFR,-572.008
4,ABR,-1324.4295,standardABR,-1308.3921
5,GBR,-624.3811,standardGBR,-624.7571


In [29]:
models = GetScaledModel('minmax')
names,results = BasedLine2(X_train, y_train,models,'neg_mean_absolute_error')
PlotBoxR().PlotResult(names,results)

scaledScoreMinMax = ScoreDataFrame(names,results)
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard,
                          scaledScoreMinMax], axis=1)
compareModels

minmaxXG: -542.810672 (9.343428)
minmaxLGBM: -546.514169 (12.633276)
minmaxETR: -612.252211 (9.109143)
minmaxRFR: -568.809413 (5.137727)
minmaxABR: -1340.172284 (38.724921)
minmaxGBR: -625.028089 (9.134012)


Unnamed: 0,Model,Score,Model.1,Score.1,Model.2,Score.2
0,XG,-539.4065,standardXG,-542.5685,minmaxXG,-542.8107
1,LGBM,-546.6571,standardLGBM,-546.0096,minmaxLGBM,-546.5142
2,ETR,-612.5568,standardETR,-612.5323,minmaxETR,-612.2522
3,RFR,-571.0672,standardRFR,-572.008,minmaxRFR,-568.8094
4,ABR,-1324.4295,standardABR,-1308.3921,minmaxABR,-1340.1723
5,GBR,-624.3811,standardGBR,-624.7571,minmaxGBR,-625.0281


In [30]:
df_t = df.copy()
df_t_name = df_t.columns


In [24]:
def TurkyOutliers(df_out,nameOfFeature,drop=False):

    valueOfFeature = df_out[nameOfFeature]
    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(valueOfFeature, 25.)

    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(valueOfFeature, 75.)

    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = (Q3-Q1)*1.5
    # print "Outlier step:", step
    outliers = valueOfFeature[~((valueOfFeature >= Q1 - step) & (valueOfFeature <= Q3 + step))].index.tolist()
    feature_outliers = valueOfFeature[~((valueOfFeature >= Q1 - step) & (valueOfFeature <= Q3 + step))].values
    # df[~((df[nameOfFeature] >= Q1 - step) & (df[nameOfFeature] <= Q3 + step))]


    # Remove the outliers, if any were specified
    print ("Number of outliers (inc duplicates): {} and outliers: {}".format(len(outliers), feature_outliers))
    if drop:
        good_data = df_out.drop(df_out.index[outliers]).reset_index(drop = True)
        print ("New dataset with removed outliers has {} samples with {} features each.".format(*good_data.shape))
        return good_data
    else: 
        print ("Nothing happens, df.shape = ",df_out.shape)
        return df_out

In [25]:
df_t_name

NameError: name 'df_t_name' is not defined

In [26]:
df_name[5]

'Kilometraza'

In [27]:
#feature_number = 5
#OutLiersBox(df,df_name[feature_number])

In [28]:
#feature_number = 1
#OutLiersBox(df,df_name[feature_number])

In [30]:
df_t = df
feature_number = 5
df_clean = TurkyOutliers(df_t,df_name[feature_number],True)
OutLiersBox(df_clean,df_name[feature_number])

Number of outliers (inc duplicates): 474 and outliers: [26000 74000 30000 29000 24000 24000 30000 64800 71556 68000 61000 75800
 74245 76274 44150 64000 49800 80000 82467 62500 25000 60307 81000 60000
 33618 50400 46900 72000 74555 24000 48500 24000 73000 25000 24500 78500
 72000 48746 44000 28000 29010 82000 21032 73000 72500 23500 28000 28000
 25000 26000 25000 80000 57700 22500 49900 34595 78000 73497 23100 65000
 75856 54500 73694 78895 25689 22200 27100 75000 22000 21000 26200 27234
 22000 21040 82000 72000 66000 63292 30428 22222 56000 76347 21500 23000
 25000 20200 25000 75000 76000 53000 55000 43507 29460 67000 80000 80000
 27000 56425 76000 30850 39900 25000 21452 76300 82200 30000 28500 26000
 69000 76000 51999 78000 35175 57000 56000 20980 61700 53000 75000 32500
 80000 46000 20500 75000 29000 24000 78000 45726 57784 38345 37000 75000
 77200 62777 45000 63628 69980 26735 59000 45600 58573 57000 82177 82130
 50000 75000 67000 73000 41000 77000 80580 81000 29800 56036 22000 80

In [37]:
df_name[8]

'Snaga'

In [31]:
feature_number = 8
df_clean = TurkyOutliers(df_t,df_name[feature_number],True)
OutLiersBox(df_clean,df_name[feature_number])

Number of outliers (inc duplicates): 276 and outliers: [230 200 204 200 209 200 200 200 200 209 200 200 200 200 220 200 200 200
 200 200 200 200 200 200 209 200 211 245 245 239 200 211 200 204 220 200
 220 232 200 200 200 220 234 230 204 200 200 211 200 211 211 238 211 211
 245 239 238 238 239 245 245 224 224 245 204 245 227 245 232 239 239 224
 218 232 234 242 240 231 245 224 249 239 230 232 231 204 224 239 204 224
 245 232 224 232 250 223 224 224 245 224 204 238 223 238 223 245 224 245
 224 245 245 232 224 232 245 235 239 224 234 224 211 224 231 224 223 224
 245 204 245 209 224 224 204 224 204 238 231 239 224 245 245 204 235 232
 245 204 224 232 224 205 239 245 216 239 224 211 211 239 211 211 239 239
 245 239 211 211 238 211 219 200 200 204 201 201 201 201 201 200 204 234
 199 203 200 218 231 220 250 220 220 220 220 200 226 200 211 200 200 230
 200 211 222 209 224 200 200 200 200 200 200 231 200 200 200 200 239 239
 239 239 239 239 200 201 200 200 200 224 245 224 239 224 245 224 224 

In [32]:
print('df shape: {}, new df shape: {}, we lost {} rows, {}% of our data'.format(df.shape[0],df_clean.shape[0],
                                                              df.shape[0]-df_clean.shape[0],
                                                        (df.shape[0]-df_clean.shape[0])/df.shape[0]*100))

df shape: 23512, new df shape: 23236, we lost 276 rows, 1.1738686628104797% of our data


In [33]:
df.to_csv('./Data/usedCleaned20.csv', index=False)

In [40]:
df_clean_name = df_clean.columns
X_c =  df_clean.drop(columns='Cena')
Y_c = df_clean['Cena']
X_train_c, X_test_c, y_train_c, y_test_c =train_test_split(X_c,Y_c,
                                                   test_size=0.25,
                                                   random_state=0)

In [41]:
models = GetScaledModel('minmax')
names,results = BasedLine2(X_train_c, y_train_c,models, 'neg_mean_absolute_error')
PlotBoxR().PlotResult(names,results)

scaledScoreMinMax_c = ScoreDataFrame(names,results)
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard,
                          scaledScoreMinMax,
                          scaledScoreMinMax_c], axis=1)
compareModels

minmaxXG: -530.819836 (10.459292)
minmaxLGBM: -540.030277 (15.676633)
minmaxETR: -606.277908 (10.571077)
minmaxRFR: -565.006287 (12.048479)
minmaxABR: -1342.252842 (67.479373)
minmaxGBR: -614.189355 (12.041165)


Unnamed: 0,Model,Score,Model.1,Score.1,Model.2,Score.2,Model.3,Score.3
0,XG,-539.4065,standardXG,-542.5685,minmaxXG,-542.8107,minmaxXG,-530.8198
1,LGBM,-546.6571,standardLGBM,-546.0096,minmaxLGBM,-546.5142,minmaxLGBM,-540.0303
2,ETR,-612.5568,standardETR,-612.5323,minmaxETR,-612.2522,minmaxETR,-606.2779
3,RFR,-571.0672,standardRFR,-572.008,minmaxRFR,-568.8094,minmaxRFR,-565.0063
4,ABR,-1324.4295,standardABR,-1308.3921,minmaxABR,-1340.1723,minmaxABR,-1342.2528
5,GBR,-624.3811,standardGBR,-624.7571,minmaxGBR,-625.0281,minmaxGBR,-614.1894


In [42]:
def HeatMap(df,x=True):
        correlations = df.corr()
        ## Create color map ranging between two colors
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        fig, ax = plt.subplots(figsize=(10, 10))
        fig = sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',square=True, linewidths=.5, annot=x, cbar_kws={"shrink": .75})
        fig.set_xticklabels(fig.get_xticklabels(), rotation = 90, fontsize = 10)
        fig.set_yticklabels(fig.get_yticklabels(), rotation = 0, fontsize = 10)
        plt.tight_layout()
        plt.show()

#HeatMap(df,x=True)

## Algorithm tuning

In [43]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

In [44]:
class RandomSearch(object):
    
    def __init__(self,X_train,y_train,model,hyperparameters):
        
        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        self.hyperparameters = hyperparameters
        
    def RandomSearch(self):
        # Create randomized search 10-fold cross validation and 100 iterations
        cv = 10
        clf = RandomizedSearchCV(self.model,
                                 self.hyperparameters,
                                 random_state=1,
                                 n_iter=100,
                                 cv=cv,
                                 verbose=0,
                                 n_jobs=-1,
                                 )
        # Fit randomized search
        best_model = clf.fit(self.X_train, self.y_train)
        message = (best_model.best_score_, best_model.best_params_)
        print("Best: %f using %s" % (message))

        return best_model,best_model.best_params_
    
    def BestModelPridict(self,X_test):
        
        best_model,_ = self.RandomSearch()
        pred = best_model.predict(X_test)
        return pred

In [45]:
class GridSearch(object):
    
    def __init__(self,X_train,y_train,model,hyperparameters):
        
        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        self.hyperparameters = hyperparameters
        
    def GridSearch(self):
        # Create randomized search 10-fold cross validation and 100 iterations
        cv = 10
        clf = GridSearchCV(self.model,
                                 self.hyperparameters,
                                 cv=cv,
                                 verbose=0,
                                 n_jobs=-1,
                                 )
        # Fit randomized search
        best_model = clf.fit(self.X_train, self.y_train)
        message = (best_model.best_score_, best_model.best_params_)
        print("Best: %f using %s" % (message))

        return best_model,best_model.best_params_
    
    def BestModelPridict(self,X_test):
        
        best_model,_ = self.GridSearch()
        pred = best_model.predict(X_test)
        return pred

In [46]:
# model
model = XGBRegressor()

# A parameter grid for XGBoost
hyperparameters = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [47]:
XGB_RandSearch = RandomSearch(X_train_c,y_train_c,model,hyperparameters)
Prediction_XGB = XGB_RandSearch.BestModelPridict(X_test_c)

Best: 0.948183 using {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 1.0}


In [53]:
params = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 1.0}
model = XGBRegressor(**params)

model = model.fit(X_train_c, y_train_c)
model.predict(X_test_c)
import pickle 

with open('ml_model', 'wb') as files:
    pickle.dump(model, files)

In [54]:
X.shape[1]

117

In [179]:
from sklearn.preprocessing import OneHotEncoder
df_enc = pd.read_csv("./Data/carsCleaned.csv")

df_enc = pd.get_dummies(df_enc)
X =  df_enc.drop(columns='Cena')
Y = df_enc['Cena']


X_train, X_test, y_train, y_test =train_test_split(X,Y,
                                                   test_size=0.25,
                                                   random_state=0)


In [180]:
primer = {'Brend': ['CITROEN'], 'Godiste': [2002], 'Gorivo': ['Dizel '], 'Karoserija': ['Hecbek '], 'Kilometraza': [289300], 'Kubikaza': [2.0], 'Model': ['XSARA'], 'Snaga': [90]}
print(primer)

{'Brend': ['CITROEN'], 'Godiste': [2002], 'Gorivo': ['Dizel '], 'Karoserija': ['Hecbek '], 'Kilometraza': [289300], 'Kubikaza': [2.0], 'Model': ['XSARA'], 'Snaga': [90]}


In [182]:
df1 = pd.get_dummies(pd.DataFrame(primer))
dummies_frame = df_enc
df1 = df1.reindex(columns = X_train.columns, fill_value=0)
df1

Unnamed: 0,Godiste,Kilometraza,Kubikaza,Snaga,Brend_ALFA ROMEO,Brend_AUDI,Brend_BMW,Brend_CITROEN,Brend_FIAT,Brend_FORD,...,Model_STILO,Model_SUPERB,Model_TIGUAN,Model_TOUAREG,Model_TOURAN,Model_V40,Model_WAGON R+,Model_XSARA,Model_XSARA PICASSO,Model_ZAFIRA
0,2002,289300,2.0,90,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [176]:
dummies_frame

Unnamed: 0,Godiste,Kilometraza,Kubikaza,Snaga,Brend_ALFA ROMEO,Brend_AUDI,Brend_BMW,Brend_CITROEN,Brend_FIAT,Brend_FORD,...,Model_STILO,Model_SUPERB,Model_TIGUAN,Model_TOUAREG,Model_TOURAN,Model_V40,Model_WAGON R+,Model_XSARA,Model_XSARA PICASSO,Model_ZAFIRA
6465,2009,196000,1.4,80,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
457,2004,176853,2.5,150,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12249,2005,177000,1.6,97,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18009,2002,239000,2.0,116,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19253,2001,236000,1.8,150,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,2003,178000,2.0,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19648,2007,269000,2.0,140,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9845,2004,250000,2.0,111,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10799,2005,170000,1.8,114,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
primer = df1.iloc[0].values
primer = np.array(primer).reshape((1,-1))
print(primer.shape)
print(X_train.shape)
print(X_test.shape)

(1, 118)
(17747, 118)
(5916, 118)


In [183]:
params = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 1.0}
model = XGBRegressor(**params)

model = model.fit(X_train, y_train)
#model.predict(X_test)
import pickle 

with open('ml_model', 'wb') as files:
    pickle.dump(model, files)

In [188]:
model.predict(primer)

array([1296.2065], dtype=float32)

In [90]:
(pd.DataFrame(test_data.toarray())).shape

(5916, 6046)

In [None]:
def getEncoded(test_data,labelencoder_dict,onehotencoder_dict):
    test_encoded_x = None
    for i in range(0,test_data.shape[1]):
        label_encoder =  labelencoder_dict[i]
        feature = label_encoder.transform(test_data[:,i])
        feature = feature.reshape(test_data.shape[0], 1)
        onehot_encoder = onehotencoder_dict[i]
        feature = onehot_encoder.transform(feature)
        if test_encoded_x is None:
          test_encoded_x = feature
        else:
          test_encoded_x = np.concatenate((test_encoded_x, feature), axis=1)
  return test_encoded_x

In [50]:
# model
model = LGBMRegressor()
hyperparameters = {
    'num_leaves': [31, 127],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]
    }

In [51]:
LGBM_RandSearch = RandomSearch(X_train_c,y_train_c,model,hyperparameters)
Prediction_LGBM = LGBM_RandSearch.BestModelPridict(X_test_c)

Best: 0.948821 using {'reg_alpha': 0.1, 'num_leaves': 127, 'min_data_in_leaf': 30, 'lambda_l2': 0, 'lambda_l1': 0}


In [67]:
# model
model = ExtraTreesRegressor()
hyperparameters = {
    'n_estimators': [10,50,100],
    'criterion': ['mae'],
    'max_depth': [2,8,16],
    'min_samples_split': [4,6],
    'min_samples_leaf': [1,2]
}

In [94]:
from sklearn.ensemble import RandomForestRegressor
def get_models():
    """Generate a library of base learners."""
    param = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 1.0}
    model1 = XGBRegressor(**param)

    param = {'reg_alpha': 0.1, 'num_leaves': 127, 'min_data_in_leaf': 30, 'lambda_l2': 0, 'lambda_l1': 0}
    model2 = LGBMRegressor(**param)
    
    model3 = RandomForestRegressor()

    model4 = ExtraTreesRegressor()

    models = {'XGM':model1, 'LGBM':model2, 'RF':model3,
              'ETR':model4
              }

    return models

In [72]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()

hyperparameters = {
    'n_estimators': [1000],
    'max_features': [8,10],
    'max_depth': [5,7,9],
    'subsample': [0.5],
    'learning_rate': [0.001, 0.01]
}

#GB_RandSearch = RandomSearch(X_train_c,y_train_c,model,hyperparameters)
#Prediction_GB = GB_RandSearch.BestModelPridict(X_test_c)
#params = {'subsample': 0.5, 'n_estimators': 1000, 'max_features': 10, 'max_depth': 9, 'learning_rate': 0.01}

Best: 0.949940 using {'subsample': 0.5, 'n_estimators': 1000, 'max_features': 10, 'max_depth': 9, 'learning_rate': 0.01}


In [96]:

base_learners = get_models()

meta_learner = GradientBoostingRegressor(
    n_estimators=1000,
    max_features=0.5,
    max_depth=9,
    subsample=0.5,
    learning_rate=0.01, 
    random_state=SEED
)

In [98]:
from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values())) 
sl.add_meta(meta_learner)

# Train the ensemble
sl.fit(X_train_c, y_train_c)

# Predict the test set
p_sl = sl.predict(X_test_c)


Fitting 2 layers
Processing layer-1             done | 00:01:34
Processing layer-2             done | 00:00:14
Fit complete                        | 00:01:49

Predicting 2 layers
Processing layer-1             done | 00:00:05
Processing layer-2             done | 00:00:01
Predict complete                    | 00:00:06


In [99]:
print(r2_score(y_test_c,p_sl))

0.9488980064897264
