In [9]:
import sys
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')
# Load the dataset into a DataFrame. The path is relative to the current
# working directory, so the notebook must be started from the folder that
# contains `Data/`.
df = pd.read_csv("./Data/carsPolovni.csv")

In [26]:
# Always good to set a seed for reproducibility.
# This seeds NumPy's global random state only; none of the calls visible in
# this notebook pass SEED explicitly as a `random_state` argument.
SEED = 7
np.random.seed(SEED)

Skewness is a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.

Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails, or outliers. Data sets with low kurtosis tend to have light tails, or lack of outliers. A uniform distribution would be the extreme case.

In [11]:
from scipy.stats import skew
from scipy.stats import kurtosis
def plotBarCat(df,feature,target,show=False):
    """
    Build an overlaid histogram of `feature` for the rows where `target`
    equals 0 and the rows where it equals 1, then print the mean, variance,
    skewness and kurtosis of the `target` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both `feature` and `target` columns.
    feature : str
        Column whose values are histogrammed.
    target : str
        Column used to split the rows (== 0 / == 1) and whose statistics
        are printed.
    show : bool, default False
        When True the overlaid histogram is rendered with ``py.iplot``.
        The default keeps the previous behaviour, where the figure was
        built but never displayed.

    Returns
    -------
    None
    """

    def DescribeFloatSkewKurt(df,target):
        """
            Print the mean, variance, skewness and kurtosis of df[target].

            Skewness is a measure of symmetry, or more precisely, the lack
            of symmetry. A distribution, or data set, is symmetric if it
            looks the same to the left and right of the center point.
            Kurtosis is a measure of whether the data are heavy-tailed
            or light-tailed relative to a normal distribution. That is,
            data sets with high kurtosis tend to have heavy tails, or
            outliers. Data sets with low kurtosis tend to have light
            tails, or lack of outliers. A uniform distribution would
            be the extreme case.
        """
        print('-*-'*25)
        print("{0} mean : ".format(target), np.mean(df[target]))
        print("{0} var  : ".format(target), np.var(df[target]))
        print("{0} skew : ".format(target), skew(df[target]))
        print("{0} kurt : ".format(target), kurtosis(df[target]))
        print('-*-'*25)

    # NOTE(review): the == 0 / == 1 split only selects rows for a target that
    # actually takes those values; for any other target both series are empty.
    x0 = df[df[target]==0][feature]
    x1 = df[df[target]==1][feature]

    trace1 = go.Histogram(
        x=x0,
        opacity=0.75
    )
    trace2 = go.Histogram(
        x=x1,
        opacity=0.75
    )

    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay',
                       title=feature,
                       yaxis=dict(title='Count'))
    fig = go.Figure(data=data, layout=layout)

    # Rendering is opt-in so existing calls keep printing statistics only.
    if show:
        py.iplot(fig, filename='overlaid histogram')

    DescribeFloatSkewKurt(df,target)

In [12]:
df_name = df.columns

In [13]:
plotBarCat(df,df_name[0],'Cena')

-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Cena mean :  4594.449004763525
Cena var  :  12532001.141921349
Cena skew :  1.734032038622555
Cena kurt :  2.96128726661415
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


In [14]:
def OutLiersBox(df,nameOfFeature):
    """
    Draw four side-by-side box plots of df[nameOfFeature], each using a
    different `boxpoints` mode: all points, whiskers only, suspected
    outliers, and whiskers with outliers. The figure is rendered with
    ``py.iplot``; nothing is returned.
    """
    series = df[nameOfFeature]

    # One colour per box, shared by its marker and its outline.
    navy = 'rgb(7,40,89)'
    blue = 'rgb(9,56,125)'
    mid_blue = 'rgb(8,81,156)'
    light_blue = 'rgb(107,174,214)'
    outlier_red = 'rgba(219, 64, 82, 0.6)'

    all_points = go.Box(
        y = series,
        name = "All Points",
        jitter = 0.3,
        pointpos = -1.8,
        boxpoints = 'all',
        marker = dict(color = navy),
        line = dict(color = navy)
    )

    whiskers_only = go.Box(
        y = series,
        name = "Only Whiskers",
        boxpoints = False,
        marker = dict(color = blue),
        line = dict(color = blue)
    )

    # Suspected outliers get their own highlight colour and outline width.
    suspected_marker = dict(
        color = mid_blue,
        outliercolor = outlier_red,
        line = dict(outliercolor = outlier_red, outlierwidth = 2)
    )
    suspected = go.Box(
        y = series,
        name = "Suspected Outliers",
        boxpoints = 'suspectedoutliers',
        marker = suspected_marker,
        line = dict(color = mid_blue)
    )

    whiskers_and_outliers = go.Box(
        y = series,
        name = "Whiskers and Outliers",
        boxpoints = 'outliers',
        marker = dict(color = light_blue),
        line = dict(color = light_blue)
    )

    fig = go.Figure(
        data=[all_points, whiskers_only, suspected, whiskers_and_outliers],
        layout=go.Layout(title = "{} Outliers".format(nameOfFeature))
    )
    py.iplot(fig, filename = "Outliers")

In [15]:
#OutLiersBox(df,df_name[5])

In [16]:
#OutLiersBox(df,df_name[6])

In [17]:
#OutLiersBox(df,df_name[8])

In [21]:
from pandas import set_option
#from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [37]:
df = pd.get_dummies(df)

In [38]:
# Split predictors from the target. Both sides select the target by name so
# they cannot drift apart: the previous `df[df_name[1]]` relied on a column
# position captured from the frame before one-hot encoding.
X = df.drop(columns='Cena')
Y = df['Cena']
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    test_size=0.25,
                                                    random_state=0)

In [39]:
X

Unnamed: 0,Godiste,Kilometraza,Kubikaza,Snaga,Brend_ALFA ROMEO,Brend_AUDI,Brend_BMW,Brend_CITROEN,Brend_FIAT,Brend_FORD,...,Model_STILO,Model_SUPERB,Model_TIGUAN,Model_TOUAREG,Model_TOURAN,Model_V40,Model_WAGON R+,Model_XSARA,Model_XSARA PICASSO,Model_ZAFIRA
0,2007,215000,1.9,120,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2006,222000,1.9,150,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2004,178000,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2004,156906,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2002,272000,1.9,116,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23507,2015,129000,2.0,150,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23508,2017,108000,2.0,190,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23509,2015,129000,2.0,190,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23510,2012,180000,2.0,170,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [41]:
Y

0         2150
1         2850
2         1850
3         1700
4         1700
         ...  
23507    13490
23508    19900
23509    15000
23510     9000
23511     6650
Name: Cena, Length: 23512, dtype: int64

In [42]:
# Spot-Check Algorithms
def GetBasedModel():
    """
    Return a list of (label, estimator) pairs, one freshly constructed
    regressor with default settings per label, for spot-checking.
    """
    return [
        ('XG', XGBRegressor()),
        ('LGBM', LGBMRegressor()),
        ('ETR', ExtraTreesRegressor()),
        ('RFR', RandomForestRegressor()),
        ('ABR', AdaBoostRegressor()),
        ('GBR', GradientBoostingRegressor()),
    ]

In [46]:
def BasedLine2(X_train, y_train,models, scoring, num_folds=10, random_state=None):
    """
    Cross-validate each model and print its mean and standard deviation.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed straight to ``cross_val_score``.
    models : iterable of (name, estimator)
        Estimators to evaluate, in order.
    scoring : str
        Scoring name forwarded to ``cross_val_score``.
    num_folds : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default None
        Forwarded to the shuffling ``KFold`` splitter. Pass an integer to
        get the same folds on every run.

    Returns
    -------
    (names, results) : tuple of lists
        Model names and the matching arrays of per-fold scores.
    """
    # Plain KFold is the appropriate splitter for a regression target.
    # StratifiedKFold stratifies on class labels, so it would treat every
    # distinct target value as its own class and rejects continuous targets.
    # The splitter is built once so every model uses the same configuration.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    results = []
    names = []
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    return names, results

In [57]:
class PlotBoxR(object):
    """Render one box plot per model from lists of names and score arrays."""

    def __Trace(self,nameOfFeature,value):
        """Return a single teal box trace of `value`, labelled `nameOfFeature`."""
        teal_marker = dict(
            color = 'rgb(0, 128, 128)',
        )
        return go.Box(y=value, name = nameOfFeature, marker = teal_marker)

    def PlotResult(self,names,results):
        """Plot results[i] as a box labelled names[i], for every index of `names`."""
        boxes = [self.__Trace(names[idx], results[idx]) for idx in range(len(names))]
        py.iplot(boxes)

In [44]:
models = GetBasedModel()

In [45]:
names,results = BasedLine2(X_train, y_train,models, 'neg_mean_squared_error')

XG: -652125.552293 (42612.492864)
LGBM: -655886.604479 (62788.095840)
ETR: -818608.448118 (49295.314984)
RFR: -720809.640696 (33103.399864)
ABR: -2517839.860924 (125026.472174)
GBR: -818881.321312 (26602.311308)


In [50]:
names_r,results_r = BasedLine2(X_train, y_train,models, 'r2')

XG: 0.948382 (0.002984)
LGBM: 0.948268 (0.002696)
ETR: 0.935213 (0.003377)
RFR: 0.942145 (0.004302)
ABR: 0.799278 (0.015743)
GBR: 0.934655 (0.002630)


In [51]:
def ScoreDataFrame(names,results):
    """
    Summarise cross-validation results as a two-column DataFrame.

    `results` is an iterable of objects exposing ``.mean()``; each mean is
    rounded to four decimals and stored in 'Score' next to the matching
    entry of `names` in 'Model'.
    """
    def floatingDecimals(f_val, dec=3):
        # Round by formatting to a fixed number of decimals and parsing back.
        return float("{:.{places}f}".format(f_val, places=dec))

    rounded_means = [floatingDecimals(fold_scores.mean(), 4) for fold_scores in results]

    return pd.DataFrame({'Model': names, 'Score': rounded_means})

In [52]:
basedLineScore = ScoreDataFrame(names_r,results_r)
basedLineScore

Unnamed: 0,Model,Score
0,XG,0.9484
1,LGBM,0.9483
2,ETR,0.9352
3,RFR,0.9421
4,ABR,0.7993
5,GBR,0.9347


In [53]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


def GetScaledModel(nameOfScaler):
    """
    Build one scaler + regressor Pipeline per candidate model.

    Parameters
    ----------
    nameOfScaler : str
        'standard' for StandardScaler or 'minmax' for MinMaxScaler.

    Returns
    -------
    list of (str, Pipeline)
        Each label is `nameOfScaler` followed by the model abbreviation;
        each pipeline has a 'Scaler' step followed by the regressor step.

    Raises
    ------
    ValueError
        If `nameOfScaler` is not one of the supported names. Previously
        this fell through and failed later with an unbound local variable.
    """
    if nameOfScaler == 'standard':
        scaler_cls = StandardScaler
    elif nameOfScaler == 'minmax':
        scaler_cls = MinMaxScaler
    else:
        raise ValueError(
            "Unknown scaler {!r}; expected 'standard' or 'minmax'".format(nameOfScaler)
        )

    regressors = [
        ('XG', XGBRegressor),
        ('LGBM', LGBMRegressor),
        ('ETR', ExtraTreesRegressor),
        ('RFR', RandomForestRegressor),
        ('ABR', AdaBoostRegressor),
        ('GBR', GradientBoostingRegressor),
    ]

    # Every pipeline gets its own scaler instance, so fitting one pipeline
    # directly can never overwrite the fitted state used by another.
    pipelines = []
    for label, regressor_cls in regressors:
        steps = [('Scaler', scaler_cls()), (label, regressor_cls())]
        pipelines.append((nameOfScaler+label, Pipeline(steps)))

    return pipelines

In [56]:
# Evaluate every model behind a StandardScaler. These two lines must execute:
# if they are skipped, `names`/`results` still hold the output of an earlier
# BasedLine2 call and every later use of them reports the wrong scores.
models = GetScaledModel('standard')
names,results = BasedLine2(X_train, y_train,models, 'r2')
PlotBoxR().PlotResult(names,results)


standardXG: 0.948273 (0.003038)
standardLGBM: 0.947934 (0.002718)
standardETR: 0.935061 (0.005709)
standardRFR: 0.942601 (0.003914)
standardABR: 0.791773 (0.013841)
standardGBR: 0.934641 (0.003442)


NameError: name 'PlotBoxR' is not defined

In [58]:
# `names`/`results` are whatever the most recent BasedLine2 call produced;
# this table assumes that call was the 'standard'-scaled r2 run.
scaledScoreStandard = ScoreDataFrame(names,results)
# Side-by-side concat: the result carries repeated 'Model'/'Score' column names.
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard], axis=1)
compareModels

Unnamed: 0,Model,Score,Model.1,Score.1
0,XG,0.9484,standardXG,0.9483
1,LGBM,0.9483,standardLGBM,0.9479
2,ETR,0.9352,standardETR,0.9351
3,RFR,0.9421,standardRFR,0.9426
4,ABR,0.7993,standardABR,0.7918
5,GBR,0.9347,standardGBR,0.9346


In [60]:
# Repeat the r2 cross-validation with every model behind a MinMaxScaler.
# Note that this rebinds the shared `models`, `names` and `results` globals.
models = GetScaledModel('minmax')
names,results = BasedLine2(X_train, y_train,models,'r2')
PlotBoxR().PlotResult(names,results)

# Append the min-max scores as a third pair of columns next to the unscaled
# and standard-scaled scores.
scaledScoreMinMax = ScoreDataFrame(names,results)
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard,
                          scaledScoreMinMax], axis=1)
compareModels

minmaxXG: 0.947843 (0.002995)
minmaxLGBM: 0.947903 (0.002468)
minmaxETR: 0.935693 (0.002380)
minmaxRFR: 0.942570 (0.004123)
minmaxABR: 0.796314 (0.005572)
minmaxGBR: 0.934177 (0.002943)


Unnamed: 0,Model,Score,Model.1,Score.1,Model.2,Score.2
0,XG,0.9484,standardXG,0.9483,minmaxXG,0.9478
1,LGBM,0.9483,standardLGBM,0.9479,minmaxLGBM,0.9479
2,ETR,0.9352,standardETR,0.9351,minmaxETR,0.9357
3,RFR,0.9421,standardRFR,0.9426,minmaxRFR,0.9426
4,ABR,0.7993,standardABR,0.7918,minmaxABR,0.7963
5,GBR,0.9347,standardGBR,0.9346,minmaxGBR,0.9342


In [61]:
df_t = df.copy()
df_t_name = df_t.columns


In [62]:
def TurkyOutliers(df_out,nameOfFeature,drop=False):
    """
    Report, and optionally remove, Tukey outliers of one column.

    A value is an outlier when it lies outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], where Q1 and Q3 are the 25th and 75th
    percentiles of df_out[nameOfFeature].

    Parameters
    ----------
    df_out : pandas.DataFrame
        Frame to inspect; it is never modified in place.
    nameOfFeature : str
        Column whose values are tested.
    drop : bool, default False
        When True, return a new frame without the outlier rows and with a
        fresh 0..n-1 index. When False, return `df_out` itself unchanged.

    Returns
    -------
    pandas.DataFrame
    """
    valueOfFeature = df_out[nameOfFeature]
    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(valueOfFeature, 25.)

    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(valueOfFeature, 75.)

    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = (Q3-Q1)*1.5

    # Boolean mask computed once and reused for reporting and filtering.
    is_outlier = ~((valueOfFeature >= Q1 - step) & (valueOfFeature <= Q3 + step))
    outliers = valueOfFeature[is_outlier].index.tolist()
    feature_outliers = valueOfFeature[is_outlier].values

    # Remove the outliers, if any were specified
    print ("Number of outliers (inc duplicates): {} and outliers: {}".format(len(outliers), feature_outliers))
    if drop:
        # Filter with the mask instead of `df_out.index[outliers]`: the
        # collected values are index labels, not positions, so positional
        # lookup is only correct for a default 0..n-1 index.
        good_data = df_out[~is_outlier].reset_index(drop = True)
        print ("New dataset with removed outliers has {} samples with {} features each.".format(*good_data.shape))
        return good_data
    else:
        print ("Nothing happens, df.shape = ",df_out.shape)
        return df_out

Index(['Cena', 'Godiste', 'Kilometraza', 'Kubikaza', 'Snaga',
       'Brend_ALFA ROMEO', 'Brend_AUDI', 'Brend_BMW', 'Brend_CITROEN',
       'Brend_FIAT',
       ...
       'Model_STILO', 'Model_SUPERB', 'Model_TIGUAN', 'Model_TOUAREG',
       'Model_TOURAN', 'Model_V40', 'Model_WAGON R+', 'Model_XSARA',
       'Model_XSARA PICASSO', 'Model_ZAFIRA'],
      dtype='object', length=118)

In [64]:
feature_number = 1
OutLiersBox(df,df_name[feature_number])

In [65]:
feature_number = 2
OutLiersBox(df,df_name[feature_number])

In [69]:
feature_number = 5
OutLiersBox(df,df_name[feature_number])

In [70]:
df_clean = TurkyOutliers(df_t,df_name[feature_number],True)
OutLiersBox(df_clean,df_name[feature_number])

Number of outliers (inc duplicates): 474 and outliers: [26000 74000 30000 29000 24000 24000 30000 64800 71556 68000 61000 75800
 74245 76274 44150 64000 49800 80000 82467 62500 25000 60307 81000 60000
 33618 50400 46900 72000 74555 24000 48500 24000 73000 25000 24500 78500
 72000 48746 44000 28000 29010 82000 21032 73000 72500 23500 28000 28000
 25000 26000 25000 80000 57700 22500 49900 34595 78000 73497 23100 65000
 75856 54500 73694 78895 25689 22200 27100 75000 22000 21000 26200 27234
 22000 21040 82000 72000 66000 63292 30428 22222 56000 76347 21500 23000
 25000 20200 25000 75000 76000 53000 55000 43507 29460 67000 80000 80000
 27000 56425 76000 30850 39900 25000 21452 76300 82200 30000 28500 26000
 69000 76000 51999 78000 35175 57000 56000 20980 61700 53000 75000 32500
 80000 46000 20500 75000 29000 24000 78000 45726 57784 38345 37000 75000
 77200 62777 45000 63628 69980 26735 59000 45600 58573 57000 82177 82130
 50000 75000 67000 73000 41000 77000 80580 81000 29800 56036 22000 80