In [89]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
# import os
# for dirname, _, filenames in os.walk(r'C:\Users\ELCOT\Downloads\dataset'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import normaltest
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
class car_price_model:
    """Exploratory-analysis and regression helpers for a used-car price table.

    An instance created without arguments can be used for the plotting,
    correlation and VIF helpers, which all receive the DataFrame explicitly.
    The modelling methods (``split_data``, ``learner_selection``,
    ``training_evaluate``) work on the ``data``/``cols``/``name`` given to the
    constructor.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Table holding both the predictor columns and the target column.
    cols : list of str, optional
        Names of the predictor columns.
    name : str, default 'price'
        Name of the target column.
    random_state : int or None, default None
        Seed forwarded to the estimators that accept one. ``None`` keeps the
        estimators' own default (unseeded) behaviour.
    """

    def __init__(self, data=None, cols=None, name='price', random_state=None):
        self.name = name
        self.data = data
        self.cols = cols
        # Candidate regressors, keyed by the label used in the result tables
        # and accepted by ``training_evaluate``.
        self.listof_model = {
            'LinearRegression': LinearRegression(),
            'KNeighborsRegression': KNeighborsRegressor(),
            'RandomForestRegression': RandomForestRegressor(random_state=random_state),
            'GradientBoostingRegression': GradientBoostingRegressor(random_state=random_state),
            'XGBoostRegression': XGBRegressor(random_state=random_state),
            'adaboost': AdaBoostRegressor(random_state=random_state),
        }

    @staticmethod
    def _numeric_columns(data):
        """Return the int64/float64 column names of ``data`` in column order."""
        return list(data.select_dtypes(include=['int64', 'float64']).columns)

    @staticmethod
    def _grid_rows(n_plots, n_cols=3, min_rows=2):
        """Return how many rows of ``n_cols`` subplots are needed for ``n_plots``.

        At least ``min_rows`` rows are returned so that small inputs keep the
        2x3 layout the notebook was written with.
        """
        return max(min_rows, -(-n_plots // n_cols))  # ceiling division

    def read(self, file):
        """Load a CSV file (path or file-like object) into a DataFrame."""
        return pd.read_csv(file)

    def multi_categorical_plot(self, data):
        """Draw one count plot per object-dtype column of ``data``."""
        categorical = [col for col in data.columns if data[col].dtypes == "object"]

        n_rows = self._grid_rows(len(categorical))
        # 2.5 inches per row reproduces the original (20, 5) figure for 2 rows.
        fig = plt.figure(figsize=(20, 2.5 * n_rows))
        fig.subplots_adjust(wspace=0.2, hspace=0.3)
        for position, col in enumerate(categorical, start=1):
            ax = fig.add_subplot(n_rows, 3, position)
            sns.countplot(x=col, data=data, ax=ax)
            ax.set_title(f" {col} countplot")

    def distplot_multi(self, data):
        """Plot a density histogram with a fitted normal curve per numeric column."""
        from scipy.stats import norm

        numeric = self._numeric_columns(data)
        n_rows = self._grid_rows(len(numeric))
        # 5 inches per row reproduces the original (15, 10) figure for 2 rows.
        fig = plt.figure(figsize=(15, 5 * n_rows))
        fig.subplots_adjust(wspace=0.4, hspace=0.4)
        for position, col in enumerate(numeric, start=1):
            ax = fig.add_subplot(n_rows, 3, position)
            values = data[col].dropna()
            ax.set_title('{} max. likelihood gaussian'.format(col))
            if values.empty:
                continue
            # sns.distplot is deprecated; histplot(stat='density') plus an
            # explicit maximum-likelihood normal fit draws the same picture.
            sns.histplot(values, stat='density', ax=ax)
            mu, sigma = norm.fit(values)
            if sigma > 0:  # a constant column has no density curve to draw
                grid = np.linspace(values.min(), values.max(), 200)
                ax.plot(grid, norm.pdf(grid, mu, sigma), color='black')

    def boxplot_multi(self, data):
        """Draw one horizontal box plot per numeric column of ``data``."""
        numeric = self._numeric_columns(data)
        n_rows = self._grid_rows(len(numeric))
        fig = plt.figure(figsize=(15, 5 * n_rows))
        fig.subplots_adjust(wspace=0.4, hspace=0.4)
        for position, col in enumerate(numeric, start=1):
            ax = fig.add_subplot(n_rows, 3, position)
            sns.boxplot(x=col, data=data, ax=ax)
            ax.set_title('Boxplot for {}'.format(col))

    def correlation_plot1(self, data, vrs='price'):
        """Legacy alias of ``correlation_plot``, kept so old calls keep working."""
        return self.correlation_plot(data, vrs=vrs)

    def correlation_plot(self, data, vrs='price'):
        """Scatter every numeric feature against ``vrs`` with its per-value mean.

        Parameters
        ----------
        data : pandas.DataFrame
            Table containing ``vrs`` and the features.
        vrs : str, default 'price'
            Column plotted on the y axis.

        Raises
        ------
        KeyError
            If ``vrs`` is not an int64/float64 column of ``data``.
        """
        numeric_data = data.select_dtypes(include=['int64', 'float64'])
        if vrs not in numeric_data.columns:
            raise KeyError(f"{vrs!r} is not a numeric column of the data")
        # Keep the DataFrame's column order: a set difference would shuffle
        # the subplots from one kernel session to the next.
        feat = [col for col in numeric_data.columns if col != vrs]

        if len(feat) < 3:
            n_rows, n_cols = 1, 3
        else:
            n_rows, n_cols = 2, len(feat) // 2 + 1

        fig = plt.figure(figsize=(15, 10))
        fig.subplots_adjust(wspace=0.3, hspace=0.25)
        for position, feature in enumerate(feat, start=1):
            # Only the target's mean is plotted, so aggregate that column alone.
            mean_target = numeric_data.groupby(feature)[vrs].mean()

            ax = fig.add_subplot(n_rows, n_cols, position)
            ax.scatter(data[feature], data[vrs], alpha=.25)
            ax.plot(mean_target.index, mean_target.values, 'r-', label='mean', linewidth=1.5)
            ax.set_xlabel(feature)
            ax.set_ylabel(vrs)
            ax.set_title(f'Plotting data {vrs} vs {feature}')
            ax.legend(loc='best')

    def standardize(self, data):
        """Return ``data`` centred on its mean and scaled by its standard deviation."""
        return (data - data.mean()) / data.std()

    def VIF(self, data):
        """Compute the variance inflation factor of every column of ``data``.

        The columns are standardized first. Returns a DataFrame with the
        columns ``VIF_FACTOR`` and ``feature``.
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        std_data = data.apply(self.standardize, axis=0)
        design = std_data.values

        vif = pd.DataFrame()
        vif['VIF_FACTOR'] = [
            variance_inflation_factor(design, idx) for idx in range(design.shape[1])
        ]
        vif['feature'] = std_data.columns
        return vif

    def split_data(self):
        """Split ``self.data`` into train/test predictors and targets (80/20).

        Returns
        -------
        list
            ``[x_train, x_test, y_train, y_test]`` as produced by
            ``train_test_split`` with ``random_state=42``.

        Raises
        ------
        ValueError
            If the instance was created without ``data`` or ``cols``.
        """
        if self.data is None or self.cols is None:
            raise ValueError(
                "split_data needs an instance created with data=... and cols=..."
            )
        train = self.data[self.cols]
        target = self.data[self.name]

        return train_test_split(train, target, random_state=42, test_size=0.2, shuffle=True)

    def _correlation_heatmaps(self, data, cmap=None):
        """Draw Pearson and Spearman correlation heatmaps of ``data`` side by side."""
        fig = plt.figure(figsize=(15, 5))
        fig.subplots_adjust(wspace=0.4, hspace=0.4)
        for position, method in enumerate(['pearson', 'spearman'], start=1):
            ax = fig.add_subplot(1, 2, position)
            sns.heatmap(data.corr(method=method), annot=True, cmap=cmap, ax=ax)
            ax.set_title(f'{method} correlation')
        plt.show()

    def spearman_pearson_correlation1(self, data):
        """Legacy variant: same heatmaps with seaborn's default colour map.

        Non-numeric columns are dropped first so that ``DataFrame.corr`` does
        not fail on string columns.
        """
        numeric_data = data.select_dtypes(include=['int64', 'float64'])
        self._correlation_heatmaps(numeric_data)

    def spearman_pearson_correlation(self, data):
        """Show Pearson and Spearman correlation heatmaps of the numeric columns."""
        numeric_data = data.select_dtypes(include=['int64', 'float64'])
        self._correlation_heatmaps(numeric_data, cmap="coolwarm")

    def learner_selection(self, cv=10):
        """Cross-validate every candidate regressor on the training split.

        Parameters
        ----------
        cv : int, default 10
            Number of folds passed to ``cross_val_score``/``cross_val_predict``.

        Returns
        -------
        pandas.DataFrame
            One column per model, with the rows ``cross_val_score``, ``rmse``,
            ``mae`` and ``r2``.
        """
        result = {}

        # Only the training part is used here; the test part stays untouched
        # for ``training_evaluate``.
        x, _, y, _ = self.split_data()

        for name, model in self.listof_model.items():
            cvs = cross_val_score(model, x, y, cv=cv).mean()
            ypred = cross_val_predict(model, x, y, cv=cv)
            mse = mean_squared_error(y, ypred)

            result[name] = {
                'cross_val_score': cvs,
                'rmse': np.sqrt(mse),
                'mae': mean_absolute_error(y, ypred),
                'r2': r2_score(y, ypred),
            }

            print('{} model done !!!'.format(name))

        return pd.DataFrame(result)

    def training_evaluate(self, algorithm):
        """Fit one candidate regressor on the train split and score it on the test split.

        Parameters
        ----------
        algorithm : str
            A key of ``self.listof_model``.

        Returns
        -------
        pandas.DataFrame
            Single column ``'car price measure'`` with the rounded ``r2``,
            ``rmse`` and ``mae`` on the test split.

        Raises
        ------
        KeyError
            If ``algorithm`` is not a key of ``self.listof_model``.
        """
        if algorithm not in self.listof_model:
            raise KeyError(
                f"Unknown algorithm {algorithm!r}; choose one of {sorted(self.listof_model)}"
            )

        xtrain, xtest, ytrain, ytest = self.split_data()

        learner = self.listof_model[algorithm]
        learner.fit(xtrain, ytrain)
        ypred = learner.predict(xtest)

        # Reuse the predictions for all three metrics instead of predicting again.
        r2 = r2_score(ytest, ypred)
        rmse = np.sqrt(mean_squared_error(ytest, ypred))
        mae = mean_absolute_error(ytest, ypred)

        result = {
            'car price measure': {'r2': round(r2, 3), 'rmse': round(rmse, 3), 'mae': round(mae, 3)}
        }

        return pd.DataFrame(result)
        

### car_price_model class explanation

**car_price_model** is the class that I use for exploratory data analysis and machine learning on each car dataset. Its main methods are:

- multi_categorical_plot

- distplot_multi

- boxplot_multi

- spearman_pearson_correlation

- correlation_plot

- VIF

- learner_selection

- training_evaluate

These are the functions that we are going to use in this notebook. 

**N.B.: We are predicting the price for 5 cars (focus, audi, ford, toyota, skoda)** 

In [None]:
# Directory that holds the per-car CSV files. Set the CAR_DATA_DIR environment
# variable to point somewhere else; the default is the original location.
DATA_DIR = Path(os.environ.get('CAR_DATA_DIR', r'C:\Users\ELCOT\Downloads\dataset'))

car1 = DATA_DIR / 'focus.csv'
car2 = DATA_DIR / 'audi.csv'
car3 = DATA_DIR / 'ford.csv'
car4 = DATA_DIR / 'toyota.csv'
car5 = DATA_DIR / 'skoda.csv'

In [None]:
# Data-less instance: used below only for its read/plot/VIF helpers, which take
# the DataFrame as an argument. Per-car instances are created later for modelling.
model = car_price_model()

# Focus car price

In [None]:
focus = model.read(car1)

In [None]:
focus.head()

In [None]:
focus.info()

## Visualization, correlation, VIF, learner selection, training and evaluation

In [None]:
model.multi_categorical_plot(focus)

In [None]:
sns.countplot(x = 'model', hue='fuelType', data=focus)

In [None]:
focus.describe()

In [None]:
model.distplot_multi(focus)

In [None]:
model.boxplot_multi(focus) # the box plots agree with the maximum-likelihood Gaussian fits shown above.

In [None]:

model.spearman_pearson_correlation(focus)

In [None]:
model.correlation_plot(focus)

From these two correlation views we can see that **price is most correlated with mileage and year**. Year is also well correlated with mileage, so we use the VIF (variance inflation factor) to check how strong this collinearity between the predictors is.

In [None]:
focus_cols = ['mileage', 'year', 'engineSize'] # predictor columns used for the VIF check and the price model

In [None]:
model.VIF(focus[focus_cols])

In [None]:
focus_model = car_price_model(data=focus, cols=focus_cols) 

In [None]:
focus_model.learner_selection()

In [None]:
focus_model.training_evaluate('GradientBoostingRegression')

**We have** $R^2 = 92.1\%$ **for Focus car**

# Audi car price

In [None]:
audi = model.read(car2)

In [None]:
audi.head()

In [None]:
audi.info()

## Visualization, correlation, VIF, learner selection, training and evaluation

In [None]:
model.multi_categorical_plot(audi)

In [None]:
audi.describe()

In [None]:
model.distplot_multi(audi)

In [None]:
model.boxplot_multi(audi)

In [None]:
model.spearman_pearson_correlation(audi)

In [None]:
model.correlation_plot(audi)

In [None]:
audi_cols = ['year', 'mileage', 'mpg', 'engineSize', 'tax'] 

In [None]:
model.VIF(audi[audi_cols]) 

In [None]:

audi_model = car_price_model(data=audi, cols=audi_cols)

In [None]:
audi_model.learner_selection()

In [None]:
audi_model.training_evaluate('XGBoostRegression')

**We have** $R^2 = 94.5\%$ **for Audi car**

# Ford car price

In [None]:
ford = model.read(car3)

In [None]:
ford.head()

In [None]:
ford.info()

In [None]:
# The Ford data contains the value 2060, presumably a mistyped registration
# year; map it to 2016. Only the `year` column is touched so that a legitimate
# 2060 in another column (price, mileage, ...) is not silently rewritten.
ford = ford.assign(year=ford['year'].replace(to_replace=2060, value=2016))

## Visualization, correlation, VIF, learner selection, training and evaluation

In [None]:
model.multi_categorical_plot(ford) 

In [None]:
ford.describe()

In [None]:
model.distplot_multi(ford)

In [None]:
model.boxplot_multi(ford)

In [None]:
model.spearman_pearson_correlation(ford)

In [None]:
model.correlation_plot(ford)

In [None]:
ford_cols = ['mileage', 'year', 'tax', 'engineSize', 'mpg']

In [None]:
model.VIF(ford[ford_cols])

In [None]:
ford_model = car_price_model(data=ford, cols=ford_cols)

In [None]:
ford_model.learner_selection()

In [None]:
ford_model.training_evaluate('XGBoostRegression')

**We get** $R^2 = 91.6\%$ **for Ford car**

# Toyota car price

In [None]:
toyota = model.read(car4)

In [None]:
toyota.head()

In [None]:
toyota.info()

## Visualization, correlation, VIF, learner selection, training and evaluation

In [None]:
model.multi_categorical_plot(toyota)

In [None]:
toyota.describe()

In [None]:
model.distplot_multi(toyota)

In [None]:
model.boxplot_multi(toyota)

In [None]:
model.spearman_pearson_correlation(toyota)

In [None]:
model.correlation_plot(toyota)

In [None]:
toyota_cols = ['engineSize','year','tax', 'mileage', 'mpg']

In [None]:
model.VIF(toyota[toyota_cols])

In [None]:
toyota_model = car_price_model(data=toyota, cols=toyota_cols)

In [None]:
toyota_model.learner_selection()

In [None]:
toyota_model.training_evaluate('XGBoostRegression')

**We obtain** $R^2 = 96.2\%$ **for Toyota car.**

# Skoda car price

In [None]:
skoda = model.read(car5)

In [None]:
skoda.head()

In [None]:
skoda.info()

## Visualization, correlation, VIF, learner selection, training and evaluation

In [None]:
model.multi_categorical_plot(skoda)

In [None]:
skoda.describe()

In [None]:
model.distplot_multi(skoda)

In [None]:
model.boxplot_multi(skoda)

In [None]:
model.spearman_pearson_correlation(skoda)

In [None]:
model.correlation_plot(skoda)

In [None]:
skoda_cols = ['year', 'engineSize', 'mileage', 'tax', 'mpg']

In [None]:
model.VIF(skoda[skoda_cols])

In [None]:
skoda_model = car_price_model(data=skoda, cols=skoda_cols)

In [None]:
skoda_model.learner_selection()

In [None]:
skoda_model.training_evaluate('XGBoostRegression')

**We obtain** $R^2 = 92.6\%$ **for Skoda car.**

### Summarize

For these five cars, we obtain the following $R^2$ scores:

> Skoda car $92.6\%$

> Toyota car $96.2\%$

> Ford car $91.6\%$

> Audi car $94.5\%$

> Focus car $92.1\%$

