In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
sb.set()
from dython import nominal

In [2]:
data = pd.read_csv('model_dat.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8619 entries, 0 to 8618
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             8619 non-null   object 
 1   Year              8619 non-null   int64  
 2   Horsepower        8619 non-null   float64
 3   Engine Cylinders  8619 non-null   float64
 4   Number of Doors   8619 non-null   float64
 5   Price             8619 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 404.1+ KB


In [4]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [5]:
def model(TargetVariable , Predictors):

    X=data[Predictors].values
    y=data[TargetVariable].values

    # Split the data into training and testing set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    training_models = {
        "                     Linear Regression": LinearRegression(),
        " Linear Regression (L2 Regularization)": Ridge(),
        " Linear Regression (L1 Regularization)": Lasso(),
        "                   K-Nearest Neighbors": KNeighborsRegressor(),
        "                         Decision Tree": DecisionTreeRegressor(),
        "                         Random Forest": RandomForestRegressor(),
        "                     Gradient Boosting": GradientBoostingRegressor()
    }

    for name, training_model in training_models.items():
        training_model.fit(X_train, y_train)
        print(name + " trained.")
    print("\n")
    for name, training_model in training_models.items():
        print(name + " R^2 Score: {:.5f}".format(training_model.score(X_test, y_test)))

In [6]:
model(TargetVariable='Price',Predictors=['Year'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.11886
 Linear Regression (L2 Regularization) R^2 Score: 0.11886
 Linear Regression (L1 Regularization) R^2 Score: 0.11886
                   K-Nearest Neighbors R^2 Score: 0.05631
                         Decision Tree R^2 Score: 0.13500
                         Random Forest R^2 Score: 0.13500
                     Gradient Boosting R^2 Score: 0.13501


In [7]:
model(TargetVariable='Price',Predictors=['Horsepower'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.59098
 Linear Regression (L2 Regularization) R^2 Score: 0.59098
 Linear Regression (L1 Regularization) R^2 Score: 0.59098
                   K-Nearest Neighbors R^2 Score: 0.64333
                         Decision Tree R^2 Score: 0.71069
                         Random Forest R^2 Score: 0.71095
                     Gradient Boosting R^2 Score: 0.68708


In [8]:
model(TargetVariable='Price',Predictors=['Number of Doors'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.00087
 Linear Regression (L2 Regularization) R^2 Score: 0.00087
 Linear Regression (L1 Regularization) R^2 Score: 0.00087
                   K-Nearest Neighbors R^2 Score: -0.00022
                         Decision Tree R^2 Score: 0.00439
                         Random Forest R^2 Score: 0.00437
                     Gradient Boosting R^2 Score: 0.00439


In [9]:
model(TargetVariable='Price',Predictors=['Engine Cylinders'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.23222
 Linear Regression (L2 Regularization) R^2 Score: 0.23222
 Linear Regression (L1 Regularization) R^2 Score: 0.23222
                   K-Nearest Neighbors R^2 Score: 0.15009
                         Decision Tree R^2 Score: 0.23564
                         Random Forest R^2 Score: 0.23542
                     Gradient Boosting R^2 Score: 0.23564


In [10]:
model('Price',['Year', 'Horsepower', 'Engine Cylinders', 'Number of Doors'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.66380
 Linear Regression (L2 Regularization) R^2 Score: 0.66379
 Linear Regression (L1 Regularization) R^2 Score: 0.66379
                   K-Nearest Neighbors R^2 Score: 0.81168
                         Decision Tree R^2 Score: 0.83283
                         Random Forest R^2 Score: 0.84273
                     Gradient Boosting R^2 Score: 0.76190


### Since brand is a categorical nominal variables that contains strings unlike Engine Cylinders and Number of Doors which are Ordinal Variables, we are able to use brand to train the model.

In [11]:
# Treating all the nominal variables at once using dummy variables
MLDATA=pd.get_dummies(data)

# Printing sample rows
MLDATA


Unnamed: 0,Year,Horsepower,Engine Cylinders,Number of Doors,Price,Brand_Acura,Brand_Alfa Romeo,Brand_Audi,Brand_BMW,Brand_Buick,...,Brand_Plymouth,Brand_Pontiac,Brand_Porsche,Brand_Saab,Brand_Scion,Brand_Subaru,Brand_Suzuki,Brand_Toyota,Brand_Volkswagen,Brand_Volvo
0,2011,335.0,6.0,2.0,46135,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,300.0,6.0,2.0,40650,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,300.0,6.0,2.0,36350,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,230.0,6.0,2.0,29450,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,230.0,6.0,2.0,34500,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8614,2012,300.0,6.0,4.0,46120,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8615,2012,300.0,6.0,4.0,56670,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8616,2012,300.0,6.0,4.0,50620,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8617,2013,300.0,6.0,4.0,50920,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Using get_dummies() function, we are able to convert categorical data into dummy/indicator variables

In [12]:
# Printing all the column names for our reference
MLDATA.columns

Index(['Year', 'Horsepower', 'Engine Cylinders', 'Number of Doors', 'Price',
       'Brand_Acura', 'Brand_Alfa Romeo', 'Brand_Audi', 'Brand_BMW',
       'Brand_Buick', 'Brand_Cadillac', 'Brand_Chevrolet', 'Brand_Chrysler',
       'Brand_Dodge', 'Brand_FIAT', 'Brand_Ford', 'Brand_GMC', 'Brand_Genesis',
       'Brand_HUMMER', 'Brand_Honda', 'Brand_Hyundai', 'Brand_Infiniti',
       'Brand_Kia', 'Brand_Land Rover', 'Brand_Lexus', 'Brand_Lincoln',
       'Brand_Lotus', 'Brand_Maserati', 'Brand_Mazda', 'Brand_Mercedes-Benz',
       'Brand_Mitsubishi', 'Brand_Nissan', 'Brand_Oldsmobile',
       'Brand_Plymouth', 'Brand_Pontiac', 'Brand_Porsche', 'Brand_Saab',
       'Brand_Scion', 'Brand_Subaru', 'Brand_Suzuki', 'Brand_Toyota',
       'Brand_Volkswagen', 'Brand_Volvo'],
      dtype='object')

## Function is to test features with different training models

In [13]:
def model_regression(TargetVariable , Predictors):

    X=MLDATA[Predictors].values
    y=MLDATA[TargetVariable].values

    # Split the data into training and testing set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    training_models = {
        "                     Linear Regression": LinearRegression(),
        " Linear Regression (L2 Regularization)": Ridge(),
        " Linear Regression (L1 Regularization)": Lasso(),
        "                   K-Nearest Neighbors": KNeighborsRegressor(),
        "                         Decision Tree": DecisionTreeRegressor(),
        "                         Random Forest": RandomForestRegressor(),
        "                     Gradient Boosting": GradientBoostingRegressor()
    }

    for name, training_model in training_models.items():
        training_model.fit(X_train, y_train)
        print(name + " trained.")
    print("\n")
    for name, training_model in training_models.items():
        print(name + " R^2 Score: {:.5f}".format(training_model.score(X_test, y_test)))

# Year

In [14]:
model_regression('Price',['Year'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.13056
 Linear Regression (L2 Regularization) R^2 Score: 0.13056
 Linear Regression (L1 Regularization) R^2 Score: 0.13056
                   K-Nearest Neighbors R^2 Score: -0.00517
                         Decision Tree R^2 Score: 0.14977
                         Random Forest R^2 Score: 0.14968
                     Gradient Boosting R^2 Score: 0.14978


# Horsepower

In [15]:
model_regression('Price',['Horsepower'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.59623
 Linear Regression (L2 Regularization) R^2 Score: 0.59623
 Linear Regression (L1 Regularization) R^2 Score: 0.59623
                   K-Nearest Neighbors R^2 Score: 0.66712
                         Decision Tree R^2 Score: 0.71287
                         Random Forest R^2 Score: 0.71141
                     Gradient Boosting R^2 Score: 0.68545


# Number of Doors

In [16]:
model_regression('Price',['Number of Doors'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: -0.00171
 Linear Regression (L2 Regularization) R^2 Score: -0.00171
 Linear Regression (L1 Regularization) R^2 Score: -0.00171
                   K-Nearest Neighbors R^2 Score: -0.24915
                         Decision Tree R^2 Score: 0.00003
                         Random Forest R^2 Score: -0.00001
                     Gradient Boosting R^2 Score: 0.00003


# Engine Cylinders

In [17]:
model_regression('Price',['Engine Cylinders'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.22669
 Linear Regression (L2 Regularization) R^2 Score: 0.22669
 Linear Regression (L1 Regularization) R^2 Score: 0.22669
                   K-Nearest Neighbors R^2 Score: 0.17325
                         Decision Tree R^2 Score: 0.22443
                         Random Forest R^2 Score: 0.22433
                     Gradient Boosting R^2 Score: 0.22443


# Brand

In [18]:
model_regression('Price',['Brand_Acura', 'Brand_Alfa Romeo', 'Brand_Audi', 'Brand_BMW',
       'Brand_Buick', 'Brand_Cadillac', 'Brand_Chevrolet', 'Brand_Chrysler',
       'Brand_Dodge', 'Brand_FIAT', 'Brand_Ford', 'Brand_GMC', 'Brand_Genesis',
       'Brand_HUMMER', 'Brand_Honda', 'Brand_Hyundai', 'Brand_Infiniti',
       'Brand_Kia', 'Brand_Land Rover', 'Brand_Lexus', 'Brand_Lincoln',
       'Brand_Lotus', 'Brand_Maserati', 'Brand_Mazda', 'Brand_Mercedes-Benz',
       'Brand_Mitsubishi', 'Brand_Nissan', 'Brand_Oldsmobile',
       'Brand_Plymouth', 'Brand_Pontiac', 'Brand_Porsche', 'Brand_Saab',
       'Brand_Scion', 'Brand_Subaru', 'Brand_Suzuki', 'Brand_Toyota',
       'Brand_Volkswagen', 'Brand_Volvo'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.42856
 Linear Regression (L2 Regularization) R^2 Score: 0.42808
 Linear Regression (L1 Regularization) R^2 Score: 0.42848
                   K-Nearest Neighbors R^2 Score: 0.33461
                         Decision Tree R^2 Score: 0.42849
                         Random Forest R^2 Score: 0.42849
                     Gradient Boosting R^2 Score: 0.39824


# Combining all features to train model

# Clean Dataset

In [19]:
model_regression('Price',['Year', 'Horsepower', 'Engine Cylinders', 'Number of Doors',
       'Brand_Acura', 'Brand_Alfa Romeo', 'Brand_Audi', 'Brand_BMW',
       'Brand_Buick', 'Brand_Cadillac', 'Brand_Chevrolet', 'Brand_Chrysler',
       'Brand_Dodge', 'Brand_FIAT', 'Brand_Ford', 'Brand_GMC', 'Brand_Genesis',
       'Brand_HUMMER', 'Brand_Honda', 'Brand_Hyundai', 'Brand_Infiniti',
       'Brand_Kia', 'Brand_Land Rover', 'Brand_Lexus', 'Brand_Lincoln',
       'Brand_Lotus', 'Brand_Maserati', 'Brand_Mazda', 'Brand_Mercedes-Benz',
       'Brand_Mitsubishi', 'Brand_Nissan', 'Brand_Oldsmobile',
       'Brand_Plymouth', 'Brand_Pontiac', 'Brand_Porsche', 'Brand_Saab',
       'Brand_Scion', 'Brand_Subaru', 'Brand_Suzuki', 'Brand_Toyota',
       'Brand_Volkswagen', 'Brand_Volvo'])

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.77012
 Linear Regression (L2 Regularization) R^2 Score: 0.76916
 Linear Regression (L1 Regularization) R^2 Score: 0.76981
                   K-Nearest Neighbors R^2 Score: 0.83920
                         Decision Tree R^2 Score: 0.87757
                         Random Forest R^2 Score: 0.89316
                     Gradient Boosting R^2 Score: 0.81901


# Unclean Dataset

In [20]:
uncleaned = pd.read_csv('data.csv')
#Remove columns that are not selected.

uncleaned = uncleaned.drop(columns =["Model","Transmission Type","Driven_Wheels","Market Category","Vehicle Size","Vehicle Style", "Engine Fuel Type","Popularity"])
uncleaned = uncleaned.dropna(subset = ["Engine HP","Engine Cylinders", "Number of Doors"])
MLDATA=pd.get_dummies(uncleaned)
model_regression('MSRP',['Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
       'Make_Acura', 'Make_Alfa Romeo', 'Make_Audi', 'Make_BMW',
       'Make_Buick', 'Make_Cadillac', 'Make_Chevrolet', 'Make_Chrysler',
       'Make_Dodge', 'Make_FIAT', 'Make_Ford', 'Make_GMC', 'Make_Genesis',
       'Make_HUMMER', 'Make_Honda', 'Make_Hyundai', 'Make_Infiniti',
       'Make_Kia', 'Make_Land Rover', 'Make_Lexus', 'Make_Lincoln',
       'Make_Lotus', 'Make_Maserati', 'Make_Mazda', 'Make_Mercedes-Benz',
       'Make_Mitsubishi', 'Make_Nissan', 'Make_Oldsmobile',
       'Make_Plymouth', 'Make_Pontiac', 'Make_Porsche', 'Make_Saab',
       'Make_Scion', 'Make_Subaru', 'Make_Suzuki', 'Make_Toyota',
       'Make_Volkswagen', 'Make_Volvo'])



                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


                     Linear Regression R^2 Score: 0.57629
 Linear Regression (L2 Regularization) R^2 Score: 0.56790
 Linear Regression (L1 Regularization) R^2 Score: 0.57600
                   K-Nearest Neighbors R^2 Score: 0.73776
                         Decision Tree R^2 Score: 0.72970
                         Random Forest R^2 Score: 0.78771
                     Gradient Boosting R^2 Score: 0.74927


# When comparing the uncleaned dataset and the original dataset we can see that there is significant improvement to the R^2 Score


# With the conversion of categorical variables using get_dummies(), the r^2 score is significantly improved. Additionally, we can see that when the features are used individually, the R^2 value is very low. However, when they are all combined, the R^2 value is greatly increased (From 0% to 60% to 80%-90%)