In [3]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
sb.set()
from dython import nominal
from dython import nominal

In [4]:
data = pd.read_csv('model_dat.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9846 entries, 0 to 9845
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Brand             9846 non-null   object
 1   Year              9846 non-null   int64 
 2   Horsepower        9846 non-null   int64 
 3   Engine Cylinders  9846 non-null   int64 
 4   Number of Doors   9846 non-null   int64 
 5   Price             9846 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 461.7+ KB


In [6]:
# Treating all the nominal variables at once using dummy variables
MLDATA=pd.get_dummies(data)

# Printing sample rows
MLDATA


Unnamed: 0,Year,Horsepower,Engine Cylinders,Number of Doors,Price,Brand_Acura,Brand_Alfa Romeo,Brand_Audi,Brand_BMW,Brand_Buick,...,Brand_Plymouth,Brand_Pontiac,Brand_Porsche,Brand_Saab,Brand_Scion,Brand_Subaru,Brand_Suzuki,Brand_Toyota,Brand_Volkswagen,Brand_Volvo
0,2011,335,6,2,46135,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,300,6,2,40650,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,300,6,2,36350,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,230,6,2,29450,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,230,6,2,34500,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9841,2012,300,6,4,46120,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9842,2012,300,6,4,56670,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9843,2012,300,6,4,50620,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9844,2013,300,6,4,50920,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Printing all the column names for our reference
MLDATA.columns


Index(['Year', 'Horsepower', 'Engine Cylinders', 'Number of Doors', 'Price',
       'Brand_Acura', 'Brand_Alfa Romeo', 'Brand_Audi', 'Brand_BMW',
       'Brand_Buick', 'Brand_Cadillac', 'Brand_Chevrolet', 'Brand_Chrysler',
       'Brand_Dodge', 'Brand_FIAT', 'Brand_Ford', 'Brand_GMC', 'Brand_Genesis',
       'Brand_HUMMER', 'Brand_Honda', 'Brand_Hyundai', 'Brand_Infiniti',
       'Brand_Kia', 'Brand_Land Rover', 'Brand_Lexus', 'Brand_Lincoln',
       'Brand_Lotus', 'Brand_Maserati', 'Brand_Mazda', 'Brand_Mercedes-Benz',
       'Brand_Mitsubishi', 'Brand_Nissan', 'Brand_Oldsmobile',
       'Brand_Plymouth', 'Brand_Pontiac', 'Brand_Porsche', 'Brand_Saab',
       'Brand_Scion', 'Brand_Subaru', 'Brand_Suzuki', 'Brand_Toyota',
       'Brand_Volkswagen', 'Brand_Volvo'],
      dtype='object')

In [8]:
# Separate Target Variable and Predictor Variables
TargetVariable='Price'
Predictors=['Year', 'Horsepower', 'Engine Cylinders', 'Number of Doors',
       'Brand_Acura', 'Brand_Alfa Romeo', 'Brand_Audi', 'Brand_BMW',
       'Brand_Buick', 'Brand_Cadillac', 'Brand_Chevrolet', 'Brand_Chrysler',
       'Brand_Dodge', 'Brand_FIAT', 'Brand_Ford', 'Brand_GMC', 'Brand_Genesis',
       'Brand_HUMMER', 'Brand_Honda', 'Brand_Hyundai', 'Brand_Infiniti',
       'Brand_Kia', 'Brand_Land Rover', 'Brand_Lexus', 'Brand_Lincoln',
       'Brand_Lotus', 'Brand_Maserati', 'Brand_Mazda', 'Brand_Mercedes-Benz',
       'Brand_Mitsubishi', 'Brand_Nissan', 'Brand_Oldsmobile',
       'Brand_Plymouth', 'Brand_Pontiac', 'Brand_Porsche', 'Brand_Saab',
       'Brand_Scion', 'Brand_Subaru', 'Brand_Suzuki', 'Brand_Toyota',
       'Brand_Volkswagen', 'Brand_Volvo']

X=MLDATA[Predictors].values
y=MLDATA[TargetVariable].values

# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [10]:
training_models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor()
}

for name, training_model in training_models.items():
    training_model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [11]:
for name, training_model in training_models.items():
    print(name + " R^2 Score: {:.5f}".format(training_model.score(X_test, y_test)))

                     Linear Regression R^2 Score: 0.81078
 Linear Regression (L2 Regularization) R^2 Score: 0.81070
 Linear Regression (L1 Regularization) R^2 Score: 0.81080
                   K-Nearest Neighbors R^2 Score: 0.90868
                         Decision Tree R^2 Score: 0.93507
                         Random Forest R^2 Score: 0.94161
                     Gradient Boosting R^2 Score: 0.89852
