In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import yellowbrick.regressor as yb
from yellowbrick.regressor import ResidualsPlot
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r'C:/Users/syahi/Master/Project 2/Data/Cleaned Data/Car Prices.csv', low_memory=False)
df.head()

Unnamed: 0,Manufacturing Dates,Brand,Model,Trim,Type,Transmission,State,Condition,Mileage,Exterior Colour,Interior Colour,Selling Price,Selling Dates
0,1/1/2014,VOLKSWAGEN,PASSAT,TDI SE,SEDAN,AUTOMATIC,NJ,2.7,25969,WHITE,BLACK,21400,3/6/2015
1,1/1/2014,FORD,ESCAPE,SE,SUV,AUTOMATIC,NY,2.7,41040,WHITE,BLACK,14600,17/6/2015
2,1/1/2014,CHEVROLET,SILVERADO 1500,LT,CREW CAB,AUTOMATIC,TX,2.7,6040,WHITE,BLACK,34300,17/6/2015
3,1/1/2014,DODGE,CHARGER,SXT,SEDAN,AUTOMATIC,TX,2.7,45682,WHITE,BLACK,16000,27/5/2015
4,1/1/2014,BUICK,LACROSSE,LEATHER GROUP,SEDAN,AUTOMATIC,WI,2.7,12558,WHITE,BLACK,17200,11/2/2015


**Convert the State and Type columns to categorical variables**

In [3]:
df["State"] = df["State"].astype("category")
df["Type"] = df["Type"].astype("category")

**Convert the Transmission, Brand, Model, Trim, Exterior Colour, and Interior Colour columns to dummy variables (one-hot encoding)**

In [4]:
df = pd.get_dummies(df, columns=["Transmission", "Brand", "Model", "Trim", "Exterior Colour", "Interior Colour"])

**Add the following lines to overwrite the State and Type columns with their encoded values**

In [5]:
state_encoded = pd.get_dummies(df["State"], prefix="State").add_prefix("State_")
type_encoded = pd.get_dummies(df["Type"], prefix="Type").add_prefix("Type_")
df = df.drop(["State", "Type"], axis=1).join([state_encoded, type_encoded])

__Assigning 'df' = DataFrame and 'dates' = column with date strings__

In [6]:
df['Manufacturing Dates'] = pd.to_datetime(df['Manufacturing Dates'], errors='coerce')  # Convert to datetime, coercing errors to NaT

__Create a new mask to filter out any non-converted date strings__

In [7]:
valid_dates_mask = df['Manufacturing Dates'].notna()

__Apply the .dt accessor only on the valid date values__

In [8]:
if valid_dates_mask.sum() > 0:
    df.loc[valid_dates_mask, 'Manufacturing Year'] = df.loc[valid_dates_mask, 'Manufacturing Dates'].dt.year.astype(int)

__Assigning 'df' = DataFrame and 'dates' = column with date strings__

In [9]:
df['Selling Dates'] = pd.to_datetime(df['Selling Dates'], errors='coerce')  # Convert to datetime, coercing errors to NaT

__Create a new mask to filter out any non-converted date strings__

In [10]:
valid_dates_mask = df['Selling Dates'].notna()

__Apply the .dt accessor only on the valid date values__

In [11]:
if valid_dates_mask.sum() > 0:
     df.loc[valid_dates_mask, 'Selling Year'] = df.loc[valid_dates_mask, 'Selling Dates'].dt.year.astype(int)

**Remove rows with invalid dates**

In [12]:
df.dropna(subset=['Selling Dates'], inplace=True)
df.dropna(subset=['Manufacturing Dates'], inplace=True)

In [13]:
df = df.drop(['Manufacturing Dates', 'Selling Dates'], axis=1)

**Split the dataset into features and target**

In [14]:
X = df.drop("Selling Price", axis=1)
y = df["Selling Price"]

**Split the dataset into training and testing subsets**

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**The x_train and x_test values are the feature (input) variables used for training and testing a machine learning model**

In [16]:
X_train

Unnamed: 0,Condition,Mileage,Transmission_AUTOMATIC,Transmission_MANUAL,Brand_ACURA,Brand_ASTON MARTIN,Brand_AUDI,Brand_BENTLEY,Brand_BMW,Brand_BUICK,...,Type_Type_SUPERCAB,Type_Type_SUPERCREW,Type_Type_SUV,Type_Type_TRANSIT VAN,Type_Type_TSX SPORT WAGON,Type_Type_VAN,Type_Type_WAGON,Type_Type_XTRACAB,Manufacturing Year,Selling Year
422401,1.9,309892,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1998.0,2015.0
434773,4.3,8562,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2015.0,2015.0
70130,3.4,56438,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
322723,3.1,84912,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2008.0,2015.0
227795,2.2,43978,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2006.0,2015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285339,1.9,41996,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
250184,5.0,6305,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,2015.0,2015.0
309389,2.6,50443,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
339142,2.5,141299,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,2008.0,2015.0


In [17]:
X_test

Unnamed: 0,Condition,Mileage,Transmission_AUTOMATIC,Transmission_MANUAL,Brand_ACURA,Brand_ASTON MARTIN,Brand_AUDI,Brand_BENTLEY,Brand_BMW,Brand_BUICK,...,Type_Type_SUPERCAB,Type_Type_SUPERCREW,Type_Type_SUV,Type_Type_TRANSIT VAN,Type_Type_TSX SPORT WAGON,Type_Type_VAN,Type_Type_WAGON,Type_Type_XTRACAB,Manufacturing Year,Selling Year
5240,4.6,32075,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
328913,2.9,47590,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
147757,1.9,90813,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2009.0,2015.0
76460,2.4,144775,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,2005.0,2015.0
251507,1.9,65083,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2007.0,2015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54576,3.9,141542,True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,2011.0,2015.0
273471,4.2,51230,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2013.0,2015.0
258054,2.0,156243,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,2004.0,2015.0
346282,2.9,52147,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2012.0,2015.0


**The x_train and y_train gives value of output. In this case, output = Selling Price**

In [18]:
y_train

422401     3400
434773     9500
70130     11100
322723    12900
227795     5800
          ...  
285339     7700
250184    21600
309389    17100
339142     4000
289471     8900
Name: Selling Price, Length: 145384, dtype: int64

In [19]:
y_test

5240      18500
328913     9400
147757     7500
76460      2800
251507     2000
          ...  
54576     11800
273471     9701
258054     7200
346282     9100
425821     1600
Name: Selling Price, Length: 36346, dtype: int64

**Create and fit the RandomForestRegressor model**

In [35]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=15)

In [None]:
rf.fit(X_train, y_train)

__Make predictions__

In [30]:
y_pred = rf.predict(X_test)
y_pred

array([20181.5 , 10444.  ,  6466.  , ...,  6608.  ,  8244.03,  1745.75])

In [31]:
x_pred = rf.predict(X_train)
x_pred

array([ 2787.5 ,  9761.  , 10702.52, ..., 16931.  ,  4048.  ,  8631.  ])

**Calculate RMSE, MAE, R² score and RF Confidence**

In [29]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rf_confidence = rf.score(X_test, y_test)

ValueError: Input contains infinity or a value too large for dtype('float64').

In [None]:
rmse2 = np.sqrt(mean_squared_error(y_train, x_pred))
mae2 = mean_absolute_error(y_train, x_pred)
r22 = r2_score(y_train, x_pred)
rf_confidence2 = rf.score(X_train, y_train)

**Print the results**

In [None]:
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")
print("RF Confidence: ", rf_confidence)

In [None]:
print(f"RMSE: {rmse2:.2f}")
print(f"MAE: {mae2:.2f}")
print(f"R²: {r22:.2f}")
print("RF Confidence: ", rf_confidence2)