In [None]:
import pandas as pd

In [None]:
# Load the used-car listings from disk into a DataFrame.
# The 'model' column is run through str.strip while the file is parsed, so
# leading/trailing whitespace never reaches the model names.
path = r'data/audi.csv'
df = pd.read_csv(path, converters={'model': str.strip})

In [None]:
# Sanity check: count missing values per column (all counts are expected
# to be 0 before any modelling is done)
null_counts = df.isna().sum()
print(null_counts)

model           0
year            0
price           0
transmission    0
mileage         0
fuel_type       0
tax             0
mpg             0
engine_size     0
brand           0
dtype: int64


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,year,price,mileage,tax,mpg,engine_size
count,10668.0,10668.0,10668.0,10668.0,10668.0,10668.0
mean,2017.100675,22896.685039,24827.244001,126.011436,50.770022,1.930709
std,2.167494,11714.841888,23505.257205,67.170294,12.949782,0.602957
min,1997.0,1490.0,1.0,0.0,18.9,0.0
25%,2016.0,15130.75,5968.75,125.0,40.9,1.5
50%,2017.0,20200.0,19000.0,145.0,49.6,2.0
75%,2019.0,27990.0,36464.5,145.0,58.9,2.0
max,2020.0,145000.0,323000.0,580.0,188.3,6.3


In [None]:
# Expand each categorical column into one indicator column per category
# (named '<column>_<value>'); all other columns are carried over unchanged.
categorical_columns = ['model', 'transmission', 'fuel_type']
data_onehot = pd.get_dummies(df, columns=categorical_columns)
data_onehot

Unnamed: 0,year,price,mileage,tax,mpg,engine_size,brand,model_A1,model_A2,model_A3,...,model_S8,model_SQ5,model_SQ7,model_TT,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuel_type_Diesel,fuel_type_Hybrid,fuel_type_Petrol
0,2017,12500,15735,150,55.4,1.4,Audi,True,False,False,...,False,False,False,False,False,True,False,False,False,True
1,2016,16500,36203,20,64.2,2.0,Audi,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,2016,11000,29946,30,55.4,1.4,Audi,True,False,False,...,False,False,False,False,False,True,False,False,False,True
3,2017,16800,25952,145,67.3,2.0,Audi,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,2019,17300,1998,145,49.6,1.0,Audi,False,False,True,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10663,2020,16999,4018,145,49.6,1.0,Audi,False,False,True,...,False,False,False,False,False,True,False,False,False,True
10664,2020,16999,1978,150,49.6,1.0,Audi,False,False,True,...,False,False,False,False,False,True,False,False,False,True
10665,2020,17199,609,150,49.6,1.0,Audi,False,False,True,...,False,False,False,False,False,True,False,False,False,True
10666,2017,19499,8646,150,47.9,1.4,Audi,False,False,False,...,False,False,False,False,True,False,False,False,False,True


In [None]:
# Create training and testing datasets
# X is the feature set, and y is the target variable (price)
from sklearn.model_selection import train_test_split

# Columns that must not be used as features:
#   - 'price' is the target itself;
#   - 'brand' is a text column (e.g. 'Audi') that is not one-hot encoded, and
#     a linear regression cannot be fitted on string values.
# They are excluded here without modifying data_onehot, so this cell gives the
# same result however many times it is run. errors='ignore' keeps the cell
# working when 'brand' is not present in data_onehot.
non_feature_columns = ['price', 'brand']
X = data_onehot.drop(columns=non_feature_columns, errors='ignore')
y = data_onehot['price']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25
)

In [None]:
# Train a linear regression model and save it with pickle
from sklearn.linear_model import LinearRegression
import pickle

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Report R^2 on each split separately. The test rows were not used for
# fitting, so the test score is the one that estimates how the model behaves
# on unseen cars; the training score is printed only for comparison.
# The values are printed explicitly because a bare expression in the middle
# of a cell is not displayed by the notebook.
train_r2 = regressor.score(X_train, y_train)
test_r2 = regressor.score(X_test, y_test)
print(f"R^2 (train): {train_r2:.4f}")
print(f"R^2 (test):  {test_r2:.4f}")

# Save the trained model to a file
with open('used_car_price.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)


Index(['year', 'mileage', 'tax', 'mpg', 'engine_size', 'model_A1', 'model_A2',
       'model_A3', 'model_A4', 'model_A5', 'model_A6', 'model_A7', 'model_A8',
       'model_Q2', 'model_Q3', 'model_Q5', 'model_Q7', 'model_Q8', 'model_R8',
       'model_RS3', 'model_RS4', 'model_RS5', 'model_RS6', 'model_RS7',
       'model_S3', 'model_S4', 'model_S5', 'model_S8', 'model_SQ5',
       'model_SQ7', 'model_TT', 'transmission_Automatic',
       'transmission_Manual', 'transmission_Semi-Auto', 'fuel_type_Diesel',
       'fuel_type_Hybrid', 'fuel_type_Petrol'],
      dtype='object')


0.8916373387034444

In [None]:
# Predicting the price of a new car
# Describe the car with its numeric attributes plus the one-hot columns that
# should be switched on (set to 1).
car_features = {
    'year': 2020,
    'mileage': 5000,
    'tax': 150,
    'mpg': 50.0,
    'engine_size': 1.4,
    'model_R8': 1,
    'transmission_Manual': 1,
    'fuel_type_Petrol': 1,
}

# Reject feature names the model was not trained on (e.g. a mistyped model
# name) instead of silently ignoring them.
unknown_features = [name for name in car_features if name not in X_train.columns]
if unknown_features:
    raise ValueError(f"Features not present in the training data: {unknown_features}")

# Build one value per training column, in training order. Any column not
# listed in car_features is set to 0, which is the "off" value for a one-hot
# column -- so every numeric feature must be listed explicitly above.
new_data = {column: [car_features.get(column, 0)] for column in X_train.columns}

# Convert the new input data to a DataFrame
new_input = pd.DataFrame(new_data)

# Ensure the columns match the training data
new_input = new_input[X_train.columns]

# Make predictions using the trained model
predicted_price = regressor.predict(new_input)
print("Predicted Price:", predicted_price[0])

year                        int64
mileage                     int64
tax                         int64
mpg                       float64
engine_size               float64
model_A1                    int64
model_A2                    int64
model_A3                    int64
model_A4                    int64
model_A5                    int64
model_A6                    int64
model_A7                    int64
model_A8                    int64
model_Q2                    int64
model_Q3                    int64
model_Q5                    int64
model_Q7                    int64
model_Q8                    int64
model_R8                    int64
model_RS3                   int64
model_RS4                   int64
model_RS5                   int64
model_RS6                   int64
model_RS7                   int64
model_S3                    int64
model_S4                    int64
model_S5                    int64
model_S8                    int64
model_SQ5                   int64
model_SQ7     