In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Remote URL of the car-price CSV used throughout this notebook
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-02-car-price/data.csv'

In [3]:
filename = "data.csv"
if not os.path.isfile(filename):
    !wget -O {filename} 02-price-pred-proj/data.csv
else:
    print(f"{filename} already exists, skipping download.")

data.csv already exists, skipping download.


In [4]:
# Load the local copy of the dataset into a DataFrame
df = pd.read_csv('data.csv')

In [5]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Normalise the data

In [6]:
# Make column names uniform: lowercase, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

### Examine data types - find strings which are objects and normalise

In [7]:
df.dtypes

make                  object
model                 object
year                   int64
engine_fuel_type      object
engine_hp            float64
engine_cylinders     float64
transmission_type     object
driven_wheels         object
number_of_doors      float64
market_category       object
vehicle_size          object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
popularity             int64
msrp                   int64
dtype: object

### get a list of cols which contain strings

In [8]:
# Names of the columns whose dtype is `object` (the string columns)
is_object = df.dtypes == 'object'
strings = df.dtypes[is_object].index.tolist()
strings

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']

### loop over all string cols and make their values lowercase, with spaces replaced by underscores

In [9]:
# Apply the same normalisation to the values of every string column
for col in strings:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

# Exploring the data

In [10]:
# Print each column's name, its number of distinct values, and the values themselves.
# popularity col = how many mentions on twitter
for col in df.columns:
    series = df[col]
    print(col)
    print("Number of unique values in col:", series.nunique())
    print("Unique values:", series.unique())
    print()

make
Number of unique values in col: 48
Unique values: ['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler' 'nissan' 'volvo' 'mazda'
 'mitsubishi' 'ferrari' 'alfa_romeo' 'toyota' 'mclaren' 'maybach'
 'pontiac' 'porsche' 'saab' 'gmc' 'hyundai' 'plymouth' 'honda'
 'oldsmobile' 'suzuki' 'ford' 'cadillac' 'kia' 'bentley' 'chevrolet'
 'dodge' 'lamborghini' 'lincoln' 'subaru' 'volkswagen' 'spyker' 'buick'
 'acura' 'rolls-royce' 'maserati' 'lexus' 'aston_martin' 'land_rover'
 'lotus' 'infiniti' 'scion' 'genesis' 'hummer' 'tesla' 'bugatti']

model
Number of unique values in col: 914
Unique values: ['1_series_m' '1_series' '100' '124_spider' '190-class' '2_series' '200'
 '200sx' '240sx' '240' '2' '3_series_gran_turismo' '3_series' '300-class'
 '3000gt' '300' '300m' '300zx' '323' '350-class' '350z' '360' '370z' '3'
 '4_series_gran_coupe' '4_series' '400-class' '420-class' '456m'
 '458_italia' '4c' '4runner' '5_series_gran_turismo' '5_series'
 '500-class' '500e' '500' '500l' '500x' '550' '560-class' '

### Distribution of price

In [11]:
# Plotting libraries. seaborn must be installed in the kernel's environment,
# otherwise this cell raises ModuleNotFoundError and every `sns.` call below fails.
import matplotlib.pyplot as plt
import seaborn as sns

# display the plots in the notebook:
%matplotlib inline

ModuleNotFoundError: No module named 'seaborn'

### price distribution
### draw a histogram to plot it

In [None]:
sns.histplot(df.msrp, bins=50)
# the "1e6" on the axis is scientific notation: 10 to the power of 6 = 1,000,000, so the scale is in millions
# this shows a long-tail distribution - there are a few very large values, but the majority are below 100,000

In [None]:
# zoom in on the range where the majority of the data in the plot above is concentrated
sns.histplot(df.msrp[df.msrp < 100000], bins=50)
# observation: the mean cost of cars is about 25K and the minimum price is about 1K

In [None]:
# cars priced from 100K up to 249,999 (both bounds inclusive)
mid_range = df.msrp.between(100000, 249999)
sns.histplot(df.msrp[mid_range], bins=50)

In [None]:
# cars priced at 250K or more; `>=` so that a price of exactly 250,000 is not
# left out of every histogram (the mid-range plot stops at 249,999)
sns.histplot(df.msrp[df.msrp >= 250000], bins=50)

### Having a long tail distribution is bad for training a model because there's not enough data to generalise edge cases
### Apply a logarithmic transformation

In [None]:
# log1p(x) computes log(1 + x), so a value of 0 would not produce log(0)
price_logs = np.log1p(df.msrp)
price_logs

### draw the histogram again
#### all the large prices collapse - looks closer to normal distribution now than long tail distribution

In [None]:
sns.histplot(price_logs, bins=50)

### Missing values
#### NaN ("Not a Number") marks a missing value

In [None]:
# count the NaN (missing) cells in each column
df.isnull().sum()

# df.isnull() on its own shows the full True/False mask, which is rarely useful to read

# Setting up the validation framework

### Split the data

In [None]:
# split data into TRAIN, VALIDATION and TEST
# here: 60% train, 20% validation, 20% test

n = len(df)

# int() truncates the fractional part (rounds down), so each size is a whole number of rows

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = int(n * 0.6)

# check whether the three parts add up to the full length
n, n_val + n_test + n_train
# 11914 vs 11912 - UNEVEN: truncating each part separately loses two rows

In [None]:
# to fix this, give the training set whatever is left over,
# so the three parts always add up to n:
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# check the total length again
n, n_val + n_test + n_train

### Sizes of the split parts

In [None]:
n_val, n_test, n_train
#2382, 2382, 7150

#### Split the dataset

In [None]:
# Sequential (unshuffled) split by row position:
# first n_val rows -> validation, next n_test rows -> test, the remainder -> train
df_val = df.iloc[:n_val]
df_test = df.iloc[n_val:n_val+n_test]
df_train = df.iloc[n_val+n_test:]

In [None]:
# test variables hold the right splits:
df_train # starts at row 4764

### Shuffle the records

In [None]:
# create an array of row positions 0 .. n-1
idx = np.arange(n)
# fix the seed so the shuffle is repeatable
np.random.seed(2)
# shuffle the positions in place
np.random.shuffle(idx)
print(idx)

In [None]:
# Get the rows for each split via the shuffled index.
# The three slices of `idx` must not overlap: validation takes the first n_val
# positions, test the next n_test, and train everything after that, so no
# training row also appears in the validation or test set.

df_val = df.iloc[idx[:n_val]]
df_test = df.iloc[idx[n_val:n_val+n_test]]
df_train = df.iloc[idx[n_val+n_test:]]

len(df_train), len(df_val), len(df_test)

In [None]:
df_train

In [None]:
# reset_index returns a new DataFrame rather than modifying in place,
# so the result has to be assigned back for the shuffled index to be dropped
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

### transform y

In [None]:
# Target vectors: log1p-transformed msrp values for each split, as NumPy arrays
y_train = np.log1p(df_train.msrp.values)
y_val = np.log1p(df_val.msrp.values)
y_test = np.log1p(df_test.msrp.values)

In [None]:
# delete the target column so we don't accidentally use it as a feature during training
del df_train['msrp']
del df_test['msrp']
del df_val['msrp']

# Linear Regression

#### g(x)~y
#### model / feature matrix / target

# Linear Regression vector form
### (similar to matrix to matrix multiplication)

#### start with dot product (vector vector multiplication)

In [None]:
# Example inputs for the cells below: one feature vector, the bias term, and the weights
xi = [453, 11, 86]
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
def dot(xi, w):
    """Return the dot product of two equal-length vectors.

    Args:
        xi: sequence of numbers (e.g. one feature vector).
        w: sequence of numbers of the same length (e.g. the weights).

    Returns:
        The sum of xi[j] * w[j] over all positions, as a float (0.0 for empty input).

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    if len(xi) != len(w):
        raise ValueError(
            f"vectors must have the same length, got {len(xi)} and {len(w)}"
        )

    res = 0.0

    # accumulate over every position; the result is returned only after the loop finishes
    for x_j, w_j in zip(xi, w):
        res = res + x_j * w_j

    return res

In [None]:
def linear_regression(xi):
    """Predict for one feature vector: the bias w0 plus dot(xi, w).

    Reads the notebook-level names `w0`, `w` and `dot`.
    """
    weighted_sum = dot(xi, w)
    return w0 + weighted_sum

In [None]:
w_new = [w0] + w

In [None]:
w_new

In [None]:
def linear_regression(xi):
    """Predict for one feature vector using the combined weight vector `w_new`.

    A constant 1 is prepended to the features so that it lines up with the first
    entry of `w_new`. Reads the notebook-level names `w_new` and `dot`.
    """
    return dot([1] + xi, w_new)

In [None]:
linear_regression(xi)

In [None]:
# something doesn't work above, missing w values in dot

In [None]:
# Example values used by the linear-regression cells in this section
xi = [453, 11, 86]             # one feature vector
w0 = 7.17                      # bias term, added to the dot product
w = [0.01, 0.04, 0.002]        # one weight per feature
w_new = [w0] + w               # bias and weights combined into a single vector

In [None]:
# Three example rows, each starting with a constant 1 that pairs with the first entry of w_new
x1 = [1, 148, 24, 1384]
x2 = [1, 123, 25, 2031]
x10 = [1, 453, 11, 86]

# Stack the rows into a 2-D array in a single step
X = np.array([x1, x2, x10])

# Matrix-vector product: one value per row of X
X.dot(w_new)