# Setting up the Environment

## Check python version

In [7]:
!python3 --version

Python 3.9.13


## Install python modules 

In [9]:
## Install numpy, pandas and scikit-learn (latest with py-3.9)
!pip install numpy pandas scikit-learn seaborn



## Import modules 

In [10]:
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

***

## Question 1
What's the version of NumPy that you installed?

In [13]:
np.__version__

'1.21.6'

## Getting the data 

In [15]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2022-09-12 15:41:45--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2022-09-12 15:41:45 (19.8 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [16]:
!mv data.csv data/

***

## Question 2

How many records are in the dataset?

In [18]:
df = pd.read_csv('data/data.csv')
df.shape

(11914, 16)

***

## Question 3

Who are the most frequent car manufacturers (top-3) according to the dataset?

In [20]:
df.head(2)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650


In [24]:
df['Make'].value_counts(ascending=False).head(3)

Chevrolet     1123
Ford           881
Volkswagen     809
Name: Make, dtype: int64

***

## Question 4

What's the number of unique Audi car models in the dataset?

In [27]:
df_audi = df[df['Make'] == 'Audi']

In [28]:
df_audi['Model'].nunique()

34

In [29]:
df_audi['Model'].unique()

array(['100', '200', '80', '90', 'A3', 'A4 allroad', 'A4', 'A5', 'A6',
       'A7', 'A8', 'allroad quattro', 'allroad', 'Cabriolet', 'Coupe',
       'Q3', 'Q5', 'Q7', 'R8', 'RS 4', 'RS 5', 'RS 6', 'RS 7', 'S3', 'S4',
       'S5', 'S6', 'S7', 'S8', 'SQ5', 'TT RS', 'TT', 'TTS', 'V8'],
      dtype=object)

****

## Question 5

How many columns in the dataset have missing values?

In [36]:
df_missingvalues = df.isnull().sum() 
df_missingvalues

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

In [39]:
df_nmissing = df_missingvalues[df_missingvalues != 0]
df_nmissing.count()

5

***

## Question 6

1. Find the median value of "Engine Cylinders" column in the dataset.

In [41]:
df['Engine Cylinders'].mean()

5.628828677213059

2. Next, calculate the most frequent value of the same "Engine Cylinders".

In [47]:
most_freq_engine_cyc = df['Engine Cylinders'].value_counts(ascending=False).head(1).index[0]
most_freq_engine_cyc

4.0

3. Use the fillna method to fill the missing values in "Engine Cylinders" with the most frequent value from the previous step.

In [49]:
df['Engine Cylinders'].isnull().sum()

30

In [50]:
df['Engine Cylinders'] = df['Engine Cylinders'].fillna(most_freq_engine_cyc)

In [51]:
df['Engine Cylinders'].isnull().sum()

0

In [None]:
4. Now, calculate the median value of "Engine Cylinders" once again.

In [52]:
df['Engine Cylinders'].mean()

5.624727211683734

***

## Question 7

1. Select all the "Lotus" cars from the dataset.

In [96]:
df.head(2)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650


In [97]:
df_lotus = df[df['Make'] == 'Lotus']
df_lotus.head(2)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
3912,Lotus,Elise,2009,premium unleaded (required),189.0,4.0,MANUAL,rear wheel drive,2.0,"Exotic,High-Performance",Compact,Convertible,27,21,613,43995
3913,Lotus,Elise,2009,premium unleaded (required),218.0,4.0,MANUAL,rear wheel drive,2.0,"Exotic,High-Performance",Compact,Convertible,26,20,613,54990


In [98]:
df_lotus.shape

(29, 16)

2. Select only columns "Engine HP", "Engine Cylinders".

In [99]:
df_lotus_2 = df_lotus[["Engine HP", "Engine Cylinders"]]
df_lotus_2

Unnamed: 0,Engine HP,Engine Cylinders
3912,189.0,4.0
3913,218.0,4.0
3914,189.0,4.0
3915,189.0,4.0
3916,218.0,4.0
3917,189.0,4.0
3918,217.0,4.0
3919,217.0,4.0
4216,350.0,8.0
4217,350.0,8.0


3. Now drop all duplicated rows using drop_duplicates method (you should get a dataframe with 9 rows).

In [100]:
df_lotus_3 = df_lotus_2.drop_duplicates()
df_lotus_3.shape

(9, 2)

4. Get the underlying NumPy array. Let's call it X.

In [101]:
X = df_lotus_3.to_numpy()
X

array([[189.,   4.],
       [218.,   4.],
       [217.,   4.],
       [350.,   8.],
       [400.,   6.],
       [276.,   6.],
       [345.,   6.],
       [257.,   4.],
       [240.,   4.]])

5. Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.

In [104]:
XTX = np.matmul(X.T, X)
XTX

array([[7.31684e+05, 1.34100e+04],
       [1.34100e+04, 2.52000e+02]])

6. Compute the inverse of XTX.

In [111]:
from numpy.linalg import inv

iXTX = inv(XTX)
iXTX

array([[ 5.53084235e-05, -2.94319825e-03],
       [-2.94319825e-03,  1.60588447e-01]])

7. Create an array y with values [1100, 800, 750, 850, 1300, 1000, 1000, 1300, 800].

In [112]:
y = [1100, 800, 750, 850, 1300, 1000, 1000, 1300, 800]

8. Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.

In [114]:
w = np.matmul(np.matmul(iXTX, X.T), y)
w

array([  4.59494481, -63.56432501])

9. What's the value of the first element of w?

In [115]:
w[0]

4.594944810094551