## Introduction

### Import packages

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import wget

### Q1. Print pandas version

In [3]:
## Report the installed pandas version
pandas_version = pd.__version__
print(f"Pandas version is: {pandas_version}")

Pandas version is: 2.2.3


### Q2. Read in data needed

In [5]:
## Source CSV, and the local file name it is saved under (last URL path segment)
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
filename = url.rsplit("/", 1)[-1]

## wget.download() never overwrites an existing file: a repeat run would save
## a numbered copy such as "car_fuel_efficiency (1).csv" and hit the network
## again. Download only when the file is not already on disk.
if Path(filename).exists():
    print(f"Using existing file: {filename}")
else:
    filename = wget.download(url)
    print(f"Downloaded file: {filename}")

Downloaded file: car_fuel_efficiency.csv


In [6]:
## Load the CSV from the path stored in `filename` by the download cell rather
## than a hard-coded copy of that name, so the two cannot drift apart
df = pd.read_csv(filename)

### Preview the data

In [7]:
## show the first 10 rows of the dataset
df.head(10)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
5,190,3.0,,2484.883986,14.7,2008,Europe,Gasoline,All-wheel drive,-1.0,17.271818
6,240,7.0,127.0,3006.542287,22.2,2012,USA,Gasoline,Front-wheel drive,1.0,13.210412
7,150,4.0,239.0,3638.65778,17.3,2020,USA,Diesel,All-wheel drive,1.0,12.848884
8,250,1.0,174.0,2714.21931,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
9,150,4.0,123.0,3509.036569,10.2,2005,USA,Gasoline,Front-wheel drive,-1.0,12.298355


In [8]:
## summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [10]:
## Dataset dimensions: DataFrame.shape is (row count, column count)
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns in the dataset.")

There are 9704 rows and 11 columns in the dataset.


### Q3. Fuel types breakdown

In [11]:
## number of cars per fuel type, most frequent first
fuel_types = df['fuel_type']
fuel_types.value_counts()

fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64

In [12]:
## count the distinct fuel types
fuel_type_count = df['fuel_type'].nunique()
print(f"There are {fuel_type_count} unique fuel types in the dataset.")

There are 2 unique fuel types in the dataset.


### Q4. Missing value breakdown

In [13]:
## number of missing values in each column
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [23]:
## True for every column that contains at least one missing value
df.isna().any()

engine_displacement    False
num_cylinders           True
horsepower              True
vehicle_weight         False
acceleration            True
model_year             False
origin                 False
fuel_type              False
drivetrain             False
num_doors               True
fuel_efficiency_mpg    False
dtype: bool

### Q5. Get the maximum fuel efficiency of cars from Asia

In [None]:
## maximum fuel efficiency per region of origin; the Asia row answers Q5
max_mpg_by_origin = df.groupby('origin')['fuel_efficiency_mpg'].max()
max_mpg_by_origin.reset_index()

Unnamed: 0,origin,fuel_efficiency_mpg
0,Asia,23.759123
1,Europe,25.967222
2,USA,24.971452


### Q6. Get the median value of horsepower

In [30]:
## median horsepower before any missing values are filled
df['horsepower'].median()

149.0

### Get most common value of horsepower

In [36]:
## most frequent horsepower value(s); mode() returns a Series
df['horsepower'].mode()

0    152.0
Name: horsepower, dtype: float64

In [37]:
## mode() returns a Series, so keep its entry labelled 0 as the scalar mode
horsepower_modes = df['horsepower'].mode()
hp_mode = horsepower_modes[0]

In [38]:
## replace NAs with the mode value
## NOTE: this overwrites df['horsepower'], so re-running the earlier median
## cell after this one reports the post-fill median instead of the raw one
df['horsepower'] = df['horsepower'].fillna(hp_mode)

In [39]:
## recalculate the median now that the NAs have been filled with the mode
df['horsepower'].median()

152.0

In [43]:
## keep only the rows for cars from Asia, restricted to the
## vehicle_weight and model_year columns
is_asia = df['origin'] == 'Asia'
asia_cars_df = df.loc[is_asia, ['vehicle_weight', 'model_year']]

In [45]:
## show only the first 7 records of asia_cars_df
asia_cars_df.head(7)

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


### Q7. Sum of Weights - Matrix Multiplication

In [47]:
## take the first 7 Asia rows and expose them as a plain numpy array (X)
first_seven_asia = asia_cars_df.iloc[:7]
underlying_array = first_seven_asia.to_numpy()

In [48]:
## X^T X: matrix product of the transpose of X with X
result = underlying_array.T.dot(underlying_array)

In [49]:
## display the 2x2 X^T X matrix
result

array([[62248334.33150762, 41431216.5073268 ],
       [41431216.5073268 , 28373339.        ]])

In [50]:
## invert X^T X; the inverse is only displayed here, not stored, and is
## computed again where w is built
np.linalg.inv(result)

array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [51]:
## create the test array y with the values
## [1100, 1300, 800, 900, 1000, 1100, 1200]
test_array = np.array(
    [1100, 1300, 800, 900, 1000, 1100, 1200]
)

In [52]:
## display test_array to confirm its contents
test_array

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [53]:
## w = (X^T X)^-1 X^T y: invert X^T X, multiply by the transpose of X,
## then multiply that product by y (test_array)
xtx_inverse = np.linalg.inv(result)
w = np.dot(np.dot(xtx_inverse, underlying_array.T), test_array)

In [54]:
## display the weight vector w
w

array([0.01386421, 0.5049067 ])

In [55]:
## sum of all the weights in w
w.sum()

np.float64(0.5187709081074025)