# Using Pandas

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<b>load the data from the vehicles.csv file into pandas data frame

In [2]:
## Your Code here
vehicles_df = pd.read_csv('data/vehicles.csv')

First exploration of the dataset:

- How many observations does it have?
- Look at all the columns: do you understand what they mean?
- Look at the raw data: do you see anything weird?
- Look at the data types: are they the expected ones for the information the column contains?

In [3]:
vehicles_df.head(3)
print(vehicles_df.info())
print(f"\nUniquie 'Make': {vehicles_df['Make'].unique()}")
print(f"\nUniquie 'Model': {vehicles_df['Model'].unique()}") #too many values; adding .tolist() shows all
print(vehicles_df['Model'].describe()) #too many indeed (3608)
print(f"\nUnique and sorted 'Year': {sorted(vehicles_df['Year'].unique())}")
print(f"\nUniquie and sorted 'Engine Displacement': {sorted(vehicles_df['Engine Displacement'].unique())}")
print(f"\nUnique and sorted 'Cylinders': {sorted(vehicles_df['Cylinders'].unique())}")
print(f"\nUniquie 'Transmission': {vehicles_df['Transmission'].unique()}")
print(f"\nUniquie 'Drivetrain': {vehicles_df['Drivetrain'].unique()}")
print(f"\nUniquie 'Vehicle Class': {vehicles_df['Vehicle Class'].unique()}")
print(f"\nUniquie 'Fuel Type': {vehicles_df['Fuel Type'].unique()}")
print(f"\nUnique and sorted 'Fuel Barrels/Year': {sorted(vehicles_df['Fuel Barrels/Year'].unique())}")
print(f"\nUnique and sorted 'City MPG': {sorted(vehicles_df['City MPG'].unique())}")
print(f"\nUnique and sorted 'Highway MPG': {sorted(vehicles_df['Highway MPG'].unique())}")
print(f"\nUnique and sorted 'Combined MPG': {sorted(vehicles_df['Combined MPG'].unique())}")
print(f"\nUnique and sorted 'CO2 Emisson Grams/Mile': {sorted(vehicles_df['CO2 Emission Grams/Mile'].unique())}")
print(f"\nUnique and sorted 'Fuel Cost/Year': {sorted(vehicles_df['Fuel Cost/Year'].unique())}\n")
print(vehicles_df[vehicles_df.duplicated(keep=False) == True]) #df has no duplicates

#35952 rows and 15 columns, no null entries, data types seem adequate

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Make                     35952 non-null  object 
 1   Model                    35952 non-null  object 
 2   Year                     35952 non-null  int64  
 3   Engine Displacement      35952 non-null  float64
 4   Cylinders                35952 non-null  float64
 5   Transmission             35952 non-null  object 
 6   Drivetrain               35952 non-null  object 
 7   Vehicle Class            35952 non-null  object 
 8   Fuel Type                35952 non-null  object 
 9   Fuel Barrels/Year        35952 non-null  float64
 10  City MPG                 35952 non-null  int64  
 11  Highway MPG              35952 non-null  int64  
 12  Combined MPG             35952 non-null  int64  
 13  CO2 Emission Grams/Mile  35952 non-null  float64
 14  Fuel Cost/Year        

### Cleaning and wrangling data

- Some car brand names refer to the same brand. Replace all brand names that contain the word "Dutton" for simply "Dutton". If you find similar examples, clean their names too. Use `loc` with boolean indexing.

- Convert CO2 Emissions from Grams/Mile to Grams/Km

- Create a binary column that solely indicates if the transmission of a car is automatic or manual. Use `pandas.Series.str.startswith` and .

- convert MPG columns to km_per_liter

Note:
<br>Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Converting Gallons to Liters

1 Gallon = 3.78541 Liters



In [4]:
#Replace all brand names that contain the word "Dutton" for simply "Dutton"

vehicles_df.loc[vehicles_df['Make'].str.contains('Dutton'), 'Make'] = 'Dutton'
vehicles_df[vehicles_df['Make'].str.contains('Dutton')]

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
11012,Dutton,Funeral Coach,1985,4.1,8.0,Automatic 4-spd,Front-Wheel Drive,Special Purpose Vehicles,Regular,19.388824,15,21,17,522.764706,1950
30164,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,9,11,10,888.7,3350
31754,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,10,11,10,888.7,3350


In [5]:
#If you find similar examples, clean their names too.

vehicles_df.loc[vehicles_df['Make'].str.contains('GMC'), 'Make'] = 'GMC'
vehicles_df.loc[vehicles_df['Make'].str.contains('BMW'), 'Make'] = 'BMW'
vehicles_df.loc[vehicles_df['Make'].str.contains('Saleen'), 'Make'] = 'Saleen'

In [6]:
#Convert CO2 Emissions from Grams/Mile to Grams/Km

print(vehicles_df.loc[0, 'CO2 Emission Grams/Mile'])
vehicles_df['CO2 Emission Grams/Mile'] *= 0.621371
print(vehicles_df.loc[0, 'CO2 Emission Grams/Mile'])

522.7647058823529
324.8308280588235


In [7]:
#Create a binary column that solely indicates if the transmission of a car is automatic or manual.

new_col = np.where(vehicles_df['Transmission'].str.startswith('Auto'), 1, 0)
vehicles_df.insert(6, 'binary', new_col)
vehicles_df.head(3)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,binary,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,324.830828,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.778775,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,1,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,345.132755,2100


In [8]:
#convert MPG columns to km_per_liter

vehicles_df['City MPG'] *= 0.425144
vehicles_df['Highway MPG'] *= 0.425144
vehicles_df['Combined MPG'] *= 0.425144
vehicles_df.head(3)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,binary,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,7.652592,7.227448,7.227448,324.830828,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526872,5.526872,5.526872,424.778775,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,1,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,6.802304,7.227448,6.802304,345.132755,2100


In [9]:
vehicles_df.rename(columns = {'City MPG':'City KPL', 
                              'Highway MPG':'Highway KPL', 
                              'Combined MPG':'Combined KPL', 
                              'CO2 Emission Grams/Mile':'CO2 Emission Grams/Km'},
                  inplace=True)
vehicles_df.head(2)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,binary,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City KPL,Highway KPL,Combined KPL,CO2 Emission Grams/Km,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,7.652592,7.227448,7.227448,324.830828,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526872,5.526872,5.526872,424.778775,2550


### Gathering insights:

- How many car makers are there? How many models? Which car maker has the most cars in the dataset?

- When were these cars made? How big is the engine of these cars?

- What's the frequency of different transmissions, drivetrains and fuel types?

- What's the car that consumes the least/most fuel?

In [10]:
# Your Code here
print(f"\nCar makers: {vehicles_df['Make'].nunique()}")
print(f"\nModels: {vehicles_df['Model'].nunique()}")
print(f"\nCar maker with the most cars: {vehicles_df['Make'].mode()}") #mode
print(f"\nCar maker with the most cars: {vehicles_df['Make'].value_counts().idxmax()}") #value_counts
print(f"\nCars were made between {vehicles_df['Year'].min()} and {vehicles_df['Year'].max()}")
print(f"\nEngine displacement varies between {vehicles_df['Engine Displacement'].min()} and {vehicles_df['Engine Displacement'].max()}")
print(f"\nTransmission frequency:\n{vehicles_df['Transmission'].value_counts()}")
print(f"\nDrivetrain frequency:\n{vehicles_df['Drivetrain'].value_counts()}")
print(f"\nFuel type frequency:\n{vehicles_df['Fuel Type'].value_counts()}")
print(f"\nCar with highest fuel barrels/year:\n{vehicles_df.loc[vehicles_df['Fuel Barrels/Year'].idxmax()]}")



Car makers: 122

Models: 3608

Car maker with the most cars: 0    Chevrolet
Name: Make, dtype: object

Car maker with the most cars: Chevrolet

Cars were made between 1984 and 2017

Engine displacement varies between 0.6 and 8.4

Transmission frequency:
Automatic 4-spd                     10585
Manual 5-spd                         7787
Automatic (S6)                       2631
Automatic 3-spd                      2597
Manual 6-spd                         2423
Automatic 5-spd                      2171
Automatic 6-spd                      1432
Manual 4-spd                         1306
Automatic (S8)                        960
Automatic (S5)                        822
Automatic (variable gear ratios)      675
Automatic 7-spd                       662
Automatic (S7)                        261
Auto(AM-S7)                           256
Automatic 8-spd                       243
Automatic (S4)                        229
Auto(AM7)                             157
Auto(AV-S6)                    

<b> (Optional)

What brand has the worse CO2 Emissions on average?

Hint: use the function `sort_values()`

In [11]:
#vehicles_df.sort_values(by=['CO2 Emission Grams/Km'])
vehicles_df.loc[vehicles_df['CO2 Emission Grams/Km'].idxmax()]

Make                          Lamborghini
Model                            Countach
Year                                 1986
Engine Displacement                   5.2
Cylinders                            12.0
Transmission                 Manual 5-spd
binary                                  0
Drivetrain               Rear-Wheel Drive
Vehicle Class                 Two Seaters
Fuel Type                         Premium
Fuel Barrels/Year               47.087143
City KPL                         2.550864
Highway KPL                       4.25144
Combined KPL                     2.976008
CO2 Emission Grams/Km          788.874868
Fuel Cost/Year                       5800
Name: 20894, dtype: object

Do cars with automatic transmission consume more fuel than cars with manual transmission on average?

In [12]:
automatic = vehicles_df.query('binary == 1')['Fuel Barrels/Year']
manual = vehicles_df.query('binary == 0')['Fuel Barrels/Year']

np.mean(automatic) > np.mean(manual)
print(np.mean(automatic))
print(np.mean(manual))


True

18.04315227297736
16.704904365383218


**round columns with float**

In [14]:
vehicles_df.round({'Fuel Barrels/Year': 2})

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,binary,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City KPL,Highway KPL,Combined KPL,CO2 Emission Grams/Km,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.39,7.652592,7.227448,7.227448,324.830828,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,1,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.35,5.526872,5.526872,5.526872,424.778775,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,1,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.60,6.802304,7.227448,6.802304,345.132755,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,1,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.35,5.526872,5.526872,5.526872,424.778775,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,1,Rear-Wheel Drive,Midsize Cars,Premium,20.60,5.952016,8.928024,6.802304,345.132755,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),1,Rear-Wheel Drive,Two Seaters,Premium,9.16,14.454896,16.155472,15.305184,151.614524,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),1,Rear-Wheel Drive,Two Seaters,Premium,9.16,14.454896,16.155472,15.305184,150.993153,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),1,Rear-Wheel Drive,Two Seaters,Premium,9.16,14.454896,16.155472,15.305184,151.614524,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),1,Rear-Wheel Drive,Two Seaters,Premium,9.16,14.454896,16.580616,15.305184,152.857266,1100
