# Data Manipulation Part 2 - Quick Review

In [1]:
import numpy as np

## Mask

In [17]:
x = np.random.randint(0, 50, size=(4,4))

In [4]:
x

array([[16, 39, 15, 33],
       [ 9, 23,  6, 40],
       [14, 20, 38, 41],
       [29, 49,  2, 12]])

In [5]:
x.shape

(4, 4)

In [9]:
x[3,3]

12

In [13]:
x_mean = x.mean()
x_mean

24.125

In [15]:
for row in range(x.shape[0]):
    for col in range(x.shape[1]):
        if x[row, col] < x_mean:
            x[row, col] = 0


In [18]:
x = np.random.randint(0, 50, size=(4,4))
x

In [20]:
x_mean = x.mean()

In [22]:
x_mean

19.375

In [24]:
x * 5

array([[ 10,  80,  15, 145],
       [110, 110, 240,  15],
       [205,  35,  75, 160],
       [140,  95,  55,  60]])

In [25]:
mask = (x < x_mean)

mask

array([[ True,  True,  True, False],
       [False, False, False,  True],
       [False,  True,  True, False],
       [False,  True,  True,  True]])

In [27]:
x[mask] = 0

In [28]:
x

array([[ 0,  0,  0, 29],
       [22, 22, 48,  0],
       [41,  0,  0, 32],
       [28,  0,  0,  0]])

In [29]:
import pandas as pd

In [30]:
vehicles_review = pd.read_csv('vehicles/vehicles.csv')

In [31]:
vehicles_review.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


## Group by


Average Engine Displacement for each number of Cylinder.

In [32]:
vehicles_review.head(2)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [34]:
vehicles_review.groupby(by='Cylinders').mean()[['Engine Displacement','Combined MPG']]

Unnamed: 0_level_0,Engine Displacement,Combined MPG
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,1.239583,19.104167
3.0,1.052239,36.572139
4.0,2.06657,24.075441
5.0,2.636653,20.334716
6.0,3.439342,18.606189
8.0,5.222581,15.206302
10.0,5.911765,13.941176
12.0,5.907473,13.014235
16.0,8.0,10.0


In [35]:
vehicles_review.groupby(by='Cylinders').mean()[['Engine Displacement','Combined MPG']].reset_index()

Unnamed: 0,Cylinders,Engine Displacement,Combined MPG
0,2.0,1.239583,19.104167
1,3.0,1.052239,36.572139
2,4.0,2.06657,24.075441
3,5.0,2.636653,20.334716
4,6.0,3.439342,18.606189
5,8.0,5.222581,15.206302
6,10.0,5.911765,13.941176
7,12.0,5.907473,13.014235
8,16.0,8.0,10.0


Min, Max, Average values of CO2 Emission Grams/Mile per year per Fuel Type.

In [36]:
vehicles_review.groupby(by=['Year','Fuel Type']).agg(['min','max','mean'])['CO2 Emission Grams/Mile']

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,mean
Year,Fuel Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984,Diesel,328.387097,636.250000,493.678441
1984,Regular,329.148148,987.444444,582.790195
1985,Diesel,254.500000,678.666667,450.229221
1985,Premium,370.291667,740.583333,468.181967
1985,Regular,206.674419,1110.875000,492.396110
...,...,...,...,...
2017,Premium and Electricity,189.000000,248.000000,222.250000
2017,Premium or E85,344.000000,457.000000,385.400000
2017,Regular,158.000000,716.000000,384.184697
2017,Regular Gas and Electricity,104.000000,112.000000,108.000000


In [41]:
vehicles_review.groupby(by=['Fuel Type', 'Year']).agg(['min','max','mean'])['CO2 Emission Grams/Mile'].loc['Regular',:].style.format("{:.2f}").highlight_max()

Unnamed: 0_level_0,min,max,mean
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1984,329.15,987.44,582.79
1985,206.67,1110.88,492.4
1986,185.15,1110.88,492.35
1987,189.09,1110.88,494.72
1988,189.09,987.44,490.02
1989,189.09,987.44,493.92
1990,189.09,987.44,492.94
1991,189.09,888.7,495.34
1992,189.09,888.7,492.21
1993,189.09,888.7,483.65


In [46]:
vehicles_review.groupby(by=['Year','Fuel Type']).agg(min_emission='min',
                                                     max_emission='max',
                                                     mean_emission='mean')

TypeError: Must provide 'func' or tuples of '(column, aggfunc).

In [47]:
vehicles_review.groupby(by=['Year','Fuel Type'])['CO2 Emission Grams/Mile']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x11cc70970>

In [None]:
#named aggregation
vehicles_review.groupby(by=['Year','Fuel Type'])['CO2 Emission Grams/Mile'].agg(min_emission='min',
                                                                                 max_emission='max',
                                                                                 mean_emission='mean')

In [55]:
#named aggregation
grouped_results = vehicles_review.groupby(by=['Year','Fuel Type']).agg(min_emission=('CO2 Emission Grams/Mile', 'min'),
                                                     max_emission=('CO2 Emission Grams/Mile', 'max'),
                                                     max_cylinders =('Cylinders', 'max'), )

grouped_results

Unnamed: 0_level_0,Unnamed: 1_level_0,min_emission,max_emission,max_cylinders
Year,Fuel Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984,Diesel,328.387097,636.250000,8.0
1984,Regular,329.148148,987.444444,8.0
1985,Diesel,254.500000,678.666667,8.0
1985,Premium,370.291667,740.583333,8.0
1985,Regular,206.674419,1110.875000,12.0
...,...,...,...,...
2017,Premium and Electricity,189.000000,248.000000,4.0
2017,Premium or E85,344.000000,457.000000,6.0
2017,Regular,158.000000,716.000000,8.0
2017,Regular Gas and Electricity,104.000000,112.000000,4.0


## Pandas Merge

Bring those two informations back.

In [54]:
vehicles_review.head(2)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [56]:
grouped_results.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,min_emission,max_emission,max_cylinders
Year,Fuel Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984,Diesel,328.387097,636.25,8.0
1984,Regular,329.148148,987.444444,8.0


In [57]:
vehicles_review = pd.merge(left=vehicles_review, right=grouped_results, on=['Year','Fuel Type'], how='inner')

In [58]:
vehicles_review.head(2)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,min_emission,max_emission,max_cylinders
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,329.148148,987.444444,8.0
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,329.148148,987.444444,8.0


## Concatenate dataframes

In [59]:
amostra1 = vehicles_review.sample(3)

amostra2 = vehicles_review.sample(3)
amostra2

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,min_emission,max_emission,max_cylinders
11491,Ford,Aerostar Wagon,1990,4.0,6.0,Automatic 4-spd,Rear-Wheel Drive,Vans,Regular,20.600625,15,20,16,555.4375,2100,189.085106,987.444444,12.0
6540,Chevrolet,Monte Carlo,2005,3.4,6.0,Automatic 4-spd,Front-Wheel Drive,Midsize Cars,Regular,14.982273,19,29,22,403.954545,1500,170.903846,807.909091,8.0
28484,Pontiac,G5 GT,2010,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,12.677308,23,32,26,341.807692,1300,177.74,634.785714,8.0


In [60]:
amostra1

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,min_emission,max_emission,max_cylinders
9522,Dodge,Conquest,1985,2.6,4.0,Automatic 4-spd,Rear-Wheel Drive,Subcompact Cars,Premium,17.347895,17,22,19,467.736842,2150,370.291667,740.583333,8.0
25775,Mitsubishi,Nativa 2WD(Puerto Rico Only),2003,3.0,6.0,Automatic 4-spd,Rear-Wheel Drive,Sport Utility Vehicle - 2WD,Regular,18.311667,16,21,18,493.722222,1850,167.679245,683.615385,8.0
26079,Nissan,300ZX,1986,3.0,6.0,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,17.347895,16,23,19,467.736842,1750,185.145833,1110.875,12.0


In [66]:
pd.concat([amostra1, amostra2], ).reset_index(drop=True)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,min_emission,max_emission,max_cylinders
0,Dodge,Conquest,1985,2.6,4.0,Automatic 4-spd,Rear-Wheel Drive,Subcompact Cars,Premium,17.347895,17,22,19,467.736842,2150,370.291667,740.583333,8.0
1,Mitsubishi,Nativa 2WD(Puerto Rico Only),2003,3.0,6.0,Automatic 4-spd,Rear-Wheel Drive,Sport Utility Vehicle - 2WD,Regular,18.311667,16,21,18,493.722222,1850,167.679245,683.615385,8.0
2,Nissan,300ZX,1986,3.0,6.0,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,17.347895,16,23,19,467.736842,1750,185.145833,1110.875,12.0
3,Ford,Aerostar Wagon,1990,4.0,6.0,Automatic 4-spd,Rear-Wheel Drive,Vans,Regular,20.600625,15,20,16,555.4375,2100,189.085106,987.444444,12.0
4,Chevrolet,Monte Carlo,2005,3.4,6.0,Automatic 4-spd,Front-Wheel Drive,Midsize Cars,Regular,14.982273,19,29,22,403.954545,1500,170.903846,807.909091,8.0
5,Pontiac,G5 GT,2010,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,12.677308,23,32,26,341.807692,1300,177.74,634.785714,8.0


# Data Cleaning
> Objectives:
    - Given a messy dataset - extract meaningful information from it. 
    - Learn from the data.

In [None]:
import pandas as pd
import numpy as np

In [68]:
vehicles = pd.read_csv('vehicles/vehicles_messy.csv', low_memory=False)

In [69]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37843 entries, 0 to 37842
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        37843 non-null  float64
 1   barrelsA08       37843 non-null  float64
 2   charge120        37843 non-null  float64
 3   charge240        37843 non-null  float64
 4   city08           37843 non-null  int64  
 5   city08U          37843 non-null  float64
 6   cityA08          37843 non-null  int64  
 7   cityA08U         37843 non-null  float64
 8   cityCD           37843 non-null  float64
 9   cityE            37843 non-null  float64
 10  cityUF           37843 non-null  float64
 11  co2              37843 non-null  int64  
 12  co2A             37843 non-null  int64  
 13  co2TailpipeAGpm  37843 non-null  float64
 14  co2TailpipeGpm   37843 non-null  float64
 15  comb08           37843 non-null  int64  
 16  comb08U          37843 non-null  float64
 17  combA08     

In [70]:
vehicles.shape

(37843, 83)

In [71]:
vehicles.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [72]:
# display more columns on the dataframe visualization.
pd.options.display.max_columns = 99
# usar com parcimonia.

In [73]:
vehicles.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [74]:
# create a copy of our dataset just for backup.
vehicles_bkp = vehicles.copy()

In [79]:
# Describing our dataset.
vehicles.describe()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,engId,feScore,fuelCost08,fuelCostA08,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
count,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37720.0,37723.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0,37843.0
mean,17.532506,0.216169,0.0,0.023531,17.941389,4.042737,0.520149,0.327163,0.000406,0.18479,0.000706,61.503713,5.349919,17.771433,473.179736,20.195809,4.549751,0.581191,0.359027,0.190575,0.000335,0.000691,5.73799,3.31818,8860.308961,-0.035145,1882.06009,79.070105,-0.037206,-0.92963,24.104881,5.438467,0.686891,0.418698,0.000233,0.198247,0.000673,2.026214,10.424332,19019.286235,1.849219,6.165658,13.696113,33.769125,0.392675,0.352618,0.0368,0.343399,0.033475,22.587229,0.65238,33.619221,0.933845,2000.064398,-2658.999022,0.00436,0.069313,0.068203,0.068573
std,4.57595,1.141527,0.0,0.427647,6.66036,9.64582,3.837874,3.542596,0.039918,2.904558,0.019458,153.387715,55.539497,94.129283,122.188847,6.623444,10.389994,3.966255,3.58926,2.979567,0.035284,0.019049,1.751795,1.361399,17829.683477,2.379584,510.280408,417.66858,2.374338,0.627565,6.963192,11.936327,4.308934,3.808877,0.029564,3.091862,0.018583,5.947989,28.14877,11034.784855,4.429983,9.743297,31.26993,45.914462,8.251191,8.049082,1.15827,8.171939,1.032642,9.350163,5.284547,10.048326,6.059456,10.390588,2553.098329,0.142776,1.966806,1.871986,1.913647
min,0.06,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,-1.0,550.0,0.0,-1.0,-1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1984.0,-22250.0,0.0,0.0,0.0,0.0
25%,14.33087,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,388.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.2,0.0,-1.0,1500.0,0.0,-1.0,-1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9461.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,27.1,0.0,1990.0,-4250.0,0.0,0.0,0.0,0.0
50%,17.347895,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,467.736842,19.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,211.0,-1.0,1850.0,0.0,-1.0,-1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18923.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,33.0,0.0,2001.0,-2500.0,0.0,0.0,0.0,0.0
75%,20.600625,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,555.4375,23.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,4.3,4505.0,-1.0,2200.0,0.0,-1.0,-1.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28570.5,0.0,13.0,0.0,91.0,0.0,0.0,0.0,0.0,0.0,25.1393,0.0,38.1096,0.0,2009.0,-750.0,0.0,0.0,0.0,0.0
max,47.087143,18.311667,0.0,12.0,138.0,138.304,127.0,127.093,5.35,122.0,0.849,847.0,719.0,719.0,1269.571429,124.0,124.3601,117.0,116.9364,121.0,4.8,0.834,16.0,8.4,69102.0,10.0,5800.0,3800.0,10.0,8.0,111.0,111.37,107.0,106.531,4.06,120.0,0.813,49.0,195.0,38173.0,41.0,55.0,194.0,192.0,315.0,305.9,77.504,346.9,65.632,197.5771,181.5609,159.1,152.1878,2017.0,4000.0,7.0,97.0,79.0,88.0


In [81]:
vehicles.describe().loc[['mean','std'], :'cityCD']

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD
mean,17.532506,0.216169,0.0,0.023531,17.941389,4.042737,0.520149,0.327163,0.000406
std,4.57595,1.141527,0.0,0.427647,6.66036,9.64582,3.837874,3.542596,0.039918


# Data types of our dataset

In [82]:
vehicles.dtypes

barrels08     float64
barrelsA08    float64
charge120     float64
charge240     float64
city08          int64
               ...   
modifiedOn     object
startStop      object
phevCity        int64
phevHwy         int64
phevComb        int64
Length: 83, dtype: object

In [84]:
vehicles.dtypes == 'float64'

barrels08      True
barrelsA08     True
charge120      True
charge240      True
city08        False
              ...  
modifiedOn    False
startStop     False
phevCity      False
phevHwy       False
phevComb      False
Length: 83, dtype: bool

In [86]:
vehicles.dtypes == 'float64'

barrels08      True
barrelsA08     True
charge120      True
charge240      True
city08        False
              ...  
modifiedOn    False
startStop     False
phevCity      False
phevHwy       False
phevComb      False
Length: 83, dtype: bool

In [109]:
mask = (vehicles.dtypes == 'object')

In [110]:
selected_columns = vehicles.dtypes.loc[mask].index
selected_columns

Index(['drive', 'eng_dscr', 'fuelType', 'fuelType1', 'make', 'model',
       'mpgData', 'trany', 'VClass', 'guzzler', 'trans_dscr', 'tCharger',
       'sCharger', 'atvType', 'fuelType2', 'rangeA', 'evMotor', 'mfrCode',
       'c240Dscr', 'c240bDscr', 'createdOn', 'modifiedOn', 'startStop'],
      dtype='object')

In [111]:
vehicles.loc[:, selected_columns]

Unnamed: 0,drive,eng_dscr,fuelType,fuelType1,make,model,mpgData,trany,VClass,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,c240bDscr,createdOn,modifiedOn,startStop
0,Rear-Wheel Drive,(FFS),Regular,Regular Gasoline,Alfa Romeo,Spider Veloce 2000,Y,Manual 5-spd,Two Seaters,,,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
1,Rear-Wheel Drive,(GUZZLER),Regular,Regular Gasoline,Ferrari,Testarossa,N,Manual 5-spd,Two Seaters,T,,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
2,Front-Wheel Drive,(FFS),Regular,Regular Gasoline,Dodge,Charger,Y,Manual 5-spd,Subcompact Cars,,SIL,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
3,Rear-Wheel Drive,,Regular,Regular Gasoline,Dodge,B150/B250 Wagon 2WD,N,Automatic 3-spd,Vans,,,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
4,4-Wheel or All-Wheel Drive,"(FFS,TRBO)",Premium,Premium Gasoline,Subaru,Legacy AWD Turbo,N,Manual 5-spd,Compact Cars,,,T,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,Front-Wheel Drive,(FFS),Regular,Regular Gasoline,Subaru,Legacy,N,Automatic 4-spd,Compact Cars,,CLKUP,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
37839,Front-Wheel Drive,(FFS),Regular,Regular Gasoline,Subaru,Legacy,N,Manual 5-spd,Compact Cars,,,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
37840,4-Wheel or All-Wheel Drive,(FFS),Regular,Regular Gasoline,Subaru,Legacy AWD,Y,Automatic 4-spd,Compact Cars,,CLKUP,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,
37841,4-Wheel or All-Wheel Drive,(FFS),Regular,Regular Gasoline,Subaru,Legacy AWD,N,Manual 5-spd,Compact Cars,,,,,,,,,,,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,


## What if I wanted to select only the 'float64' columns?

In [None]:
selected_float_columns = []

for each_column in vehicles.columns:
    if vehicles[each_column].dtypes == 'float64':
        selected_float_columns.append(each_column)
        
vehicles.loc[:, selected_float_columns]

In [None]:
vehicles

In [None]:
## Convert to float ...

In [None]:
vehicles[selected_float_columns][['barrels08']].astype(int)

In [None]:
vehicles.shape

In [101]:
vehicles._get_numeric_data()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,engId,feScore,fuelCost08,fuelCostA08,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,9011,-1,1600,0,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,False,0,0,0,0.0,0.0,0.0,0.0,23.3333,0.0,35.0000,0.0,1985,-1250,0.0,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,22020,-1,3050,0,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,False,0,0,0,0.0,0.0,0.0,0.0,11.0000,0.0,19.0000,0.0,1985,-8500,0.0,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,2100,-1,1250,0,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,False,0,0,0,0.0,0.0,0.0,0.0,29.0000,0.0,47.0000,0.0,1985,500,0.0,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,2850,-1,3050,0,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,False,0,0,0,0.0,0.0,0.0,0.0,12.2222,0.0,16.6667,0.0,1985,-8500,0.0,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66031,-1,2150,0,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,False,0,90,0,0.0,0.0,0.0,0.0,21.0000,0.0,32.0000,0.0,1993,-4000,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66030,-1,1500,0,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,False,0,90,0,0.0,0.0,0.0,0.0,24.0000,0.0,37.0000,0.0,1993,-750,0.0,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66030,-1,1450,0,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,False,0,90,0,0.0,0.0,0.0,0.0,25.0000,0.0,39.0000,0.0,1993,-500,0.0,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66030,-1,1600,0,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,False,0,90,0,0.0,0.0,0.0,0.0,23.0000,0.0,34.0000,0.0,1993,-1250,0.0,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,66030,-1,1600,0,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,False,0,90,0,0.0,0.0,0.0,0.0,23.0000,0.0,34.0000,0.0,1993,-1250,0.0,0,0,0


In [106]:
pd.DataFrame([(1,(2,3))])._get_numeric_data()

Unnamed: 0,0
0,1


# Obtaining some metadata from our dataframe.

In [107]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37843 entries, 0 to 37842
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        37843 non-null  float64
 1   barrelsA08       37843 non-null  float64
 2   charge120        37843 non-null  float64
 3   charge240        37843 non-null  float64
 4   city08           37843 non-null  int64  
 5   city08U          37843 non-null  float64
 6   cityA08          37843 non-null  int64  
 7   cityA08U         37843 non-null  float64
 8   cityCD           37843 non-null  float64
 9   cityE            37843 non-null  float64
 10  cityUF           37843 non-null  float64
 11  co2              37843 non-null  int64  
 12  co2A             37843 non-null  int64  
 13  co2TailpipeAGpm  37843 non-null  float64
 14  co2TailpipeGpm   37843 non-null  float64
 15  comb08           37843 non-null  int64  
 16  comb08U          37843 non-null  float64
 17  combA08     

In [108]:
vehicles._get_bool_data()

Unnamed: 0,phevBlended
0,False
1,False
2,False
3,False
4,False
...,...
37838,False
37839,False
37840,False
37841,False


# Null (or Missing) values

>    - Count 
>    - <b>Mask</b> concept

In [112]:
vehicles.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [116]:
vehicles.isna()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37841,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False


In [117]:
vehicles.isnull()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False
37841,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,False,False,False


In [120]:
mask = vehicles['c240Dscr'].isna()

In [124]:
vehicles.shape

(37843, 83)

In [125]:
vehicles['c240Dscr'].loc[mask].shape

(37806,)

In [128]:
vehicles['c240Dscr'].loc[~mask]

25683      single charger
25684      single charger
25895      3.6 kW charger
25955      single charger
27142      3.6 kW charger
27226    standard charger
27227    standard charger
27384      3.6 kW charger
28560    standard charger
28575    standard charger
28591    standard charger
28592    standard charger
28600    standard charger
28720    standard charger
29443    standard charger
29444    standard charger
29497      7.2 kW charger
29652    standard charger
29654    standard charger
29750      3.6 kW charger
29751      6.6 kW charger
29933    standard charger
29934    standard charger
29935    standard charger
29936    standard charger
29938    standard charger
29939    standard charger
29941    standard charger
29942    standard charger
29943    standard charger
30141    standard charger
30142    standard charger
30143    standard charger
30972    standard charger
30973    standard charger
30974    standard charger
30975    standard charger
Name: c240Dscr, dtype: object

In [130]:
vehicles['c240Dscr'].isna().sum()

37806

In [134]:
(~vehicles['c240Dscr'].isna()).sum()

37

In [136]:
vehicles['c240Dscr'].isna().mean()

0.9990222762465978

In [139]:
(~vehicles['c240Dscr'].isna()).mean()

0.0009777237534022143

In [140]:
vehicles['c240Dscr'].isna().sum()/vehicles.shape[0]

0.9990222762465978

In [141]:
vehicles['c240Dscr'].isna().mean()

0.9990222762465978

## Let's select the mfrCode column and see how many missing values this column has.

In [142]:
vehicles['mfrCode'].isnull()

0        True
1        True
2        True
3        True
4        True
         ... 
37838    True
37839    True
37840    True
37841    True
37842    True
Name: mfrCode, Length: 37843, dtype: bool

In [143]:
# How to count it?


# How to get the percentage of missing values?

In [144]:
vehicles['mfrCode'].isnull().sum()

30818

In [145]:
vehicles['barrels08'].isnull().sum()

0

In [146]:
vehicles['mfrCode'].isnull().mean()

0.8143646116851201

## What if I wanted to count the number of missing values for each column?

In [150]:
vehicles.isna().mean()

barrels08     0.000000
barrelsA08    0.000000
charge120     0.000000
charge240     0.000000
city08        0.000000
                ...   
modifiedOn    0.000000
startStop     0.837804
phevCity      0.000000
phevHwy       0.000000
phevComb      0.000000
Length: 83, dtype: float64

In [151]:
vehicles.isnull().sum()

barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31705
phevCity          0
phevHwy           0
phevComb          0
Length: 83, dtype: int64

## What if we wanted to count Null values for each row?
    - axis = 1 
    


In [152]:
vehicles.isnull().sum(axis=1)

0        12
1        11
2        11
3        13
4        11
         ..
37838    11
37839    12
37840    11
37841    12
37842    10
Length: 37843, dtype: int64

In [153]:
vehicles.isnull().mean(axis=1)

0        0.144578
1        0.132530
2        0.132530
3        0.156627
4        0.132530
           ...   
37838    0.132530
37839    0.144578
37840    0.132530
37841    0.144578
37842    0.120482
Length: 37843, dtype: float64

## Mask - 2

In [170]:
mask = (vehicles.isna().mean() < 0.8)
mask

barrels08      True
barrelsA08     True
charge120      True
charge240      True
city08         True
              ...  
modifiedOn     True
startStop     False
phevCity       True
phevHwy        True
phevComb       True
Length: 83, dtype: bool

In [173]:
selected_columns = mask.loc[mask].index
selected_columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'trans_dscr',
       'charge240b', 'createdOn', 'modifiedOn', 'phevCity', 'phevHwy',
       'phevComb'],
      dtype='object')

In [162]:
vehicles

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [167]:
mask.mean()

0.8674698795180723

In [169]:
vehicles.loc[:, ~mask]

Unnamed: 0,guzzler,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,c240bDscr,startStop
0,,,,,,,,,,,
1,T,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,T,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
37838,,,,,,,,,,,
37839,,,,,,,,,,,
37840,,,,,,,,,,,
37841,,,,,,,,,,,


In [176]:
vehicles.isna().any().sum()

17

In [178]:
vehicles.isna().all().sum()

0

In [181]:
vehicles.isna().any(axis=1).mean()

1.0

In [182]:
vehicles.isna().all(axis=1).mean()

0.0

## What if I wanted to select only the rows in which there are at least one missing value (in any column)?

In [183]:
# this will return True if there is any missing in each COLUMN. 
vehicles.isnull().any() 

# How can I check it for each rows? What is the syntax for checking for rows instead of columns?

barrels08     False
barrelsA08    False
charge120     False
charge240     False
city08        False
              ...  
modifiedOn    False
startStop      True
phevCity      False
phevHwy       False
phevComb      False
Length: 83, dtype: bool

In [184]:
vehicles.isnull().any(axis=1)

0        True
1        True
2        True
3        True
4        True
         ... 
37838    True
37839    True
37840    True
37841    True
37842    True
Length: 37843, dtype: bool

# Dropping columns
> `axis=1` or

> `columns = ['name_of_column_to_drop1', 'name_of_column_to_drop2' ,...]`

In [185]:
vehicles['modifiedOn']

0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
37838    Tue Jan 01 00:00:00 EST 2013
37839    Tue Jan 01 00:00:00 EST 2013
37840    Tue Jan 01 00:00:00 EST 2013
37841    Tue Jan 01 00:00:00 EST 2013
37842    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 37843, dtype: object

In [186]:
vehicles.drop(columns=['modifiedOn'])

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [187]:
vehicles.drop(columns=['modifiedOn','barrels08'])

Unnamed: 0,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,startStop,phevCity,phevHwy,phevComb
0,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [202]:
vehicles = vehicles_bkp.copy()

In [195]:
vehicles

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [222]:
y = vehicles.drop(columns=['modifiedOn','barrels08'])

In [223]:
del y

In [197]:
vehicles.head(2)

Unnamed: 0,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,startStop,phevCity,phevHwy,phevComb
0,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [201]:
vehicles.drop(columns=['modifiedOn','barrels08'], inplace=True)

KeyError: "['modifiedOn' 'barrels08'] not found in axis"

In [192]:
vehicles

Unnamed: 0,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,startStop,phevCity,phevHwy,phevComb
0,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [None]:
vehicles.drop('modifiedOn', axis=1)

In [None]:
vehicles.drop(columns='modifiedOn')

In [None]:
vehicles.drop(columns='modifiedOn', inplace=True)

# Drop columns based on condition
>    - Pandas Series

>    - Pandas Indexing

>    - Mask concept

## Lets store the number of missing values in a variable, and based on the number of missing values that column has, lets remove it or not

In [224]:
vehicles.isnull().sum()

barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31705
phevCity          0
phevHwy           0
phevComb          0
Length: 83, dtype: int64

In [225]:
n_missings = vehicles.isnull().sum()

In [226]:
n_missings

barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31705
phevCity          0
phevHwy           0
phevComb          0
Length: 83, dtype: int64

In [None]:
vehicles.shape

### create a condition in which you get True for the indexes that have more than 10000 columns.

In [227]:
n_missings > 10000

barrels08     False
barrelsA08    False
charge120     False
charge240     False
city08        False
              ...  
modifiedOn    False
startStop      True
phevCity      False
phevHwy       False
phevComb      False
Length: 83, dtype: bool

### select from the n_missings pandas Series the ones that returned True.

In [228]:
n_missings.loc[n_missings > 10000]

eng_dscr      15403
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

### get the index of that pandas series.

In [229]:
n_missings.loc[n_missings > 10000].keys()

Index(['eng_dscr', 'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType',
       'fuelType2', 'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'c240bDscr',
       'startStop'],
      dtype='object')

In [230]:
n_missings.loc[n_missings > 10000].index

Index(['eng_dscr', 'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType',
       'fuelType2', 'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'c240bDscr',
       'startStop'],
      dtype='object')

In [231]:
### store it in a variable called `columns_to_drop`, for example

columns_to_drop = n_missings.loc[n_missings > 10000].index

In [232]:
columns_to_drop = list(columns_to_drop)

In [233]:
vehicles.drop(columns=columns_to_drop)


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,charge240b,createdOn,modifiedOn,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,-1,1600,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0000,0.0,Two Seaters,1985,-1250,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,-1,3050,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0000,0.0,19.0000,0.0,Two Seaters,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,47.0000,0.0,Subcompact Cars,1985,500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,-1,3050,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0000,0.0,32.0000,0.0,Compact Cars,1993,-4000,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,-1,1500,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0000,0.0,37.0000,0.0,Compact Cars,1993,-750,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0000,0.0,39.0000,0.0,Compact Cars,1993,-500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,-1,1600,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,34.0000,0.0,Compact Cars,1993,-1250,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0


# Understand the data (!)
>    - Deep dive into data
>    - <b>Understand the business</b> you're working with
>    - Understand how (and if) you can input those values as more meaningful information
>    - Fill not a number

## Lets get all columns where the displacement is missing.

In [234]:
# passo a passo
vehicles['displ']

0        2.0
1        4.9
2        2.2
3        5.2
4        2.2
        ... 
37838    2.2
37839    2.2
37840    2.2
37841    2.2
37842    2.2
Name: displ, Length: 37843, dtype: float64

In [235]:
# get a mask of which rows are missing
vehicles['displ'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
37838    False
37839    False
37840    False
37841    False
37842    False
Name: displ, Length: 37843, dtype: bool

In [236]:
# create a new dataframe for those rows where the displacement is missing.

In [237]:
condition = vehicles['displ'].isnull()
vehicles.loc[condition, :]

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
7138,0.240,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,85,0.0000,0,0.0,40.0000,0.0,0.0,,,,0,,-1,800,0,Electricity,Electricity,-1,-1,91,0.0000,0,0.0,0.0,37.0000,0.0,0,0,16423,0,0,Nissan,Altra EV,N,False,0,0,90,0.0,0.0,0.0,0.0,,116.2069,0.0,129.6154,0.0,Midsize Station Wagons,2000,2750,,,,,EV,,,62 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
7139,0.282,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,72,0.0000,0,0.0,47.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,900,0,Electricity,Electricity,-1,-1,64,0.0000,0,0.0,0.0,53.0000,0.0,0,0,16424,0,0,Toyota,RAV4 EV,N,False,0,0,88,0.0,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,2000,2250,,,,,EV,,,50 KW DC,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8143,0.282,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,72,0.0000,0,0.0,47.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,900,0,Electricity,Electricity,-1,-1,64,0.0000,0,0.0,0.0,53.0000,0.0,0,0,17328,0,0,Toyota,RAV4 EV,N,False,0,0,88,0.0,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,2001,2250,,,,,EV,,,50 KW DC,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8144,0.312,0.0,0.0,0.0,74,0.0000,0,0.0,0.0,46.0000,0.0,0,-1,0.0,0.0,65,0.0000,0,0.0,52.0000,0.0,0.0,,,,0,,-1,1000,0,Electricity,Electricity,-1,-1,58,0.0000,0,0.0,0.0,59.0000,0.0,0,0,17329,0,0,Ford,Th!nk,N,False,0,0,29,0.0,0.0,0.0,0.0,,105.3125,0.0,82.1951,0.0,Two Seaters,2001,1750,,,,,EV,,,27 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8146,0.522,0.0,0.0,0.0,45,0.0000,0,0.0,0.0,75.0000,0.0,0,-1,0.0,0.0,39,0.0000,0,0.0,87.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,1700,0,Electricity,Electricity,-1,-1,33,0.0000,0,0.0,0.0,102.0000,0.0,0,0,17330,0,0,Ford,Explorer USPS Electric,N,False,0,0,38,0.0,0.0,0.0,0.0,,62.4074,0.0,46.8056,0.0,Sport Utility Vehicle - 2WD,2001,-1750,,,,,EV,,,67 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30969,0.192,0.0,0.0,4.0,120,119.6000,0,0.0,0.0,28.1744,0.0,0,-1,0.0,0.0,105,105.3754,0,0.0,31.9856,0.0,0.0,,,Front-Wheel Drive,37,,10,600,0,Electricity,Electricity,10,-1,92,92.0000,0,0.0,0.0,36.6438,0.0,0,0,38168,0,19,Kia,Soul Electric,N,False,0,97,93,148.0,0.0,113.8,0.0,Automatic (A1),170.9000,0.0,131.4000,0.0,Small Station Wagons,2017,3750,,,,,EV,,,81 kW AC PMSM,KMX,,0.00,,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30972,0.204,0.0,0.0,10.0,98,97.5636,0,0.0,0.0,35.0000,0.0,0,-1,0.0,0.0,99,99.0945,0,0.0,34.0000,0.0,0.0,,,Rear-Wheel Drive,2,,10,650,0,Electricity,Electricity,10,-1,101,101.0322,0,0.0,0.0,33.0000,0.0,26,94,38170,0,0,Tesla,Model S (60 kW-hr battery pack),N,False,0,0,210,207.5,0.0,214.8,0.0,Automatic (A1),132.2000,0.0,136.9000,0.0,Large Cars,2016,3500,,,,,EV,,,285 kW AC Induction (60 kW-hr battery pack),TSL,standard charger,3.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30973,0.192,0.0,0.0,12.0,101,101.4750,0,0.0,0.0,33.0000,0.0,0,-1,0.0,0.0,104,103.7678,0,0.0,32.0000,0.0,0.0,,,All-Wheel Drive,21,,10,600,0,Electricity,Electricity,10,-1,107,106.7148,0,0.0,0.0,32.0000,0.0,26,94,38171,0,0,Tesla,Model S AWD - 60D,N,False,0,0,218,221.1,0.0,226.9,0.0,Automatic (A1),137.5000,0.0,144.6000,0.0,Large Cars,2016,3750,,,,,EV,,,193 (front) 193 (rear) (60 kW-hr battery pack),TSL,standard charger,3.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30974,0.210,0.0,0.0,12.0,92,92.4713,0,0.0,0.0,36.0000,0.0,0,-1,0.0,0.0,98,97.5232,0,0.0,35.0000,0.0,0.0,,,All-Wheel Drive,15,,10,700,0,Electricity,Electricity,10,-1,105,104.5008,0,0.0,0.0,32.0000,0.0,26,94,38172,0,0,Tesla,Model S AWD - P100D,N,False,0,0,315,305.9,0.0,346.9,0.0,Automatic (A1),125.3000,0.0,141.6000,0.0,Large Cars,2016,3250,,,,,EV,,,193 (front) 375 (rear) (100 kW-hr battery pack),TSL,standard charger,4.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0


In [238]:
missing_displacement = vehicles.loc[condition, :]
missing_displacement

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
7138,0.240,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,85,0.0000,0,0.0,40.0000,0.0,0.0,,,,0,,-1,800,0,Electricity,Electricity,-1,-1,91,0.0000,0,0.0,0.0,37.0000,0.0,0,0,16423,0,0,Nissan,Altra EV,N,False,0,0,90,0.0,0.0,0.0,0.0,,116.2069,0.0,129.6154,0.0,Midsize Station Wagons,2000,2750,,,,,EV,,,62 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
7139,0.282,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,72,0.0000,0,0.0,47.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,900,0,Electricity,Electricity,-1,-1,64,0.0000,0,0.0,0.0,53.0000,0.0,0,0,16424,0,0,Toyota,RAV4 EV,N,False,0,0,88,0.0,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,2000,2250,,,,,EV,,,50 KW DC,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8143,0.282,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,0.0,0,-1,0.0,0.0,72,0.0000,0,0.0,47.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,900,0,Electricity,Electricity,-1,-1,64,0.0000,0,0.0,0.0,53.0000,0.0,0,0,17328,0,0,Toyota,RAV4 EV,N,False,0,0,88,0.0,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,2001,2250,,,,,EV,,,50 KW DC,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8144,0.312,0.0,0.0,0.0,74,0.0000,0,0.0,0.0,46.0000,0.0,0,-1,0.0,0.0,65,0.0000,0,0.0,52.0000,0.0,0.0,,,,0,,-1,1000,0,Electricity,Electricity,-1,-1,58,0.0000,0,0.0,0.0,59.0000,0.0,0,0,17329,0,0,Ford,Th!nk,N,False,0,0,29,0.0,0.0,0.0,0.0,,105.3125,0.0,82.1951,0.0,Two Seaters,2001,1750,,,,,EV,,,27 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
8146,0.522,0.0,0.0,0.0,45,0.0000,0,0.0,0.0,75.0000,0.0,0,-1,0.0,0.0,39,0.0000,0,0.0,87.0000,0.0,0.0,,,2-Wheel Drive,0,,-1,1700,0,Electricity,Electricity,-1,-1,33,0.0000,0,0.0,0.0,102.0000,0.0,0,0,17330,0,0,Ford,Explorer USPS Electric,N,False,0,0,38,0.0,0.0,0.0,0.0,,62.4074,0.0,46.8056,0.0,Sport Utility Vehicle - 2WD,2001,-1750,,,,,EV,,,67 KW AC Induction,,,0.00,,Tue Jan 01 00:00:00 EST 2013,Thu Jul 07 00:00:00 EDT 2016,N,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30969,0.192,0.0,0.0,4.0,120,119.6000,0,0.0,0.0,28.1744,0.0,0,-1,0.0,0.0,105,105.3754,0,0.0,31.9856,0.0,0.0,,,Front-Wheel Drive,37,,10,600,0,Electricity,Electricity,10,-1,92,92.0000,0,0.0,0.0,36.6438,0.0,0,0,38168,0,19,Kia,Soul Electric,N,False,0,97,93,148.0,0.0,113.8,0.0,Automatic (A1),170.9000,0.0,131.4000,0.0,Small Station Wagons,2017,3750,,,,,EV,,,81 kW AC PMSM,KMX,,0.00,,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30972,0.204,0.0,0.0,10.0,98,97.5636,0,0.0,0.0,35.0000,0.0,0,-1,0.0,0.0,99,99.0945,0,0.0,34.0000,0.0,0.0,,,Rear-Wheel Drive,2,,10,650,0,Electricity,Electricity,10,-1,101,101.0322,0,0.0,0.0,33.0000,0.0,26,94,38170,0,0,Tesla,Model S (60 kW-hr battery pack),N,False,0,0,210,207.5,0.0,214.8,0.0,Automatic (A1),132.2000,0.0,136.9000,0.0,Large Cars,2016,3500,,,,,EV,,,285 kW AC Induction (60 kW-hr battery pack),TSL,standard charger,3.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30973,0.192,0.0,0.0,12.0,101,101.4750,0,0.0,0.0,33.0000,0.0,0,-1,0.0,0.0,104,103.7678,0,0.0,32.0000,0.0,0.0,,,All-Wheel Drive,21,,10,600,0,Electricity,Electricity,10,-1,107,106.7148,0,0.0,0.0,32.0000,0.0,26,94,38171,0,0,Tesla,Model S AWD - 60D,N,False,0,0,218,221.1,0.0,226.9,0.0,Automatic (A1),137.5000,0.0,144.6000,0.0,Large Cars,2016,3750,,,,,EV,,,193 (front) 193 (rear) (60 kW-hr battery pack),TSL,standard charger,3.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0
30974,0.210,0.0,0.0,12.0,92,92.4713,0,0.0,0.0,36.0000,0.0,0,-1,0.0,0.0,98,97.5232,0,0.0,35.0000,0.0,0.0,,,All-Wheel Drive,15,,10,700,0,Electricity,Electricity,10,-1,105,104.5008,0,0.0,0.0,32.0000,0.0,26,94,38172,0,0,Tesla,Model S AWD - P100D,N,False,0,0,315,305.9,0.0,346.9,0.0,Automatic (A1),125.3000,0.0,141.6000,0.0,Large Cars,2016,3250,,,,,EV,,,193 (front) 375 (rear) (100 kW-hr battery pack),TSL,standard charger,4.75,80 amp dual charger,Tue Sep 13 00:00:00 EDT 2016,Tue Sep 13 00:00:00 EDT 2016,N,0,0,0


In [240]:
# Lets just see some columns of it.
missing_displacement = missing_displacement[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]
missing_displacement.sample(10)

Unnamed: 0,year,make,model,trany,drive,fuelType,cylinders,displ
27366,2014,Mitsubishi,i-MiEV,Automatic (A1),Rear-Wheel Drive,Electricity,,
28538,2015,Mercedes-Benz,B-Class Electric Drive,Automatic (A1),Front-Wheel Drive,Electricity,,
27320,2013,BYD,e6,Automatic (A1),Front-Wheel Drive,Electricity,,
29941,2016,Tesla,Model S AWD - 90D,Automatic (A1),All-Wheel Drive,Electricity,,
25618,2012,Toyota,RAV4 EV,Automatic (variable gear ratios),Front-Wheel Drive,Electricity,,
28592,2015,Tesla,Model S AWD - 85D,Automatic (A1),All-Wheel Drive,Electricity,,
29935,2016,Tesla,Model S (90 kW-hr battery pack),Automatic (A1),Rear-Wheel Drive,Electricity,,
9212,2002,Toyota,RAV4 EV,,2-Wheel Drive,Electricity,,
28560,2015,Tesla,Model S (85 kW-hr battery pack),Automatic (A1),Rear-Wheel Drive,Electricity,,
24474,2012,Ford,Focus Electric,Automatic (variable gear ratios),Front-Wheel Drive,Electricity,,


In [241]:
missing_displacement

Unnamed: 0,year,make,model,trany,drive,fuelType,cylinders,displ
7138,2000,Nissan,Altra EV,,,Electricity,,
7139,2000,Toyota,RAV4 EV,,2-Wheel Drive,Electricity,,
8143,2001,Toyota,RAV4 EV,,2-Wheel Drive,Electricity,,
8144,2001,Ford,Th!nk,,,Electricity,,
8146,2001,Ford,Explorer USPS Electric,,2-Wheel Drive,Electricity,,
...,...,...,...,...,...,...,...,...
30969,2017,Kia,Soul Electric,Automatic (A1),Front-Wheel Drive,Electricity,,
30972,2016,Tesla,Model S (60 kW-hr battery pack),Automatic (A1),Rear-Wheel Drive,Electricity,,
30973,2016,Tesla,Model S AWD - 60D,Automatic (A1),All-Wheel Drive,Electricity,,
30974,2016,Tesla,Model S AWD - P100D,Automatic (A1),All-Wheel Drive,Electricity,,


In [242]:
vehicles.loc[:, ['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]

Unnamed: 0,year,make,model,trany,drive,fuelType,cylinders,displ
0,1985,Alfa Romeo,Spider Veloce 2000,Manual 5-spd,Rear-Wheel Drive,Regular,4.0,2.0
1,1985,Ferrari,Testarossa,Manual 5-spd,Rear-Wheel Drive,Regular,12.0,4.9
2,1985,Dodge,Charger,Manual 5-spd,Front-Wheel Drive,Regular,4.0,2.2
3,1985,Dodge,B150/B250 Wagon 2WD,Automatic 3-spd,Rear-Wheel Drive,Regular,8.0,5.2
4,1993,Subaru,Legacy AWD Turbo,Manual 5-spd,4-Wheel or All-Wheel Drive,Premium,4.0,2.2
...,...,...,...,...,...,...,...,...
37838,1993,Subaru,Legacy,Automatic 4-spd,Front-Wheel Drive,Regular,4.0,2.2
37839,1993,Subaru,Legacy,Manual 5-spd,Front-Wheel Drive,Regular,4.0,2.2
37840,1993,Subaru,Legacy AWD,Automatic 4-spd,4-Wheel or All-Wheel Drive,Regular,4.0,2.2
37841,1993,Subaru,Legacy AWD,Manual 5-spd,4-Wheel or All-Wheel Drive,Regular,4.0,2.2


In [243]:
## There's some explanation for why the displacement is zero. What is it?

missing_displacement['fuelType'].value_counts()

Electricity    118
Regular          2
Name: fuelType, dtype: int64

In [244]:
missing_displacement.query('fuelType == "Regular"')

Unnamed: 0,year,make,model,trany,drive,fuelType,cylinders,displ
21413,1985,Subaru,RX Turbo,Manual 5-spd,4-Wheel Drive,Regular,,
21414,1985,Subaru,RX Turbo,Manual 5-spd,4-Wheel Drive,Regular,,


In [None]:
vehicles.loc[[21413,21414], 'fuelType'] = 'Regular2'

In [None]:
vehicles.loc[[21413,21414], 'fuelType']

In [247]:
missing_displacement[['displ', 'cylinders']].fillna(0)

Unnamed: 0,displ,cylinders
7138,0.0,0.0
7139,0.0,0.0
8143,0.0,0.0
8144,0.0,0.0
8146,0.0,0.0
...,...,...
30969,0.0,0.0
30972,0.0,0.0
30973,0.0,0.0
30974,0.0,0.0


7138     True
7139     True
8143     True
8144     True
8146     True
         ... 
30969    True
30972    True
30973    True
30974    True
30975    True
Name: fuelType, Length: 120, dtype: bool

Unnamed: 0,cylinders,displ
7138,,
7139,,
8143,,
8144,,
8146,,
...,...,...
30969,,
30972,,
30973,,
30974,,


In [269]:
#mask = missing_displacement['fuelType'] == 'Electricity'
missing_displacement.loc[missing_displacement['fuelType'] == 'Electricity', ['cylinders', 'displ']] = missing_displacement.loc[missing_displacement['fuelType'] == 'Electricity', ['cylinders', 'displ']].fillna(0)


In [271]:
missing_displacement.query('make == "Subaru"')

Unnamed: 0,year,make,model,trany,drive,fuelType,cylinders,displ
21413,1985,Subaru,RX Turbo,Manual 5-spd,4-Wheel Drive,Regular,,
21414,1985,Subaru,RX Turbo,Manual 5-spd,4-Wheel Drive,Regular,,


In [None]:
missing_displacement[['fuelType', 'displ', 'cylinders']].fillna(0)

In [272]:
## So now that I know this, how can I replace the missing values by the value I want?
vehicles.loc[:, ['displ', 'cylinders']] = vehicles.loc[:, ['displ', 'cylinders']].fillna(0)

In [273]:
vehicles.displ.isna().sum()

0

## Use cases
>    - Cheque devolvido = 0 _vs_ Cheque devolvido = NaN

# Correct wrong data

In [274]:
# create a condition for which cylinders value == 0
no_cylinder = (vehicles['cylinders'] == 0)


# create a condition for which displacement is different from 0
yes_displacement = (vehicles['displ'] != 0)



In [279]:
(no_cylinder & yes_displacement).sum()

1

In [280]:
vehicles.loc[no_cylinder & yes_displacement, :]

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
21506,18.311667,0.0,0.0,0.0,15,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,493.722222,18,0.0,0,0.0,0.0,0.0,0.0,0.0,1.3,Rear-Wheel Drive,0,,-1,1850,0,Regular,Regular Gasoline,-1,-1,22,0.0,0,0.0,0.0,0.0,0.0,0,0,29557,0,0,Mazda,RX-7,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,19.0473,0.0,31.1094,0.0,Two Seaters,1986,-2500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [None]:
test = vehicles.loc[no_cylinder & yes_displacement, :]

test[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]

In [288]:
mask = vehicles['make'] == 'Mazda'
vehicles.loc[mask]

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
75,14.330870,0.0,0.0,0.0,21,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Front-Wheel Drive,56051,(FFS),-1,1450,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,14,97,10065,0,0,Mazda,626,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 4-spd,26.0000,0.0,39.7436,0.0,Midsize Cars,1993,-500,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
76,12.677308,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,341.807692,26,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Front-Wheel Drive,56051,(FFS),-1,1300,0,Regular,Regular Gasoline,-1,-1,31,0.0,0,0.0,0.0,0.0,0.0,14,97,10066,0,0,Mazda,626,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0000,0.0,43.0000,0.0,Midsize Cars,1993,250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
77,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,6.0,2.5,Front-Wheel Drive,56052,(FFS),-1,2150,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,14,97,10067,0,0,Mazda,626,Y,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 4-spd,21.0000,0.0,32.0000,0.0,Midsize Cars,1993,-4000,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
78,16.480500,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,444.350000,20,0.0,0,0.0,0.0,0.0,0.0,6.0,2.5,Front-Wheel Drive,56052,(FFS),-1,2000,0,Premium,Premium Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,14,97,10068,0,0,Mazda,626,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0000,0.0,33.3333,0.0,Midsize Cars,1993,-3250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
79,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,6.0,3.0,Rear-Wheel Drive,56070,(FFS),-1,2150,0,Premium,Premium Gasoline,-1,-1,22,0.0,0,0.0,0.0,0.0,0.0,0,0,10069,0,12,Mazda,929,Y,False,0,98,0,0.0,0.0,0.0,0.0,Automatic 4-spd,21.0000,0.0,30.7692,0.0,Midsize Cars,1993,-4000,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37794,12.207778,0.0,0.0,0.0,24,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,1.8,Front-Wheel Drive,56040,SOHC (FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,0,0,9955,0,13,Mazda,Protege,Y,False,0,92,0,0.0,0.0,0.0,0.0,Manual 5-spd,31.0000,0.0,46.0000,0.0,Compact Cars,1993,500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37795,13.184400,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,355.480000,25,0.0,0,0.0,0.0,0.0,0.0,4.0,1.6,Front-Wheel Drive,56030,2VALVE (FFS),-1,1350,0,Regular,Regular Gasoline,-1,-1,30,0.0,0,0.0,0.0,0.0,0.0,16,92,9956,0,0,Mazda,323,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 4-spd,28.8889,0.0,42.0000,0.0,Compact Cars,1993,0,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37796,13.184400,0.0,0.0,0.0,22,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,355.480000,25,0.0,0,0.0,0.0,0.0,0.0,4.0,1.6,Front-Wheel Drive,56033,4VALVE (FFS),-1,1350,0,Regular,Regular Gasoline,-1,-1,29,0.0,0,0.0,0.0,0.0,0.0,16,92,9957,0,0,Mazda,323,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 4-spd,28.0000,0.0,41.0256,0.0,Compact Cars,1993,0,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37797,12.207778,0.0,0.0,0.0,24,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,1.6,Front-Wheel Drive,56033,4VALVE (FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,32,0.0,0,0.0,0.0,0.0,0.0,16,92,9958,0,0,Mazda,323,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,31.0000,0.0,44.8718,0.0,Compact Cars,1993,500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [298]:
left_table = vehicles

In [303]:
right_table = vehicles.groupby(by=['make','year']).median()[['cylinders']]

In [305]:
right_table

Unnamed: 0_level_0,Unnamed: 1_level_0,cylinders
make,year,Unnamed: 2_level_1
AM General,1984,5.0
AM General,1985,5.0
ASC Incorporated,1987,6.0
Acura,1986,5.0
Acura,1987,6.0
...,...,...
smart,2013,1.5
smart,2014,1.5
smart,2015,1.5
smart,2016,1.5


In [307]:
merged_table = pd.merge(left=left_table, right=right_table, on=['make','year'])

In [310]:
merged_table.loc[no_cylinder & yes_displacement, 'cylinders_x'] = merged_table.loc[no_cylinder & yes_displacement, 'cylinders_y']

# Types
>    - dtypes
>    - astype()

In [312]:
vehicles.dtypes

barrels08     float64
barrelsA08    float64
charge120     float64
charge240     float64
city08          int64
               ...   
modifiedOn     object
startStop      object
phevCity        int64
phevHwy         int64
phevComb        int64
Length: 83, dtype: object

In [313]:
vehicles['year']

0        1985
1        1985
2        1985
3        1985
4        1993
         ... 
37838    1993
37839    1993
37840    1993
37841    1993
37842    1993
Name: year, Length: 37843, dtype: int64

In [315]:
pd.to_datetime(vehicles.year, format='%Y')

0       1985-01-01
1       1985-01-01
2       1985-01-01
3       1985-01-01
4       1993-01-01
           ...    
37838   1993-01-01
37839   1993-01-01
37840   1993-01-01
37841   1993-01-01
37842   1993-01-01
Name: year, Length: 37843, dtype: datetime64[ns]

# String normalization

In [316]:
vehicles.trany.value_counts()

Automatic 4-spd                     11042
Manual 5-spd                         8311
Automatic 3-spd                      3151
Automatic (S6)                       2638
Manual 6-spd                         2429
Automatic 5-spd                      2184
Manual 4-spd                         1483
Automatic 6-spd                      1432
Automatic (S8)                        960
Automatic (S5)                        824
Automatic (variable gear ratios)      681
Automatic 7-spd                       663
Automatic (S7)                        261
Auto(AM-S7)                           256
Automatic 8-spd                       243
Automatic (S4)                        233
Auto(AM7)                             160
Auto(AV-S6)                           145
Auto(AM6)                             110
Automatic (A1)                        109
Auto(AM-S6)                            92
Automatic 9-spd                        90
Manual 3-spd                           77
Manual 7-spd                      

In [317]:
vehicles['trany'].unique()

array(['Manual 5-spd', 'Automatic 3-spd', 'Automatic 4-spd',
       'Automatic 5-spd', 'Manual 4-spd', 'Manual 3-spd', 'Manual 6-spd',
       'Automatic (S5)', 'Automatic (variable gear ratios)',
       'Automatic 6-spd', 'Automatic (S6)', nan, 'Automatic (S4)',
       'Automatic 7-spd', 'Automatic (S7)', 'Automatic (S8)',
       'Automatic (AM5)', 'Auto(AM6)', 'Auto(AV-S7)', 'Automatic (A6)',
       'Automatic (AV-S6)', 'Auto(AM7)', 'Manual 4-spd Doubled',
       'Manual 5 spd', 'Automatic (AM6)', 'Manual 7-spd', 'Auto(L4)',
       'Auto(L3)', 'Automatic (AV)', 'Auto (AV-S6)', 'Auto(AM5)',
       'Auto(AV-S6)', 'Auto (AV-S8)', 'Automatic 8-spd', 'Auto(AV-S8)',
       'Automatic (A1)', 'Auto (AV)', 'Auto(AM-S6)', 'Auto(AM-S7)',
       'Automatic 6spd', 'Automatic 9-spd', 'Automatic (S9)',
       'Auto(AM-S8)', 'Auto(A1)', 'Auto(AM8)', 'Manual(M7)',
       'Auto(AM-S9)'], dtype=object)

In [318]:
'Automatic -'.replace('-','')

'Automatic '

In [321]:
vehicles.trany.str.replace('-','')

0           Manual 5spd
1           Manual 5spd
2           Manual 5spd
3        Automatic 3spd
4           Manual 5spd
              ...      
37838    Automatic 4spd
37839       Manual 5spd
37840    Automatic 4spd
37841       Manual 5spd
37842    Automatic 4spd
Name: trany, Length: 37843, dtype: object

In [None]:
vehicles['trany'] = vehicles['trany'].str.replace('-', '')
vehicles['trany'] = vehicles['trany'].str.replace('Automatic', 'Auto')
vehicles['trany'] = vehicles['trany'].str.replace('Auto\(', 'Auto ')
vehicles['trany'] = vehicles['trany'].str.replace('Manual\(', 'Manual ')
vehicles['trany'] = vehicles['trany'].str.replace('\(', '')
vehicles['trany'] = vehicles['trany'].str.replace('\)', '')
print(set(vehicles['trany']))

# Dropping or checking duplicates rows
    
>    - Dropping fully duplicate row
>    - Subset
>    - Idempotency
>    - <b>Duplicated</b>

In [322]:
test = pd.DataFrame({'cpf':[1,2,2,2,2], 'vlr':[10,20,35,20,25], 'ano':[1992, 1993, 1993, 1993, 1994]})

In [323]:
test

Unnamed: 0,cpf,vlr,ano
0,1,10,1992
1,2,20,1993
2,2,35,1993
3,2,20,1993
4,2,25,1994


In [329]:
test.loc[test.duplicated(keep=False), :]

Unnamed: 0,cpf,vlr,ano
1,2,20,1993
3,2,20,1993


In [328]:
test.duplicated(subset=['cpf'] , keep=False)

0    False
1     True
2     True
3     True
4     True
dtype: bool

In [332]:
test.drop_duplicates()

Unnamed: 0,cpf,vlr,ano
0,1,10,1992
1,2,20,1993
2,2,35,1993
4,2,25,1994


In [333]:
test.drop_duplicates(subset=['cpf'])

Unnamed: 0,cpf,vlr,ano
0,1,10,1992
1,2,20,1993


In [336]:
test.groupby('cpf').agg({'vlr':sum, 'ano':'max'}).reset_index()

Unnamed: 0,cpf,vlr,ano
0,1,10,1992
1,2,100,1994
