# Filtering and Ordering

In [86]:
import pandas as pd
pd.set_option('display.max.rows', 10)

In [87]:
df = pd.read_csv(r'/Users/timothypark/Documents/portfolios/timpark99.github.io/Filtering and Ordering in Pandas/Electric_Vehicle_Population_Size_History_By_County_.csv')

In [88]:
df

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
0,September 30 2022,Riverside,CA,Passenger,7,0,7,460,467,1.50
1,December 31 2022,Prince William,VA,Passenger,1,2,3,188,191,1.57
2,January 31 2020,Dakota,MN,Passenger,0,1,1,32,33,3.03
3,June 30 2022,Ferry,WA,Truck,0,0,0,3575,3575,0.00
4,July 31 2021,Douglas,CO,Passenger,0,1,1,83,84,1.19
...,...,...,...,...,...,...,...,...,...,...
20814,January 31 2023,Rockingham,NH,Passenger,1,0,1,14,15,6.67
20815,July 31 2020,Carson City,NV,Passenger,1,0,1,10,11,9.09
20816,February 28 2022,Island,WA,Passenger,744,350,1094,62257,63351,1.73
20817,December 31 2020,San Diego,CA,Passenger,14,2,16,2724,2740,0.58


In [89]:
df.dtypes

Date                                         object
County                                       object
State                                        object
Vehicle Primary Use                          object
Battery Electric Vehicles (BEVs)             object
Plug-In Hybrid Electric Vehicles (PHEVs)     object
Electric Vehicle (EV) Total                  object
Non-Electric Vehicle Total                   object
Total Vehicles                               object
Percent Electric Vehicles                   float64
dtype: object

In [90]:
# initially tried to change dtype to int but could not because the commas were in the object
# replace eliminates the commas in the numbers
# to_numeric will change our data type to int

df = df.replace(',','', regex=True)
df['Total Vehicles'] = pd.to_numeric(df['Total Vehicles'])

In [91]:
df.dtypes

Date                                         object
County                                       object
State                                        object
Vehicle Primary Use                          object
Battery Electric Vehicles (BEVs)             object
Plug-In Hybrid Electric Vehicles (PHEVs)     object
Electric Vehicle (EV) Total                  object
Non-Electric Vehicle Total                   object
Total Vehicles                                int64
Percent Electric Vehicles                   float64
dtype: object

In [92]:
df[df['Total Vehicles'] < 10]

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
8,March 31 2020,DeKalb,IN,Passenger,1,0,1,1,2,50.00
13,November 30 2020,Manassas,VA,Passenger,0,1,1,5,6,16.67
16,May 31 2020,Monroe,IL,Passenger,1,0,1,3,4,25.00
22,February 29 2020,Owyhee,ID,Passenger,1,0,1,3,4,25.00
23,June 30 2022,Clinton,PA,Passenger,1,0,1,1,2,50.00
...,...,...,...,...,...,...,...,...,...,...
20757,January 31 2024,Ulster,NY,Passenger,1,0,1,5,6,16.67
20778,February 28 2019,Franklin,MA,Passenger,0,1,1,1,2,50.00
20789,September 30 2023,Sheridan,WY,Passenger,1,0,1,3,4,25.00
20790,May 31 2022,Eagle,CO,Passenger,1,0,1,7,8,12.50


In [99]:
# for demonstration purposes, drop duplicates of same county name

df['County'].drop_duplicates()


0             Riverside
1        Prince William
2                Dakota
3                 Ferry
4               Douglas
              ...      
9152       Spotsylvania
9504          Worcester
12905             Union
14186        Bernalillo
15083             Brown
Name: County, Length: 312, dtype: object

In [100]:
df

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
0,September 30 2022,Riverside,CA,Passenger,7,0,7,460,467,1.50
1,December 31 2022,Prince William,VA,Passenger,1,2,3,188,191,1.57
2,January 31 2020,Dakota,MN,Passenger,0,1,1,32,33,3.03
3,June 30 2022,Ferry,WA,Truck,0,0,0,3575,3575,0.00
4,July 31 2021,Douglas,CO,Passenger,0,1,1,83,84,1.19
...,...,...,...,...,...,...,...,...,...,...
20814,January 31 2023,Rockingham,NH,Passenger,1,0,1,14,15,6.67
20815,July 31 2020,Carson City,NV,Passenger,1,0,1,10,11,9.09
20816,February 28 2022,Island,WA,Passenger,744,350,1094,62257,63351,1.73
20817,December 31 2020,San Diego,CA,Passenger,14,2,16,2724,2740,0.58


In [101]:
# isin function checks whether each element in the DataFrame is contained in values

specific_counties = ['DeKalb', 'Manassas']
df[df['County'].isin(specific_counties)]

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
8,March 31 2020,DeKalb,IN,Passenger,1,0,1,1,2,50.00
13,November 30 2020,Manassas,VA,Passenger,0,1,1,5,6,16.67
46,April 30 2020,Manassas,VA,Passenger,0,1,1,5,6,16.67
94,May 31 2017,DeKalb,IN,Passenger,1,0,1,1,2,50.00
222,March 31 2022,DeKalb,GA,Passenger,1,0,1,56,57,1.75
...,...,...,...,...,...,...,...,...,...,...
20162,September 30 2018,DeKalb,IN,Passenger,1,0,1,1,2,50.00
20570,January 31 2019,DeKalb,GA,Passenger,1,0,1,79,80,1.25
20639,January 31 2018,DeKalb,IN,Passenger,1,0,1,1,2,50.00
20652,October 31 2023,DeKalb,GA,Passenger,2,0,2,35,37,5.41


In [95]:
# contains function
# na=False ignores null values

df[df['County'].str.contains('Eagle', na=False)]

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
732,July 31 2020,Eagle,CO,Passenger,1,0,1,8,9,11.11
807,October 31 2022,Eagle,CO,Passenger,1,0,1,8,9,11.11
982,May 31 2020,Eagle,CO,Passenger,1,0,1,9,10,10.00
1444,February 28 2022,Eagle,CO,Passenger,1,0,1,7,8,12.50
1525,July 31 2021,Eagle,CO,Passenger,1,0,1,9,10,10.00
...,...,...,...,...,...,...,...,...,...,...
20279,June 30 2019,Eagle,CO,Passenger,1,0,1,6,7,14.29
20358,August 31 2018,Eagle,CO,Passenger,1,0,1,5,6,16.67
20454,June 30 2017,Eagle,CO,Passenger,1,0,1,7,8,12.50
20623,May 31 2018,Eagle,CO,Passenger,1,0,1,6,7,14.29


In [96]:
# filtering on index

df2 = df.set_index('County')
df2

Unnamed: 0_level_0,Date,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Riverside,September 30 2022,CA,Passenger,7,0,7,460,467,1.50
Prince William,December 31 2022,VA,Passenger,1,2,3,188,191,1.57
Dakota,January 31 2020,MN,Passenger,0,1,1,32,33,3.03
Ferry,June 30 2022,WA,Truck,0,0,0,3575,3575,0.00
Douglas,July 31 2021,CO,Passenger,0,1,1,83,84,1.19
...,...,...,...,...,...,...,...,...,...
Rockingham,January 31 2023,NH,Passenger,1,0,1,14,15,6.67
Carson City,July 31 2020,NV,Passenger,1,0,1,10,11,9.09
Island,February 28 2022,WA,Passenger,744,350,1094,62257,63351,1.73
San Diego,December 31 2020,CA,Passenger,14,2,16,2724,2740,0.58


In [97]:
# axis=0 is the row axis and axis=1 is the column axis

df2.filter(items = ['State', 'Vehicle Primary Use'], axis=1)

Unnamed: 0_level_0,State,Vehicle Primary Use
County,Unnamed: 1_level_1,Unnamed: 2_level_1
Riverside,CA,Passenger
Prince William,VA,Passenger
Dakota,MN,Passenger
Ferry,WA,Truck
Douglas,CO,Passenger
...,...,...
Rockingham,NH,Passenger
Carson City,NV,Passenger
Island,WA,Passenger
San Diego,CA,Passenger


In [98]:
df2.filter(items = ['Island'], axis=0)

ValueError: cannot reindex on an axis with duplicate labels