# Electric Vehicle Data Scraping

This notebook collects data on electric vehicles from https://www.fueleconomy.gov/. The data we are interested in are the MPGe, price, and driving range for different models of purely electric vehicles (not hybrids) from 2011 to 2023. After collecting the relevant data and organizing it into a pandas DataFrame, this notebook saves the results as a .csv file.

In [1]:
# Import the necessary Python libraries:
from urllib.request import urlopen # to grab data from html
import pandas as pd

In [2]:
# Scrape fueleconomy.gov search results for purely electric vehicles, one request per model
# year, and collect one row per car into the DataFrame `df`.

# Column order of the final DataFrame:
COLUMNS = ['Year', 'Model', 'Config', 'MPGe_combined', 'MPGe_city', 'MPGe_highway',
           'price_low', 'price_high', 'range_in_miles']

# Search URL for a single model year; '{year}' is filled in for every request:
SEARCH_URL = ('https://www.fueleconomy.gov/feg/PowerSearch.do?action=PowerSearch'
              '&year1={year}&year2={year}'
              '&minmsrpsel=0&maxmsrpsel=0&city=0&highway=0&combined=0&cbftelectricity=Electricity'
              '&YearSel={year}'
              '&MakeSel=&MarClassSel=&FuelTypeSel=Electricity&VehTypeSel=Electric&TranySel='
              '&DriveTypeSel=&CylindersSel=&MpgSel=000&sortBy=Comb&Units=&url=SearchServlet&opt=new'
              '&minmsrp=0&maxmsrp=0&minmpg=0&maxmpg=0&sCharge=&tCharge=&startstop=&cylDeact='
              '&rowLimit=200')

# The last car block on a page has no following "ymm-row" line to mark its end, so we scan at
# most this many lines for it (clamped to the length of the page).
LAST_BLOCK_LENGTH = 134

REQUEST_TIMEOUT = 60 # Seconds to wait for the server before giving up on a request

# Fields read from the text between the first '>' and the first '</' on a line that contains
# the marker: (marker, column name, converter)
_SIMPLE_FIELDS = (
    ('sbs&amp', 'Model', str),          # car name
    ('config', 'Config', str),          # config
    ('mpg-comb', 'MPGe_combined', int), # combined mpg
)


def _fetch_html_lines(year):
    """Download the search-results page for `year` and return its text split into lines.

    Network and decoding errors are not caught here; they propagate to the caller.
    """
    # The 'with' block closes the connection even if reading or decoding fails
    with urlopen(SEARCH_URL.format(year=year), timeout=REQUEST_TIMEOUT) as page:
        return page.read().decode("utf-8").split('\n')


def _find_car_blocks(html):
    """Return a list of (start, end) line-index pairs, one per car block in `html`.

    Each car block begins with a line that contains "ymm-row". The first such line doesn't
    correspond to a car and is skipped. Returns an empty list if the page lists no cars.
    """
    starts = [i for i, line in enumerate(html) if 'ymm-row' in line][1:]
    if not starts:
        return []
    # Every block ends where the next one starts; the last block gets a fixed-size window
    # that is never allowed to run past the end of the page.
    ends = starts[1:] + [min(starts[-1] + LAST_BLOCK_LENGTH, len(html))]
    return list(zip(starts, ends))


def _tag_text(line):
    """Return the text between the first '>' and the first '</' in `line`.

    Raises ValueError if either delimiter is missing.
    """
    return line[line.index('>') + 1:line.index('</')]


def _collect_price_strings(html, start, end):
    """Collect the dollar amounts listed in html[start:end] up to the first line with '/td'.

    Returns the amounts as digit strings (commas removed), or None if no line containing
    '/td' is found in the scanned range. An amount is the run of digits and commas that
    follows a '$', so the result does not depend on whatever trails the number on the line.
    """
    amounts = []
    for line in html[start:end]:
        for piece in line.split('$')[1:]:
            digits = ''
            for ch in piece.lstrip():
                if ch not in '0123456789,':
                    break
                digits += ch
            amounts.append(digits.replace(',', ''))
        if '/td' in line:
            return amounts
    return None


def _parse_car_block(html, start, end, year):
    """Parse the car block html[start:end] of the results page for `year`.

    Returns (car_info, missing): `car_info` maps column names to the values that were parsed,
    and `missing` lists, in the order encountered, the fields whose line was found but whose
    value couldn't be read.
    """
    car_info = {'Year': year} # Dictionary containing data for the current car model
    missing = []
    city_seen = False # For tracking if we already came across the city MPG line
    for i in range(start, end):
        line = html[i]
        for marker, column, convert in _SIMPLE_FIELDS:
            if marker in line:
                try:
                    car_info[column] = convert(_tag_text(line))
                except ValueError:
                    missing.append(column)
        if 'ctyhwy' in line: # city mpg, highway mpg
            # The first 'ctyhwy' line is the city MPG and the next one is the highway MPG.
            # The flag is set even when parsing fails so that the highway value is never
            # stored under the city column.
            column = 'MPGe_highway' if city_seen else 'MPGe_city'
            city_seen = True
            try:
                car_info[column] = int(_tag_text(line))
            except ValueError:
                missing.append(column)
        if 'msrp' in line: # MSRP
            price_range = _collect_price_strings(html, i + 1, end)
            if price_range is not None:
                if len(price_range) == 1: # This car doesn't have a price range, just a single value
                    price_range.append(price_range[0]) # so price_low == price_high for this car
                try:
                    car_info['price_low'] = int(price_range[0])
                    car_info['price_high'] = int(price_range[1])
                except (ValueError, IndexError): # unreadable amount, or no price listed at all
                    missing.append('price')
        if 'EPA range' in line: # range in miles
            try:
                car_info['range_in_miles'] = int(line[line.index('range: ') + 7:line.index(' miles')])
            except ValueError:
                missing.append('range_in_miles')
    return car_info, missing


years = list(range(2011, 2024, 1)) # The years of interest: 2011-2023
NaN_vals = [] # For tracking which car models have data that doesn't import correctly or is missing
records = [] # One dictionary per car; turned into the DataFrame once all pages are parsed

for year in years: # Loop over all years of interest
    html = _fetch_html_lines(year)
    for car_index, (start, end) in enumerate(_find_car_blocks(html)): # Loop over all cars in the current year
        car_info, missing = _parse_car_block(html, start, end, year)
        records.append(car_info)
        NaN_vals.extend({'Year': year, 'Index': car_index, 'Field': field} for field in missing)

# Build the DataFrame in one go instead of growing it row by row.
# Fields that are missing for a car show up as NaN.
df = pd.DataFrame(records, columns=COLUMNS)

# Once we've finished looping through all the cars in all the years, our DataFrame is complete.
# Print out the first few rows of the DataFrame to make sure it looks okay.
df.head()

Unnamed: 0,Year,Model,Config,MPGe_combined,MPGe_city,MPGe_highway,price_low,price_high,range_in_miles
0,2011,2011 BMW Active E,"Automatic (A1), Electricity",102,107,96,,,94
1,2011,2011 Nissan Leaf,"Automatic (A1), Electricity",99,106,92,32780.0,32780.0,73
2,2011,2011 smart fortwo electric drive cabriolet,"Automatic (A1), Electricity",87,94,79,,,63
3,2011,2011 smart fortwo electric drive coupe,"Automatic (A1), Electricity",87,94,79,,,63
4,2012,2012 Mitsubishi i-MiEV,"Automatic (A1), Electricity",112,126,99,29125.0,31125.0,62


In [3]:
# Write the collected vehicle data to disk as a .csv file (the row index is left out)
csv_path = 'electric_vehicle_data.csv'
print('Exporting data to .csv file...')
df.to_csv(csv_path, index=False)
print('Complete')

Exporting data to .csv file...
Complete
