In [1]:
from bs4 import BeautifulSoup as bs
import requests
import os

In [2]:
url = "https://www.usatoday.com/story/news/nation-now/2017/08/30/these-most-popular-cars-and-trucks-every-state/478537001/"

In [3]:
response = requests.get(url)

In [4]:
soup = bs(response.text,'html.parser')

In [5]:
results = soup.find_all('p', class_="p-text")

In [6]:
results[6].text

'Alaska:\xa0F150'

In [7]:
states_cars = [] 
for result in results:
    try:
        state_car = result.text
        states_cars.append(state_car)
        print(state_car)
    except AttributeError as e:
        print(e)

Ford's F150 is America's best selling vehicle, but the pick-up truck doesn't reign supreme in all 50 states.
According to Kelley Blue Book data, more than 10 states choose SUVs over pickups. Compact cars (Honda's Civic, Toyota's Corolla and Yaris) are the top picks in California, Ohio, Florida, Washington, D.C. and Puerto Rico. 
Still, Kelley Blue Book's list of the most popular vehicles sold in each state based on new car registrations in 2016 shows Americans love their trucks. 
This year, the F-Series remains most popular, but Tim Fleming, analyst for Kelley Blue Book said 2017 consumers could shift last year's data. 
"Sedans are rapidly falling out of favor," Fleming said in an email. "The Camry, Civic and Corolla have dropped from the fourth, fifth and sixth places to seventh, eighth and ninth this year. We don’t see this trend stopping any time soon, although the new Camry could boost sales in the short-term."
Here's a look at the 2016 data:
Alaska: F150
Alabama: F150
Arizona: F15

In [8]:
states_cars = states_cars[6:58]

In [9]:
import pandas as pd
df = pd.DataFrame({
    "States_and_Cars":states_cars
})

In [10]:
df.head()

Unnamed: 0,States_and_Cars
0,Alaska: F150
1,Alabama: F150
2,Arizona: F150
3,Arkansas: Sierra 1500
4,California: Civic


In [11]:
df=df['States_and_Cars'].str.split(":", n=2, expand = True)

In [12]:
df.rename(columns = {0:'State',1:'Car'},inplace=True)

In [13]:
df.head()

Unnamed: 0,State,Car
0,Alaska,F150
1,Alabama,F150
2,Arizona,F150
3,Arkansas,Sierra 1500
4,California,Civic


In [14]:
df['Car'].unique()

array(['\xa0F150', ' F150', '\xa0Sierra 1500', ' Civic', ' Outback',
       ' Rogue', '\xa0Silverado 1500', '\xa0Corolla', '\xa0Tacoma',
       '\xa0CR-V', ' RAV4', ' CR-V', '\xa0Escape', ' Accord',
       ' Rogue\xa0', ' Yaris', ' Silverado 1500'], dtype=object)

In [15]:
df['Type'] = df['Car'].map({
'\xa0F150':'Truck', 
    ' F150':'Truck', 
    '\xa0Sierra 1500':'Truck', 
    ' Civic':'Car', 
    ' Outback':'Car',
       ' Rogue':'SUV', 
    '\xa0Silverado 1500':'Truck', 
    '\xa0Corolla':'Car', 
    '\xa0Tacoma':'Truck',
       '\xa0CR-V':'SUV', 
    ' RAV4':'SUV', 
    ' CR-V':'SUV', 
    '\xa0Escape':'SUV', 
    ' Accord':'Car',
       ' Rogue\xa0':'SUV', 
    ' Yaris':'Car', 
    ' Silverado 1500':'Truck'
})

In [16]:
df['Auto_Maker'] = df['Car'].map({
'\xa0F150':'Ford', 
    ' F150':'Ford', 
    '\xa0Sierra 1500':'GMC', 
    ' Civic':'Honda', 
    ' Outback':'Subaru',
       ' Rogue':'Nissan', 
    '\xa0Silverado 1500':'Chevrolet', 
    '\xa0Corolla':'Toyota', 
    '\xa0Tacoma':'Toyota',
       '\xa0CR-V':'Honda', 
    ' RAV4':'Toyota', 
    ' CR-V':'Honda', 
    '\xa0Escape':'Ford', 
    ' Accord':'Honda',
       ' Rogue\xa0':'Nissan', 
    ' Yaris':'Toyota', 
    ' Silverado 1500':'Chevrolet'
})

In [17]:
Cars=[]
item = 0
for item in range(len(df['Car'])):
    new_item = df['Car'][item].replace(u'\xa0',u' ')
    Cars.append(new_item.lstrip().rstrip())
    item += item

In [18]:
df['Car'] = Cars

In [19]:
df['Car'] = df['Car'].replace('F150','F-150')

In [20]:
df

Unnamed: 0,State,Car,Type,Auto_Maker
0,Alaska,F-150,Truck,Ford
1,Alabama,F-150,Truck,Ford
2,Arizona,F-150,Truck,Ford
3,Arkansas,Sierra 1500,Truck,GMC
4,California,Civic,Car,Honda
5,Colorado,Outback,Car,Subaru
6,Connecticut,Rogue,SUV,Nissan
7,"Washington, D.C.",Civic,Car,Honda
8,Delaware,Silverado 1500,Truck,Chevrolet
9,Florida,Corolla,Car,Toyota


In [21]:
clean_cars = df[['Car','Auto_Maker']]

In [22]:
clean_cars.drop_duplicates(subset='Car',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
clean_cars.reset_index(drop=True, inplace=True)

In [24]:
clean_cars.columns

Index(['Car', 'Auto_Maker'], dtype='object')

In [25]:
clean_cars['Car'] = clean_cars['Car'].str.replace(' ','-')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Scrape Edmunds for Vehicle Information

In [26]:
for item in range(len(clean_cars['Car'])):
    if clean_cars['Auto_Maker'][item] == 'Honda' or clean_cars['Auto_Maker'][item] == 'Chevrolet':
        url1 = "https://www.edmunds.com/"+clean_cars['Auto_Maker'][item]+"/"+clean_cars['Car'][item]+"/2016/review/"
    else:
        url1 = "https://www.edmunds.com/"+clean_cars['Auto_Maker'][item]+"/"+clean_cars['Car'][item]+"/2016/" 
    print(url1)
    response1 = requests.get(url1)
    soup1 = bs(response1.text,'html.parser')
    results1 = soup1.find_all('div', class_="text-info")

https://www.edmunds.com/Ford/F-150/2016/
https://www.edmunds.com/GMC/Sierra-1500/2016/
https://www.edmunds.com/Honda/Civic/2016/review/
https://www.edmunds.com/Subaru/Outback/2016/
https://www.edmunds.com/Nissan/Rogue/2016/
https://www.edmunds.com/Chevrolet/Silverado-1500/2016/review/
https://www.edmunds.com/Toyota/Corolla/2016/
https://www.edmunds.com/Toyota/Tacoma/2016/
https://www.edmunds.com/Honda/CR-V/2016/review/
https://www.edmunds.com/Toyota/RAV4/2016/
https://www.edmunds.com/Ford/Escape/2016/
https://www.edmunds.com/Honda/Accord/2016/review/
https://www.edmunds.com/Toyota/Yaris/2016/


In [28]:
requests.get("https://www.edmunds.com/ford/f-150/2016/")

<Response [403]>

img_url = [] #<div class="jfp3ef">
car_name = [] #<div class="BNeawe deIvCb AP7Wnd">
description = [] #<div class="BNeawe deIvCb AP7Wnd">
MSRP =[] #<span class="BNeawe tAd8D AP7Wnd">
MPG = [] #Up to 26 city / 35 highway
Dimensions = [] #span class="BNeawe tAd8D AP7Wnd">
Cargo_Volume = [] #<span class="BNeawe tAd8D AP7Wnd">
Towing_Capacity = [] #<span class="BNeawe tAd8D AP7Wnd">
Comprable_Vehicles = [] #<div class="BNeawe s3v9rd AP7Wnd">

for car in clean_cars:
#     print(car)
    try:
        url1 = "https://www.google.com/search?q=2019"+ car
        response1 = requests.get(url1)
        soup1 = bs(response1.text,'html.parser')
        results0 = soup1.find_all('div', class_="deIvCb")
        results1 = soup1.find_all('img', {"id":"dimg_2"})
        results2 = soup1.find_all('span', class_="tAd8D")
        results3 = soup1.find_all('span', class_="s3v9rd")
        results4 = soup1.find_all('span', class_="oqSTJd")
        results5 = soup1.find_all('span', class_="rQMQod")
        
        print(results0[0].text)
#         print(results1)
        print(results3[0].text)
        print(results2[0].text)
        print(results5[0].text)
        print(results4[0].text)

        
#         state_car = result.text
#         states_cars.append(state_car)
#         print(state_car)
    except AttributeError as e:
        print(e)

len(results0),len(results1),len(results2),len(results3),len(results4),len(results5)

for thing in results3:
    print(thing.text)