In [49]:
from bs4 import BeautifulSoup as soup # Web scrapping
import pandas as pd
import requests
import numpy as np
import math #for ceil
import time 
import random
from tqdm import tqdm # Progression bar
import re #regular expression
from difflib import SequenceMatcher #compare similarity between 2 string
import glob, os #To read folder name


In [63]:
# Load the make lookup table (columns Make and Label); get_url() puts Label into the mkId query value.
label=pd.read_csv("Output/Make_label.csv")
label.head()

Unnamed: 0,Make,Label
0,Acura,20001
1,Am General,20002
2,Aston Martin,20003
3,Austin Healey,20004
4,BMW,20005


In [3]:
# Helpers: fuzzy-match a make name against the label table and build its cars.com search URL
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    No junk filter is used. Identical sequences score 1.0 and sequences with
    nothing in common score 0.0.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

def get_url(input_):
    """Resolve a free-text make name to its search URL.

    The name is compared with every entry of the global ``label.Make`` using
    ``similar``; the entry with the highest ratio wins (``np.argmax`` keeps the
    first one on ties). The choice is printed, then the first results page URL
    is built with the matching ``label.Label`` as ``mkId``.

    Returns:
        tuple: ``(url, make)`` where ``make`` is the matched name from the table.
    """
    scores = [similar(input_, candidate) for candidate in label.Make]
    best = np.argmax(scores)
    make = label.Make[best]
    make_id = label.Label[best]

    bold_make = '\033[1m' + make + '\033[0m'  # ANSI bold on / off around the name
    print("You choose {} with corresponding index {}".format(bold_make, make_id))
    print("Now retrieving the data ...")

    # Fixed query: page 1, 100 results per page; rd/zc are presumably the
    # search radius and ZIP code -- confirm against the site.
    template = (
        "https://www.cars.com/for-sale/searchresults.action/"
        "?dealerType=localOnly&mkId={}&page=1&perPage=100&rd=250"
        "&searchSource=GN_BREADCRUMB&sort=relevance&zc=77083"
    )
    return template.format(make_id), make

In [4]:
# Column order of the frame returned by retrieve_data (spelling kept as in existing CSVs).
_LISTING_COLUMNS = [
    "Year", "Make", "Model", "Condition", "Mileage", "MSRP", "Ext_color",
    "Int_color", "Tranmission", "Drivetrain", "Dealer_name", "Number_Review",
    "Rating", "List_Distance", "Price", "HTML",
]


def _fetch_soup(page_url, timeout=30):
    """Download ``page_url`` and return it parsed with ``html.parser``; raises on HTTP errors."""
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    return soup(response.text, "html.parser")


def _to_number(token, cast, default=0):
    """Convert ``token`` with ``cast`` (``int`` or ``float``); return ``default`` if it is not numeric."""
    try:
        return cast(token)
    except (TypeError, ValueError):
        return default


def _first_token(text):
    """Return the first whitespace-separated token of ``text``, or ``None`` if there is none."""
    tokens = text.split() if text else []
    return tokens[0] if tokens else None


def _parse_listing(item, make):
    """Turn one ``listing-row__details`` element into a row dict keyed by ``_LISTING_COLUMNS``."""
    # Car name (treated as always available; a missing title still raises).
    name = item.find(class_="listing-row__title").string.strip()
    # First five characters of the title (presumably the year plus a trailing
    # space); kept unstripped so new files match previously written CSVs.
    year = name[:5]
    model = name.replace(year, "").replace(make, "").strip()

    # Condition: try the plain stock-type element, then the ``-cpo`` one.
    # A missing element or one without a single string falls back to "NA".
    condition = "NA"
    for css_class in ("listing-row__stocktype", "listing-row__stocktype-cpo"):
        cond_tag = item.find(class_=css_class)
        if cond_tag is not None and cond_tag.string is not None:
            condition = cond_tag.string.strip()
            break

    # Mileage (not always available): first token with thousands separators removed.
    mileage_tag = item.find(class_="listing-row__mileage")
    mileage_token = _first_token(mileage_tag.string) if mileage_tag is not None else None
    mileage = _to_number(mileage_token.replace(",", ""), int) if mileage_token else 0

    # MSRP: [6:] drops the six-character prefix that precedes the amount.
    msrp_tag = item.find(class_="listing-row__msrp")
    msrp_text = msrp_tag.string if msrp_tag is not None else None
    msrp = _to_number(msrp_text.strip()[6:].replace(",", ""), int) if msrp_text else 0

    # Price: any text containing "Not" is treated as "no price" (0).
    price_tag = item.find(class_="listing-row__price")
    price_text = price_tag.string if price_tag is not None else None
    price_token = None
    if price_text is not None and "Not" not in price_text:
        price_token = _first_token(price_text)
    price = _to_number(price_token.replace("$", "").replace(",", ""), int) if price_token else 0

    # Exterior/interior colour, transmission, drivetrain: one word taken from
    # each of the first four <li> elements (looked up once instead of four times).
    specs = item.find_all('li')
    ext_color = specs[0].text.strip().split()[2]
    int_color = specs[1].text.strip().split()[2]
    transmission = specs[2].text.strip().split()[1]
    drivetrain = specs[3].text.strip().split()[1]

    dealer_name = item.find(class_="dealer-name").find("span").string

    # Number of reviews: first run of digits in the element text, 0 if none.
    review_tag = item.find(class_="listing-row__review-number")
    review_digits = re.findall(r'\d+', review_tag.text) if review_tag is not None else []
    number_review = _to_number(review_digits[0], int) if review_digits else 0

    # Rating: first token after newlines are removed; 0 if absent or not a float.
    rating_tag = item.find(class_="dealer-rating-stars")
    rating = 0
    if rating_tag is not None:
        rating = _to_number(_first_token(rating_tag.text.replace("\n", "")), float)

    # Listing distance: first token of the element text; 0 if absent or not a float.
    distance_tag = item.find(class_="listing-row__distance")
    distance = 0
    if distance_tag is not None:
        distance = _to_number(_first_token(distance_tag.text), float)

    # Link: relative detail URL stored on the compare checkbox.
    link_tail = item.find(class_="listing-row__compare-button checkbox").input['data-compare-url']

    return {
        "Year": year,
        "Make": make,
        "Model": model,
        "Condition": condition,
        "Mileage": mileage,
        "MSRP": msrp,
        "Ext_color": ext_color,
        "Int_color": int_color,
        "Tranmission": transmission,
        "Drivetrain": drivetrain,
        "Dealer_name": dealer_name,
        "Number_Review": number_review,
        "Rating": rating,
        "List_Distance": distance,
        "Price": price,
        "HTML": "cars.com" + link_tail,
    }


def retrieve_data(url, make):
    """Scrape every results page reachable from ``url`` into one DataFrame.

    The total result count is read from the first ``filter-count`` element of
    ``url``; the number of pages is that count divided by the ``perPage`` value
    found in ``url`` (100 when the URL has none), rounded up. Each page is
    requested by rewriting the ``page=<n>`` query value, and every
    ``listing-row__details`` element becomes one row.

    Optional fields (condition, mileage, MSRP, price, review count, rating,
    distance) fall back to ``"NA"`` / ``0`` when the element is missing or its
    text cannot be parsed, instead of aborting the whole scrape.

    Args:
        url: Search-results URL containing a ``page=<n>`` query value.
        make: Make name written to the ``Make`` column and stripped from the
            listing title to obtain ``Model``.

    Returns:
        pandas.DataFrame with the columns listed in ``_LISTING_COLUMNS``.

    Raises:
        requests.RequestException: on timeouts, connection errors or a
            non-success HTTP status.
        IndexError: if the first page has no ``filter-count`` element, or a
            listing has fewer ``<li>`` entries / words than expected.
        AttributeError, TypeError, KeyError: if a listing lacks its title,
            dealer name or compare link.
    """
    first_soup = _fetch_soup(url)
    results = int(first_soup.find_all(class_='filter-count')[0].text.replace(",", ""))

    # Keep the page count in step with the page size actually requested.
    per_page_match = re.search(r"perPage=(\d+)", url)
    per_page = (int(per_page_match.group(1)) if per_page_match else 100) or 100
    num_page = math.ceil(results / per_page)

    records = []
    for page_num in tqdm(range(1, num_page + 1)):
        # Case-sensitive on purpose: must not touch "perPage=".
        url_new = re.sub(r"\bpage=\d+", "page={}".format(page_num), url)
        # The page that supplied the count is already parsed; do not download it again.
        html_soup = first_soup if url_new == url else _fetch_soup(url_new)

        for item in html_soup.find_all(class_="listing-row__details"):
            records.append(_parse_listing(item, make))

        tqdm._instances.clear()  # Only allow 1 progress bar

    return pd.DataFrame(records, columns=_LISTING_COLUMNS)

In [5]:
# Manual tests (left commented out; uncomment a snippet to run it against the live site)

# url, make=get_url("Acura")
# url='https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mkId=20001&page=27&perPage=100&rd=250&searchSource=GN_BREADCRUMB&sort=relevance&zc=77083'
# response = requests.get(url)
# html_soup=soup(response.text, "html.parser")
# for foo in html_soup.find_all(class_="listing-row__details"):
#     a=foo.find(class_="listing-row__mileage")
#     if a== None:
#         pass
#     else:
#         print(a.string.strip().split()[0].replace(",",""))

# url, make=get_url("Am General")
# df=retrieve_data(url,make)

In [65]:
def get_all_data(start=88):
    """Scrape every make in the global ``label`` table from row ``start`` onward.

    For each make the search URL is built with ``get_url``, the listings are
    downloaded with ``retrieve_data`` and written to
    ``Output/Car_data/<make>.csv``. A make whose scrape or write fails is
    reported with the reason and skipped, so one bad make does not stop the run.

    Args:
        start: Row position in ``label`` to begin at. Defaults to 88, the
            resume offset the notebook was last run with; pass 0 to scrape
            every make.

    Returns:
        None
    """
    # Without the directory every to_csv call would fail and be reported as SKIP.
    os.makedirs("Output/Car_data", exist_ok=True)

    for i in range(start, len(label)):
        car_make = label.Make[i]
        url, make = get_url(car_make)
        try:
            df = retrieve_data(url, make)
            df.to_csv("Output/Car_data/{}.csv".format(make), index=False)
        except Exception as exc:
            # ``Exception`` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print("SKIP ", car_make, "-", repr(exc))

        time.sleep(random.random() * 3)  # Random pause of up to 3 s between makes

    return

In [58]:
def create_meta_source(parent_dir="Output/Car_data/"):
    """Merge every per-make CSV in ``parent_dir`` into a single DataFrame.

    All ``*.csv`` files in ``parent_dir`` except ``meta_df.csv`` (this
    function's own output) are read in sorted filename order and concatenated
    with a fresh index. The number of source files is printed, and the merged
    frame is written to ``<parent_dir>/meta_df.csv``.

    Args:
        parent_dir: Folder holding the per-make CSV files.

    Returns:
        pandas.DataFrame: the merged frame (empty when no source file exists).
    """
    meta_name = "meta_df.csv"
    # Sorted so the row order does not depend on the file system's listing order.
    csv_paths = sorted(
        path for path in glob.glob(os.path.join(parent_dir, '*.csv'))
        if os.path.basename(path) != meta_name
    )
    print(len(csv_paths))

    # Read everything first and concatenate once; concat inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv(path) for path in csv_paths]
    meta_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame({})

    meta_df.to_csv(os.path.join(parent_dir, meta_name), index=False)
    return meta_df

In [66]:
%%time
# Run the scraping loop over the makes; %%time reports the wall time of the whole cell.
get_all_data()

You choose Volkswagen with corresponding index 20089
Now retrieving the data ...


100%|██████████| 54/54 [01:37<00:00,  1.81s/it]


You choose Willys with corresponding index 20090
Now retrieving the data ...


100%|██████████| 1/1 [00:00<00:00,  2.20it/s]


Wall time: 1min 43s


In [67]:
# Merge the per-make CSVs into one frame (create_meta_source also writes it to Output/Car_data/meta_df.csv).
df=create_meta_source()

77


In [68]:
# NOTE(review): the recorded output under this cell is a (rows, columns) tuple, which looks like
# it came from `df.shape` rather than `df.head()` -- confirm which one was intended.
df.head()

(283086, 16)