In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


def download_page():
    """
    This function downloads a web page and returns the response.
    """
    # Send a request to the website
    url_base = "https://qatarsale.com/en/products/cars_for_sale"
    url_list = [url_base + "?page=" + str(x) for x in range(1, 15)]

    car_sale_df = pd.DataFrame()

    for i in url_list:

        response = requests.get(i)
        # Send a request to the website
         # Check if the request was successful
        if response.status_code != 200:
            print("Failed to download page {}".format(i))
            return None

        # Parse the HTML response
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all the car listings on the page
        car_listings = soup.find_all("div", class_="product-list")

        # Iterate over each car listing
        titles = car_listings[0].find_all("a", class_="product-details")
        prices = car_listings[0].find_all("div", class_="product-price-info")
        parameters = car_listings[0].find_all("a", class_="product-definitions")

        titles_list = [titles[x] for x in range(0,len(titles),2)]

        data = []

        for title, price, parameter in zip(titles_list, prices, parameters):

            parameter_list = parameter.find_all("span", class_="def-value")

            # Print the title and price 
            data.append({"car type": title.text.strip(),
                        "price": price.text.strip().replace(' Q.R','').replace(',',''),
                        "mileage": parameter_list[0].text.strip().replace(' Km','').replace(',',''),
                        "gear_type": parameter_list[1].text.strip(),
                        "year": parameter_list[2].text.strip(),
                        "cynlinder": parameter_list[3].text.strip(),              
                        })

        page_df = pd.DataFrame(data)
        car_sale_df = pd.concat([car_sale_df, page_df])



    return car_sale_df



In [2]:
car_sale_df = download_page()

In [3]:
car_sale_df

Unnamed: 0,car type,price,mileage,gear_type,year,cynlinder
0,Land Rover Range Rover Vogue HSE,830000,0,Automatic,2023,8
1,Toyota Land Cruiser GXR- Grand Touring,223000,14000,Automatic,2021,6
2,Toyota Land Cruiser GXR,245000,51000,Automatic,2022,6
3,Land Rover Range Rover Vogue Super charged,369000,27000,Automatic,2020,8
4,Lexus LX 600 Luxury,669000,0,Automatic,2023,6
...,...,...,...,...,...,...
19,Mercedes-Benz E-Class 200,188000,25000,Automatic,2021,4
20,Land Rover Range Rover Sport Super charged,133000,110000,Automatic,2016,8
21,Ford Raptor SVT,168000,97000,Automatic,2017,6
22,BMW X-Series X5 M,150000,96000,Automatic,2016,8


In [4]:
def remove_non_numeric(df, column):
    """
    This function removes non-numeric characters from a column in a dataframe.
    """
    df[column] = df[column].str.replace(r'\D+', '')
    return df

car_sale_df = remove_non_numeric(car_sale_df, "cynlinder")
car_sale_df = remove_non_numeric(car_sale_df, "year")


  df[column] = df[column].str.replace(r'\D+', '')


In [5]:
car_sale_df= car_sale_df[~(car_sale_df['year'] == 'Automatic')]

In [6]:
car_sale_df.cynlinder.unique()

array(['8', '6', '4', '0', '12'], dtype=object)

In [18]:

car_sale_df["car type"] = car_sale_df["car type"].astype('string')
car_sale_df["price"] = car_sale_df["price"].astype(float)
car_sale_df["mileage"] = car_sale_df["mileage"].astype(float)
car_sale_df["gear_type"] = car_sale_df["gear_type"].astype('string')
car_sale_df["year"] = car_sale_df["year"].astype(int)
car_sale_df["cynlinder"] = car_sale_df["cynlinder"].astype(int)

def add_date_to_df(df):
    """
    This function adds the current date to a dataframe.
    """
    df["date"] = pd.to_datetime('today').normalize()
    return df

car_sale_df = add_date_to_df(car_sale_df)



In [20]:
car_sale_df.dtypes
car_sale_df.head()

Unnamed: 0,car type,price,mileage,gear_type,year,cynlinder,date
0,Land Rover Range Rover Vogue HSE,830000.0,0.0,Automatic,2023,8,2023-01-27
1,Toyota Land Cruiser GXR- Grand Touring,223000.0,14000.0,Automatic,2021,6,2023-01-27
2,Toyota Land Cruiser GXR,245000.0,51000.0,Automatic,2022,6,2023-01-27
3,Land Rover Range Rover Vogue Super charged,369000.0,27000.0,Automatic,2020,8,2023-01-27
4,Lexus LX 600 Luxury,669000.0,0.0,Automatic,2023,6,2023-01-27


In [21]:
import plotly.express as px 

fig = px.scatter(car_sale_df, x="mileage", y="price", color="gear_type", size="cynlinder", hover_data=['car type', 'year']) 
fig.show()

In [22]:
car_sale_df['car type'].value_counts()

                                                     58
Mercedes-Benz  G-Class  63 AMG                       10
Toyota  Land Cruiser  GXR Twin Turbo                 10
Toyota  Land Cruiser  GXR                             9
Toyota  Land Cruiser  VXR                             8
                                                     ..
Land Rover  Range Rover  Vogue SE Super charged L     1
BMW  M-Series  2                                      1
Audi  Q5  45 TFSI Quattro S-Line                      1
Audi  Q3  35 TFSI                                     1
BMW  X-Series  X5 M                                   1
Name: car type, Length: 171, dtype: Int64

Bad pipe message: %s [b"\xa9\x17\xd6\xd5\xe6_2\xbcl\x96\x13\xfcS\xb7\x99\xae\x84\xeb t\x9f\xfd8\x98\x8a\x8d\x96K\xcb\xb739r\xe9X+\x06pA\x0e\x0e\x8a'"]
Bad pipe message: %s [b'M\x83\x0c\xf3\xf3\x8a\xb5h\x04\xf3\x9a\x0f\xc9X\xbf\x1aG\xd3 \xd8\x9d(\xd4\xec\xb44\xa79L\xcf\xc7\x85q\x01Q\xb0\x83\x96\xd5#\xf6\x9aFE\xe3\x11', b'\xc8\x85*\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00']
Bad pipe message: %s [b'N\xf2\xcd\x94\x97i\xca\xab*\xb9g\x875\x8e\x87\xb3\x14\x0e\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0']
Bad pipe message: %s [b"\x15\xe9\x95\xf2)w\x0e\xb7\xb4v\x02\x98+N_\x81_\xd4\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9