In [None]:
import os
import sys

import pandas as pd
import requests
import random
import time
import datetime as dt
import logging
import json

# Make the repository root importable so `src.*` resolves no matter which
# sub-directory of the repo the notebook kernel was started in.
REPO_DIR_NAME = "fortunato-wheels-engine"
cur_dir = os.getcwd()
if REPO_DIR_NAME not in cur_dir:
    # Same exception type str.index() would raise, but with an actionable message
    # instead of "substring not found".
    raise ValueError(
        f"Expected the working directory to be inside '{REPO_DIR_NAME}', got {cur_dir!r}"
    )
# Keep everything up to and including the first occurrence of the repo directory name.
SRC_PATH = cur_dir[: cur_dir.index(REPO_DIR_NAME) + len(REPO_DIR_NAME)]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.websites.kijiji import get_kijiji_car_ads, get_kijiji_car_ad_pages

# Notebook-local logger: DEBUG and above, written to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Re-running this cell must not stack handlers, otherwise every record is
# emitted once per leftover handler.
if logger.handlers:
    logger.handlers.clear()
    print("Cleared exisiting handlers")

# Console handler with a timestamped line format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger.addHandler(ch)
# Do not hand records on to ancestor loggers; only `ch` emits them.
logger.propagate = False

%load_ext autoreload
%autoreload 2

# Kijiji Ad Scraping Development

The approach taken to scrape Kijiji ads is to identify its own internal APIs, which return the car ad details to the front end, and use those instead of the raw HTML. This allows us to get the data in a structured format and avoid the need to parse the HTML.

Two APIs were identified:
1. The search results page returns a JSON object with multiple car ad details
   - this had the drawback that once we hit ~1000 ads the next_page_tokens stop being generated

2. Individual car ad pages return a JSON object with the car ad details
   - this had the drawback that we had to make a request for each car ad
   - can exploit numerical exploration of the ad IDs to get all the ads

The primary challenges overcome when setting up this project were:
- Figuring out the APIs, general steps were:
  1. Open chrome and go to the page and open More Tools --> Developer Tools
  2. In there, go to Network and refresh the page
  3. Select the Fetch/XHR tab and look at the Name of the items on the left, one will have some sort of ad/car id in it likely
  4. Click through and inspect the Response tab to see the JSON object returned and find the data we want
  5. Right click the Name and select Copy --> Copy as cURL (bash)
  6. Paste it into a Postman session and run it to see the response
  7. Deselect as much of the Headers as possible to get the cleanest request that still returns the response
  8. Turn the results into header dictionary and request cookies dictionary
  9. Setup the proxy from https://www.webshare.io/ as a dictionary
  10. Pass all three parts as part of the request to the API like below

```
  res = session.get(url, proxies=proxy, headers=header, cookies=cookie)
```

In [None]:
# Fresh session so the site issues a new set of cookies.
session = requests.Session()

# Requesting the landing page is what populates the cookies (e.g. mvcid, GCLB)
# that the cookie dictionary in the next cell is built from.
# The timeout stops the cell hanging indefinitely on a stalled connection.
response = session.get("https://www.kijijiautos.ca", timeout=30)

cookies_dict = response.cookies.get_dict()
cookies_dict

In [None]:
# testing assembly of the cookies
# Cookies sent with every API request: the two session-specific values come from
# the landing-page response (`cookies_dict`), the rest are fixed settings.
# The commented-out lines are example values captured from an earlier session.
cookie = {
    "mvcid": cookies_dict["mvcid"],
    # "mvcid": "804ca76e-48fe-4026-b099-134b2ccaee14",
    "trty": "e",
    "locale": "en-CA",
    "GCLB": cookies_dict["GCLB"],
    # "GCLB": "CK_4wqOhg77K0gE",
    "disableLoginPrompt": "true",
    "location": "%7B%7D",

}

# Request headers sent alongside the cookies. Commented-out entries are headers
# that were tried and left disabled.
# NOTE(review): "url" looks like a query parameter rather than an HTTP header -
# confirm the API actually needs it here.
header = {
    "url": "%2Fcars%2F",
    "User-Agent": "com.ebay.kijiji.ca 6.5.0 (samsung SM-G930U; Android 8.0.0; en_US)",
    "accept-language": "en-CA",
    "Accept": "*/*",
    "Connection": "keep-alive",
    # "Pragma": "no-cache",
    # "Authorization": "Basic Y2FfYW5kcm9pZF9hcHA6YXBwQ2xAc3NpRmllZHMh",
    # "Host": "mingle.kijiji.ca",
    "Accept-Encoding": "gzip, deflate, br",
    # "url": "%2Fcars%2F",
    "referer": "https://www.kijijiautos.ca/cars/",
    "x-client": "ca.move.web.app",
}

# Earlier attempt that packed everything into a single "Cookie" entry, kept for reference:
# cookie = dict(Cookie = f"mvcid={cookies_dict['mvcid']}; trty=e; locale=en-CA; GCLB={cookies_dict['GCLB']}; disableLoginPrompt=true; location=%7B%7D;"
#     " c-client=ca.move.web.app;")
# Show the assembled dictionaries for a visual check
print(cookie)
print(header)

Check that the proxy is correctly being used by checking the IP address in the response. If it's not, then the proxy is not being used and you'll get blocked.

In [None]:
from src.websites.kijiji import get_proxy_details

# Ask an IP-echo service for our address twice: once directly and once through
# the proxy. If both answers match, the proxy is not being applied and the
# scraping requests would go out from the real address.
proxy = get_proxy_details()

res = session.get(url="http://icanhazip.com", headers=header, cookies=cookie, timeout=30)
direct_ip = res.text.strip()  # drop any trailing newline in the reply
print("Without Proxy IP Address: ", direct_ip)

res = session.get(url="http://icanhazip.com", headers=header, cookies=cookie, proxies=proxy, timeout=30)
proxied_ip = res.text.strip()
print("With Proxy IP Address: ", proxied_ip)

# Make the outcome explicit instead of leaving the comparison to the reader.
if proxied_ip == direct_ip:
    print("WARNING: both requests report the same IP address - the proxy is NOT being used.")

Test that the proxy can be used by making a request to a website that is not blocked, like my website. If that works, then the proxy is working. You can confirm this in Google Analytics' realtime view: a visitor is detected and their location isn't Vancouver.

In [None]:
# original code from here: https://stackoverflow.com/questions/55582136/how-to-set-proxy-with-authentication-in-selenium-chromedriver-python
from selenium import webdriver
import time

import os
import zipfile

from selenium import webdriver

# Proxy connection settings are read from the environment so credentials never
# live in the notebook. The values that used to be hard-coded here were stored
# in plain text and should be rotated.
# Required variables: PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS
# (a missing one raises KeyError here rather than failing later inside Chrome).
PROXY_HOST = os.environ["PROXY_HOST"]  # rotating proxy or host
PROXY_PORT = int(os.environ["PROXY_PORT"])  # port
PROXY_USER = os.environ["PROXY_USER"]  # username
PROXY_PASS = os.environ["PROXY_PASS"]  # password


# Manifest of a small Chrome extension that is generated on the fly. It requests
# the proxy and webRequest permissions that background.js below relies on.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

# Background script of the extension: points Chrome at a single fixed HTTP proxy
# and answers the proxy's authentication challenge with the configured
# username/password. The four %s placeholders are filled, in order, with
# PROXY_HOST, PROXY_PORT, PROXY_USER and PROXY_PASS when this cell runs.
background_js = """
var config = {
        mode: "fixed_servers",
        rules: {
        singleProxy: {
            scheme: "http",
            host: "%s",
            port: parseInt(%s)
        },
        bypassList: ["localhost"]
        }
    };

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: "%s",
            password: "%s"
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
);
""" % (PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)


def get_chromedriver(use_proxy=False, user_agent=None):
    """Create a Chrome WebDriver, optionally routed through the authenticated proxy.

    Args:
        use_proxy: When true, write the proxy extension (built from the
            module-level ``manifest_json`` and ``background_js``) to
            ``proxy_auth_plugin.zip`` in the working directory and load it
            into Chrome.
        user_agent: Optional user-agent string passed to Chrome via
            ``--user-agent``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()
    if use_proxy:
        pluginfile = 'proxy_auth_plugin.zip'

        # The zip is rewritten on every call so it always carries the current proxy settings.
        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(pluginfile)
    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)
    # `options=` is the supported keyword; the older `chrome_options=` spelling
    # was deprecated and is no longer accepted by current Selenium releases.
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def main():
    """Open a proxied Chrome window on a test site and hold it open for a minute.

    The pause leaves time to check by hand that the visit arrived through the
    proxy. The browser is always shut down afterwards so no Chrome or
    chromedriver process is left running.
    """
    driver = get_chromedriver(use_proxy=True)
    try:
        # driver.get('https://www.google.com/search?q=my+ip+address')
        driver.get('https://ty-andrews.com/')
        time.sleep(60)
    finally:
        # Runs even if the page load raises, releasing the browser process.
        driver.quit()

if __name__ == '__main__':
    main()

In [None]:
kj_url = 'https://www.kijijiautos.ca/' # consumer/srp/' #by-url?url=%2Fcars%2F'

# Alternative endpoints, kept for reference:
# res = session.get(url="https://www.kijijiautos.ca/consumer/srp/by-url?url=%2Fcars%2F", headers=header, cookies=cookie)

# get a page of car ads by make id 24100
# res = session.get(url="https://www.kijijiautos.ca/consumer/srp/by-params?ms=24100", headers=header, cookies=cookie)

# get a single car ad by its ad id (28600002 here)
res = session.get(
    url="https://www.kijijiautos.ca/consumer/svc/a/28600002",
    headers=header,
    cookies=cookie,
    timeout=30,  # do not hang the kernel on a stalled connection
)
# Parse the response payload once and keep it; the following cells display and save `j`.
j = res.json()

In [None]:
# Display the parsed JSON payload of the ad fetched in the previous cell
j

For the page by page scraping save off a sample ad to use for testing of the parsing toolset.

In [None]:
# Variant for the search-results payload (first listing only), kept for reference:
# with open(
#     os.path.join(os.getcwd(), "..", "data", "testing", "multi-page-ad-scraping-sample-ad.json"),
#     "w",
# ) as f:
#     json.dump(j["listings"]["items"][0], f, indent=4)

# Write the single-ad payload held in `j` to the testing data folder, one level
# above the current working directory.
sample_ad_path = os.path.join(
    os.getcwd(), "..", "data", "testing", "single-ad-scraping-sample-ad.json"
)
with open(sample_ad_path, "w") as f:
    json.dump(j, f, indent=4)

## Running the Scraping

There are 2 methods for scraping, by the main page or by the individual ad pages. The main page method is faster but only returns the first ~1000 ads. The individual ad page method is slower but returns all the ads.

Key pieces setup for both methods are:
- After a batch of requests the user-agent is changed to avoid being blocked and a new session is created with new cookies
- the time between subsequent requests is randomized to 1-2 times the previous response delay to hopefully not overload the server
- `ad_id`s that are not found are logged to a file `data/scraping-tracking/failed-ad-ids.json` with the response error code for debugging. This is used to ensure they are not repeated on subsequent runs of the scraping
- Ads are uploaded in batches set in the primary scraping functions

The two functions are:
1. `get_kijiji_car_ads` - this uses individual ad_id's to get the ad details
2. `get_kijiji_car_ad_pages` - this uses the main page to get the ad details in batches

In [None]:
import pymongo
# collection.create_index([("ad_id", pymongo.DESCENDING)], unique=True)
# Show the ad id the next scraping batch will start from.
# NOTE(review): nothing above this cell assigns `last_ad_id`, so on a fresh
# kernel this line raises NameError until the scraping loop has run once.
last_ad_id

In [None]:
# having pauses between batches of requests
# Resume from wherever the previous batch stopped. On a fresh kernel nothing has
# assigned `last_ad_id` yet, so fall back to the start id that used to be left
# commented out in this cell instead of failing with NameError.
try:
    last_ad_id
except NameError:
    last_ad_id = 28629000

for _ in range(20):

    # Vary the batch size so the request pattern is less regular
    max_requests = random.randint(600, 900)
    # The returned last_ad_id feeds the next iteration, so batches continue
    # where the previous one ended.
    df, json_dict, last_ad_id = get_kijiji_car_ads(
        max_requests=max_requests,
        batch_upload_size=50,
        start_id=last_ad_id,
    )
    # Random pause between batches
    sleep = random.randint(20, 40)
    print(f"Completed {max_requests} requests. Sleeping for {sleep} seconds...")
    time.sleep(sleep)

Scrape by car make web page.

In [None]:
# Scratch check of the `in` operator on a list (evaluates to False here)
"make" in [1,2,3]

In [None]:
# Load the tracked list of car brands from the scraping-tracking data folder.
# NOTE: `car_manufacturers` is only used by the commented-out loop header below;
# the active loop iterates over the hand-edited list instead.
with open(
    os.path.join(
        os.getcwd(), os.pardir, "data", "scraping-tracking", "car-brands.json"
    )
) as f:
    car_manufacturers = json.load(f)

# Manual worklist of [make_number, make_name] pairs. Uncomment the makes to
# scrape in this run. With every entry commented out (as it is now) the list is
# empty and the loop body does not execute.
# for car_make in car_manufacturers["car_makes"]:
for car_make in [
        # # high volume batch, did 80 p, 20 per
        # [24100, "Toyota"],
        # [11000, "Honda"],

        # Batch 2, 80p 40 per
        # [23500, "Subaru"],
        # [4, "Polestar"],
        # [135, "Tesla"],
        # [1900, "Audi"],
        # [15200, "Lexus"],
        # [25200, "Volkswagen"],

        # [375, "Acura"],
        # [3500, "BMW"],
        # [25100, "Volvo"],

        # low volume batch
        # [4350, "Bugatti"],
        # [14600, "Lamborghini"],
        # [137, "McLaren"],
        # [20100, "Porsche"],
        # [3100, "Bentley"],
        # [1700, "Aston Martin"],
        # [21600, "Rolls-Royce"],
        # [900, "Alfa Romeo"],
        # [8600, "Ferrari"],
        # [31859, "Mercedes-AMG"],

        # [2, "AMC"],
        # [266, "AM General"],
        # [1950, "Austin Healey"],
        # [268, "Bricklin"],
        # [4400, "Buick"],
        # [4700, "Cadillac"],
        # [5600, "Chevrolet"],
        # [5700, "Chrysler"],
        # [6800, "Daewoo"],
        # [7000, "Daihatsu"],
        # [30002, "Datsun"],
        # [7700, "Dodge"],
        # [30003, "Eagle"],
        # [8800, "Fiat"],
        # [9000, "Ford"],
        # [270, "Genesis"],
        # [269, "Geo"],
        # [9900, "GMC"],
        # [11050, "Hummer"],
        # [11600, "Hyundai"],
        # [11650, "Infiniti"],
        # [271, "International Harvester"],
        # [11900, "Isuzu"],
        # [12400, "Jaguar"],
        # [12600, "Jeep"],
        # [13200, "Kia"],
        # [14800, "Land Rover"],
        # [15500, "Lincoln"],
        # [15900, "Lotus"],
        # [16600, "Maserati"],
        # [16700, "Maybach"],
        # [16800, "Mazda"],
        # [17200, "Mercedes-Benz"],
        # [30010, "Mercury"],
        # [17300, "MG"],
        # [17500, "MINI"],
        # [17700, "Mitsubishi"],
        # [18700, "Nissan"],
        # [18975, "Oldsmobile"],
        # [19000, "Opel"],
        # [1400, "Other"],
        # [19300, "Peugeot"],
        # [19800, "Plymouth"],
        # [20000, "Pontiac"],
        # [267, "RAM"],
        # [20700, "Renault"],
        # [21800, "Saab"],
        # [30014, "Saturn"],
        # [39, "Scion"],
        # [30015, "Shelby"],
        # [23000, "Smart"],
        # [23600, "Suzuki"],
        # [24400, "Triumph"],
    ]:

    # Turn the display name into a lower-case, hyphenated form, e.g. "Land Rover" -> "land-rover"
    make = car_make[1].lower().replace(" ", "-")
    make_number = car_make[0]

    # scrape the kijiji car ads for each car make
    df, json_dict, next_page_url = get_kijiji_car_ad_pages(
        car_make=make,
        make_number=make_number,
        num_pages = 100,
        page_size = 40, 
        batch_upload_size=20
    )

Checking how many items are in the database.

In [None]:
from src.websites.kijiji import connect_to_database

# Project helper that hands back the client, database and collection handles
client, db, collection = connect_to_database()

# An empty filter matches every document, so this is the total document count
collection.count_documents({})

Check for duplicates in the database.

In [None]:
# Fields that together describe "the same car from the same seller". Ads that
# share all of them are treated as likely duplicate listings.
duplicate_group_key = {
    'make': "$make",
    'model': "$model",
    'year': "$year",
    "title": "$title",
    "color": "$color",
    "trim": "$trim",
    "price": "$price",
    # Key renamed from "seller.sellerForeignId": MongoDB's aggregation
    # expression parser rejects field names containing "." inside the _id
    # document. The value still reads the nested field through its dotted path.
    "seller_foreign_id": "$seller.sellerForeignId",
}

duplicates = list(
    collection.aggregate([
        # One group per distinct key, collecting the ad ids that fall into it
        {'$group': {
            '_id': duplicate_group_key,
            'uniqueIds': {'$addToSet': "$ad_id"},
            'count': {'$sum': 1},
        }},
        # Keep only keys shared by more than one ad
        {'$match': {'count': {"$gt": 1}}},
        # Largest groups first
        {'$sort': {'count': -1}},
    ])
)
duplicates_df = pd.DataFrame(duplicates)
duplicates_df.head()

In [None]:
# Show the grouping key of the largest duplicate group. When the aggregation
# returned nothing the frame has no `_id` column, so report that instead of
# raising AttributeError.
if duplicates_df.empty:
    print("No duplicate groups found.")
else:
    print(duplicates_df["_id"].iloc[0])

In [None]:
# Second duplicate check: count documents per ad_id. A count above one means
# the same ad id is stored more than once.
group_by_ad_id = {
    '$group': {
        '_id': {'ad_id': "$ad_id"},
        'uniqueIds': {'$addToSet': "$ad_id"},
        'count': {'$sum': 1},
    }
}
keep_repeated = {'$match': {'count': {"$gt": 1}}}
most_repeated_first = {'$sort': {'count': -1}}

duplicates = list(
    collection.aggregate([group_by_ad_id, keep_repeated, most_repeated_first])
)
duplicates_df = pd.DataFrame(duplicates)
duplicates_df.head()

# Connecting to MongoDB

In [None]:
import os
import sys
from random import randint

import pymongo
from dotenv import load_dotenv, find_dotenv

In [None]:
# Pull settings from a local .env file (if one is found) before reading them.
# load_dotenv() does not override variables already set in the environment.
load_dotenv(find_dotenv())

CONNECTION_STRING = os.environ.get("COSMOS_CONNECTION_STRING")
DB_NAME = os.environ.get("ADS_DB_NAME")
COLLECTION_NAME = os.environ.get("KIJIJI_COLLECTION_NAME")

# Fail here with a clear message rather than later with an obscure pymongo
# error (or a silent connection to a default local server).
_missing_settings = [
    name
    for name, value in (
        ("COSMOS_CONNECTION_STRING", CONNECTION_STRING),
        ("ADS_DB_NAME", DB_NAME),
        ("KIJIJI_COLLECTION_NAME", COLLECTION_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

In [None]:
# Open a MongoDB client using the connection string read from the environment
client = pymongo.MongoClient(CONNECTION_STRING)

In [None]:
# Get a handle on the database, creating it first when the server does not list it yet
db = client[DB_NAME]

if DB_NAME in client.list_database_names():
    print("Using database: '{}'.\n".format(DB_NAME))
else:
    # Custom create command: provisions the database with 400 RU of throughput
    # that its collections can share
    db.command({"customAction": "CreateDatabase", "offerThroughput": 400})
    print("Created db '{}' with shared throughput.\n".format(DB_NAME))

In [None]:
# Get a handle on the collection, creating it first when the database does not list it yet
collection = db[COLLECTION_NAME]
if COLLECTION_NAME in db.list_collection_names():
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))
else:
    # Custom create command: an unsharded collection that draws on the
    # database's shared throughput
    create_collection_cmd = {"customAction": "CreateCollection", "collection": COLLECTION_NAME}
    db.command(create_collection_cmd)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))

In [None]:
# Query for documents in the collection: the five lowest-priced ads under 20,000
print("Cars with price < 20,000:\n")
cheapest_cursor = (
    collection.find({"price": {"$lt": 20000}})
    .sort("price", pymongo.ASCENDING)
    .limit(5)
)
results = pd.DataFrame(cheapest_cursor)
results.head(5)

# Using/Visualizing the Data

To start using we can query like above and then convert to a pandas dataframe.

In [None]:
import plotly
import plotly.express as px

# enable notebook mode so plotly figures render inside the notebook
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook - confirm against the plotly docs.
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# Load every document in the collection into a DataFrame (no filter, no limit)
all_ads = pd.DataFrame(collection.find())

In [None]:
# List the fields available on each ad
all_ads.columns

In [None]:
# Total number of ads loaded
print(f"No. of Ads: {len(all_ads)}")
# Ad counts for the ten most common manufacturers
ads_per_make = all_ads.make.value_counts()
ads_per_make.head(10)

In [None]:
# Summary statistics, used to spot implausible values
all_ads.describe()

In [None]:
# Inspect ads priced above 1,000,000 to see what the extreme prices look like
all_ads.loc[all_ads.price > 1_000_000, ["price", "title", "make", "model", "year"]]

In [None]:
# Drop ads priced at 500,000 or more. The explicit copy gives the filtered frame
# its own data, so the assignment below modifies it directly instead of a slice
# of the previous frame (which triggers SettingWithCopyWarning).
all_ads = all_ads.loc[all_ads.price < 500_000, :].copy()
# mileage readings over a million km divide by 10
# NOTE: re-running this cell divides again any reading that is still above one million.
mileage_too_high = all_ads.mileage > 1_000_000
all_ads.loc[mileage_too_high, "mileage"] = all_ads.loc[mileage_too_high, "mileage"] / 10

In [None]:
# Convert the `created` epoch-seconds value into a proper timestamp
all_ads = all_ads.assign(
    date_time = pd.to_datetime(all_ads.created, unit="s", origin="unix")
)
# Break the creation date into parts. These are stored as ad_year / ad_month so
# they do not overwrite the existing `year` column, which the other cells read
# next to make and model as a property of the vehicle.
all_ads = all_ads.assign(
    ad_year = all_ads.date_time.dt.year,
    ad_month = all_ads.date_time.dt.month,
)

In [None]:
# Histogram of ad prices
fig = px.histogram(
    all_ads,
    x="price",
    # color="make",
    nbins=100,
)
# Titles and a shared font for all figure text
fig.update_layout(
    title="Distribution of Prices",
    xaxis_title="Price",
    yaxis_title="Count",
    font={"family": "Courier New, monospace", "size": 18, "color": "#7f7f7f"},
)

fig.show()

In [None]:
# Inspect a few ads with odometer readings above 500,000
all_ads.loc[all_ads.mileage > 500_000, ["mileage", "title", "make", "model", "year"]].head()

In [None]:
# Count the ads with odometer readings below 1,000
len(all_ads.loc[all_ads.mileage < 1000, ["mileage", "title", "make", "model", "year", "condition"]])

In [None]:
# Histogram of odometer readings, restricted to readings strictly between 10 and 500,000
mileage_in_range = (all_ads.mileage > 10) & (all_ads.mileage < 500_000)
fig = px.histogram(
    all_ads.loc[mileage_in_range],
    x="mileage",
    # color="make",
    nbins=200,
)
fig.update_layout(
    title="Distribution of Mileages",
    xaxis_title="Odometer Reading",
    yaxis_title="Count",
    # xaxis_range=[0, 500_000],
    font={"family": "Courier New, monospace", "size": 18, "color": "#7f7f7f"},
)

fig.show()

In [None]:
# Scatter of price against year, one colour per make
fig = px.scatter(all_ads, x="year", y="price", color="make")
fig.update_layout(
    title="Distribution of Price",
    xaxis_title="Year",
    yaxis_title="Price (CAD)",
    # xaxis_range=[0, 500_000],
    font={"family": "Courier New, monospace", "size": 18, "color": "#7f7f7f"},
)

fig.show()

In [None]:
# `ff` is otherwise only imported by a later cell; importing it here lets this
# cell run top-to-bottom on a fresh kernel.
import plotly.figure_factory as ff

# NOTE(review): this reads `df` (assigned by the scraping cells), not `all_ads` -
# confirm that is the intended input.
# Collect the prices of each category in one pass, keeping only categories with
# more than 5 ads so the two lists below always stay aligned.
prices_by_category = {}
for cat in df.category.unique():
    cat_prices = df.loc[df.category == cat, "price"].tolist()
    if len(cat_prices) > 5:
        prices_by_category[cat] = cat_prices

data_list = list(prices_by_category.values())
cat_list = list(prices_by_category.keys())

# Distribution plot with one price distribution per category
fig = ff.create_distplot(data_list, cat_list)
fig.show()

In [None]:
import plotly.figure_factory as ff
import numpy as np

# Synthetic demo data: four samples of 200 standard-normal draws, shifted to
# centre on -2, 0, 2 and 4 respectively
x1, x2, x3, x4 = (np.random.randn(200) + centre for centre in (-2, 0, 2, 4))

# One entry per group, with labels in the same order
hist_data = [x1, x2, x3, x4]
group_labels = ['Group 1', 'Group 2', 'Group 3', 'Group 4']

# Distribution plot with a custom histogram bin width
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.2)
fig.show()

# Inserting Documents

In [None]:
# to_be_inserted = []
# for i in range(0,1):
    
#     ad =  j["listings"]["items"][i]

#     parsed_ad = parse_single_vehicle_ad(ad)

#     # parsed_ad["price"] = 12345

#     # check if ad already exists
#     if collection.find_one({"ad_id" : parsed_ad["ad_id"]}):
        
#         # if it does exist, check if prices have changed
#         existing_ad = collection.find_one({"ad_id" : parsed_ad["ad_id"]})
#         if existing_ad["price"] != parsed_ad["price"]:
#             upda
#     else:
#         to_be_inserted.append(parsed_ad)

# print(len(to_be_inserted))

# if len(to_be_inserted) > 0:
#     collection.insert_many(to_be_inserted)
#     print(f"Successfully inserted {len(to_be_inserted)} ads into the database.")
#     to_be_inserted = []

In [None]:
# db.kjAds.create_index([("ad_id", pymongo.DESCENDING)], unique=True)