**Title**: Download CPE data\
**Description**: Notebook to download all CPE data\
**Developer**: Teck Lim\
**Create date**: 03/28/2021

In [None]:
!pip install requests==2.25.1

In [None]:
import requests
import json
import time
import os

from datetime import datetime

In [None]:
drive.mount('/content/gdrive')
!dir './gdrive/Shareddrives/ucsd_drive/Data'

In [None]:
# Destination of the scraped CPE dump. Absolute, so it always resolves under the
# '/content/gdrive' mount point regardless of the kernel's current working directory.
file_path = '/content/gdrive/Shareddrives/ucsd_drive/Data/cpe.json'
# NVD REST endpoint that serves CPE entries.
# NOTE(review): this targets version 1.0 of the API - confirm the endpoint is still served.
base_url = 'https://services.nvd.nist.gov/rest/json/cpes/1.0'

In [None]:
def scrap_cpe(start_page=0, total_page=1000, page_size=5000, sleep_duration=3):
    """
    Scrape CPE entries page by page using the REST API at `base_url`.

    start_page: zero-based index of the first page to request
    total_page: exclusive upper bound on the page index (pages start_page..total_page-1)
    page_size: number of CPE entries requested per page
    sleep_duration: seconds to sleep between requests, and before each retry,
                    to avoid overloading the service

    Returns the list of CPE entries collected from every page that was retrieved.
    Raises RuntimeError when a single page still fails after 5 attempts.
    """
    max_attempts = 5
    request_timeout = 60  # seconds; without a timeout a stalled connection hangs forever

    cpe_items = []
    for page_no in range(start_page, total_page):
        start_index = page_no * page_size
        last_error = None
        for _ in range(max_attempts):
            try:
                print('Retrieving page: {}'.format(page_no+1))
                response = requests.get(
                    base_url,
                    params={'startIndex': start_index, 'resultsPerPage': page_size},
                    timeout=request_timeout,
                )
                # Treat HTTP error statuses as failures instead of parsing an error body.
                response.raise_for_status()
                response_json = response.json()
                break
            # Only network/HTTP failures and undecodable bodies are retried;
            # KeyboardInterrupt and SystemExit propagate immediately.
            except (requests.RequestException, ValueError) as err:
                last_error = err
                print('Request failed: {!r}'.format(err))
                print('Something is wrong. Sleep for {} sec before retrying'.format(sleep_duration))
                time.sleep(sleep_duration)
        else:
            # RuntimeError is still caught by callers that catch BaseException.
            raise RuntimeError('Exhausted all attempts') from last_error

        page_items = response_json['result']['cpes']
        cpe_items += page_items
        print('Total scrapped: {}'.format(len(cpe_items)))
        # Stop once the absolute position reaches the reported total, or the page is
        # empty. Comparing the absolute position (not len(cpe_items)) keeps the check
        # correct when start_page > 0.
        if not page_items or start_index + len(page_items) >= response_json['totalResults']:
            print('Completed scrapping..')
            break
        time.sleep(sleep_duration)
    return cpe_items

In [None]:
# Run the full scrape with the default paging settings, then report how many
# entries came back. The name cpe_list is reused by the cell that writes the file.
cpe_list = scrap_cpe()
print('Total unique CPE:', len(cpe_list))

In [None]:
# Persist the scraped entries as a single JSON document at file_path.
with open(file_path, 'w') as out_file:
    json.dump(cpe_list, out_file)