In [23]:
import requests
import re
import json
import pandas as pd
import os
from bs4 import BeautifulSoup
from datetime import datetime
from serpapi import GoogleSearch
from datetime import datetime, timedelta

In [24]:
def save_raw_meta(mode,start_date,end_date,data):
    """Dump the raw search responses to ``raw_metadata/raw_metadata_<start>_<end>.json``.

    Args:
        mode: Scraping mode. In ``"previous"`` mode only the first four
            characters of each date (the year, for ``yyyymmdd`` input) are
            used in the file name; any other mode uses the dates unchanged.
        start_date: Start-date string used to label the file.
        end_date: End-date string used to label the file.
        data: JSON-serialisable object to write.
    """
    # exist_ok removes the check-then-create race and is a no-op on re-runs
    os.makedirs('raw_metadata', exist_ok=True)

    s_label = start_date[:4] if mode == "previous" else start_date
    e_label = end_date[:4] if mode == "previous" else end_date

    metadata_filename = f'raw_metadata/raw_metadata_{s_label}_{e_label}.json'
    # Pin the encoding so the file does not depend on the platform default
    with open(metadata_filename, 'w', encoding='utf-8') as metadata_file:
        json.dump(data, metadata_file, indent=4)
    print(f'Raw metadata saved to {metadata_filename}')

In [25]:
def save_extracted_data(mode,start_date,end_date,data):
    """Dump the extracted records to ``extracted_data/extracted_data_<start>_<end>.json``.

    Args:
        mode: Scraping mode. In ``"previous"`` mode only the first four
            characters of each date (the year, for ``yyyymmdd`` input) are
            used in the file name; any other mode uses the dates unchanged.
        start_date: Start-date string used to label the file.
        end_date: End-date string used to label the file.
        data: JSON-serialisable object to write.
    """
    # exist_ok removes the check-then-create race and is a no-op on re-runs
    os.makedirs('extracted_data', exist_ok=True)

    s_label = start_date[:4] if mode == "previous" else start_date
    e_label = end_date[:4] if mode == "previous" else end_date

    extracted_filename = f'extracted_data/extracted_data_{s_label}_{e_label}.json'
    # Pin the encoding so the file does not depend on the platform default
    with open(extracted_filename, 'w', encoding='utf-8') as extracted_file:
        json.dump(data, extracted_file, indent=4)
    print(f'Extracted data saved to {extracted_filename}')

In [41]:
def get_para(mode,query,start_date,end_date,page_num):
    """Build the request parameters for one page of a Google Patents search.

    Args:
        mode: ``"autorun"`` replaces both dates with yesterday's date; any
            other value uses ``start_date``/``end_date`` as given.
        query: Search expression sent as ``q``.
        start_date: Value for the ``after`` priority-date filter (``yyyymmdd``).
        end_date: Value for the ``before`` priority-date filter (``yyyymmdd``).
        page_num: Result page to request.

    Returns:
        dict: Parameter dictionary including the API key.

    Raises:
        RuntimeError: If the ``SERPAPI_KEY`` environment variable is unset or empty.
    """
    api_key = os.getenv("SERPAPI_KEY")
    if not api_key:
        # Fail fast with a clear message instead of sending a request with no key
        raise RuntimeError("Please set the SERPAPI_KEY environment variable.")

    if mode == "autorun":
        # Autorun ignores the supplied dates and covers yesterday only
        yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y%m%d")
        start_date = end_date = yesterday

    return {
        "engine": "google_patents",
        "q": query,
        "num": "100",
        "page": page_num,
        "after": f"priority:{start_date}",
        "before": f"priority:{end_date}",
        "api_key": api_key
    }

In [27]:
def _strip_or_none(value):
    """Return ``value.strip()`` for strings; pass anything else (e.g. ``None``) through."""
    return value.strip() if isinstance(value, str) else value


def _extract_patent_record(result):
    """Flatten one organic search result into the record that gets exported."""
    # country_status may be absent or None; both mean "no active countries"
    country_status = result.get("country_status") or {}
    active_country = ', '.join(
        country for country, status in country_status.items() if status == "ACTIVE"
    )

    # Keep the second '/'-separated segment of patent_id (ids presumably look
    # like "patent/<number>/<lang>" — TODO confirm against the API); fall back
    # to the raw value when there is no such segment.
    raw_id = result.get("patent_id")
    id_parts = raw_id.split('/') if isinstance(raw_id, str) else []
    patent_id = id_parts[1] if len(id_parts) > 1 else raw_id

    return {
        "patent_id": patent_id,
        "title": _strip_or_none(result.get("title")),
        "snippet": _strip_or_none(result.get("snippet")),
        "grant_date": result.get("grant_date"),
        "publication_date": result.get("publication_date"),
        "inventor": result.get("inventor"),
        "assignee": result.get("assignee"),
        "language": result.get("language"),
        "active_country": active_country,
        "pdf": result.get("pdf")
    }


def scrape_patent(mode,q,start_d,end_d):
    """Page through a patent search, then save and return the results.

    Args:
        mode: ``"previous"`` (use the supplied dates) or ``"autorun"``
            (yesterday only; the supplied dates are ignored).
        q: Search query string.
        start_d: Start date as a ``yyyymmdd`` string.
        end_d: End date as a ``yyyymmdd`` string.

    Returns:
        tuple: ``(raw_metadata, all_extracted_data)`` — the list of raw
        response dicts (one per page) and the list of flattened records.

    Raises:
        ValueError: If ``mode`` is not ``"previous"`` or ``"autorun"``.
    """
    if mode not in ["previous", "autorun"]:
        raise ValueError("Invalid mode. Mode must be 'previous' or 'autorun'.")

    if mode == "autorun":
        # get_para queries yesterday in this mode; label the output files with
        # that same date instead of the caller's placeholder dates, so daily
        # runs do not overwrite one another.
        start_d = end_d = (datetime.now() - timedelta(days=1)).strftime("%Y%m%d")

    print("Running scraping mode: ",mode)
    page_num = 1
    raw_metadata = []
    all_extracted_data = []

    while True:
        params = get_para(mode,q,start_d,end_d,page_num)
        results = GoogleSearch(params).get_dict()

        # Error responses may lack search_metadata entirely, so avoid hard indexing
        status = (results.get('search_metadata') or {}).get('status')
        if status != 'Success':
            print('Failed to scrape data, debug and try again!')
            if results.get('error'):
                print(f"API error: {results['error']}")
            break

        raw_metadata.append(results)
        total_page = (results.get('search_information') or {}).get('total_pages', 'unknown')
        print(f'successfully scraped, extracting information from page {page_num} out of {total_page}')

        # A successful response with no hits may omit organic_results
        all_extracted_data.extend(
            _extract_patent_record(result)
            for result in (results.get("organic_results") or [])
        )

        # Keep paging only while the response advertises a next page
        if "next" in (results.get("serpapi_pagination") or {}):
            page_num += 1
        else:
            break

    save_raw_meta(mode,start_d,end_d,raw_metadata)
    save_extracted_data(mode,start_d,end_d,all_extracted_data)

    return raw_metadata,all_extracted_data

## Main code
Define the keywords we want to search for.

In [28]:
# Broad AI terms (each one is a quoted phrase inside the query)
general_word = ['"artificial intelligence"', '"large language model"', '"chatgpt"']

# Attack / misuse terms (each one is a quoted phrase inside the query)
offensive_word = [
    '"prompt injection"', '"backdoor attack"', '"extraction attack"',
    '"jailbreak attack"', '"poisoning attack"', '"adversarial attack"',
    '"privacy attack"', '"evasion attack"', '"robustness attack"',
    '"deepfake"', '"ransomware"',
]

# OR the terms within each group, then AND the two groups together
general_str, offensive_str = (
    ' OR '.join(terms) for terms in (general_word, offensive_word)
)
query = f"({general_str}) AND ({offensive_str})"

There are two scraping modes:
1. "previous": scrape historical data; you must specify the start date and end date in "yyyymmdd" format.
2. "autorun": automatically scrape yesterday's data only. `start_date` and `end_date` must still be passed, but they are replaced automatically, so any values will do.<br>

Modify `start_date` and `end_date` below to run a "previous"-mode scrape.

In [36]:
# Historical ("previous" mode) run over the date window below, yyyymmdd strings
start_date, end_date = '20220101', '20241231'
raw_meta, extra_data = scrape_patent("previous", query, start_date, end_date)

Running scraping mode:  previous
successfully scraped, extracting information from page 1 out of 6
successfully scraped, extracting information from page 2 out of 6
successfully scraped, extracting information from page 3 out of 6
successfully scraped, extracting information from page 4 out of 6
successfully scraped, extracting information from page 5 out of 6
successfully scraped, extracting information from page 6 out of 6
Raw metadata saved to raw_metadata/raw_metadata_2022_2024.json
Extracted data saved to extracted_data/extracted_data_2022_2024.json


All the previous patent data has been scraped; now combine the files into one.

In [38]:
# Join every per-run JSON file in the extracted_data directory into one list
directory = 'extracted_data'

# Initialize an empty list to store combined data
combined_data = []

# os.listdir returns names in arbitrary order; sort them so the combined
# output is the same on every run and machine
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.json'):
        # Load the JSON file
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # Append this file's records to the combined list
        combined_data.extend(data)

# Define the output file path
output_file = 'all_extracted_data.json'

# Save the combined data to a single JSON file
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(combined_data, file, indent=4)

print(f'Combined data saved to {output_file}')

Combined data saved to all_extracted_data.json


In [46]:
csv_file = "csv_all_extracted_data.csv"  # Output CSV file name
csv_columns = [                          # Columns to export, in this order
    "patent_id",
    "title",
    "snippet",
    "grant_date",
    "publication_date",
    "inventor",
    "assignee",
    "language",
    "active_country",
    "pdf"
]

# Save all extracted data to a CSV file.
# The columns go to the DataFrame constructor rather than to to_csv: the
# output is the same for complete data, but an empty combined_data (or a
# record set lacking one of the fields) gives a header/empty column instead
# of a KeyError.
pd.DataFrame(data=combined_data, columns=csv_columns).to_csv(
    csv_file,
    encoding="utf-8",
    index=False
)

print(f'saved combined data as csv format to {csv_file}')

saved combined data as csv format to csv_all_extracted_data.csv


In [40]:
# Sanity check: re-read the combined JSON from disk and print its record count
filepath = output_file
with open(filepath, 'r') as file:
    data = json.load(file)
record_count = len(data)
print(record_count)

2146
