## 1. OBJECTIVE
Classify patents into predefined subsectors based on their abstracts. Any patent that doesn't fit these subsectors will be classified as "Other."

## 2. DATA
2.1 Subsector definitions: JSON file containing definitions and keywords
2.2 Patents: parquet file containing patent abstracts

In [1]:
import os
import openai
import pandas as pd

from dotenv import load_dotenv, find_dotenv
# Load the variables defined in the local .env file; the return value of
# load_dotenv is deliberately discarded by binding it to `_`.
_ = load_dotenv(find_dotenv()) # read local .env file

# NOTE(review): the explicit key assignment below is disabled; presumably the
# OpenAI client reads OPENAI_API_KEY from the environment instead — confirm.
# openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
# Resolve the data folders relative to the directory the notebook runs from.
# Both folders live under the sibling `data` directory one level up.
notebook_dir = os.getcwd()
data_root_dir = os.path.join(notebook_dir, '..', 'data')
raw_data_dir = os.path.join(data_root_dir, 'raw')
processed_data_dir = os.path.join(data_root_dir, 'processed')

In [3]:
# Read the patent abstracts from the parquet file in the raw data directory
# into a DataFrame.
parquet_filename = 'abstract.parquet'
parquet_path = os.path.join(raw_data_dir, parquet_filename)
df_abstract = pd.read_parquet(parquet_path)

In [4]:
# Preview the first rows of the abstracts DataFrame.
df_abstract.head()

Unnamed: 0,publication_number,abstract
0,20080063564,Embodiments of techniques for determining the ...
1,20080025285,A method for supporting frequency hopping of a...
2,20080056857,To correct any positional misalignment of a su...
3,20080031117,A holographic optical accessing system include...
4,20080056179,Transmitting an acknowledgement/negative ackno...


## 3. DATA SPLITTING
Not necessary at this point. We will split the data into train and test sets later on, after validating the approach.

## 4. DATA EXPLORATION

In [None]:
# Report how many distinct publication numbers the dataset contains,
# formatted with thousands separators.
unique_publications = df_abstract['publication_number'].nunique()
print(f'{unique_publications:,}')

In [None]:
# Summarize abstract lengths (in characters): minimum, maximum and rounded mean.
# The length series is computed once and reused for all three statistics.
abstract_lengths = df_abstract['abstract'].str.len()
print('Min characters: ', abstract_lengths.min())
print('Max characters: ', abstract_lengths.max())
print('Mean characters: ', round(abstract_lengths.mean(), 0))

In [None]:
# Show the abstracts that are at most `max_abstract_chars` characters long.
# The previous comment said "20 or less" while the filter used 10; the
# threshold is now named so the description and the filter cannot drift apart.
max_abstract_chars = 10
df_abstract[df_abstract['abstract'].str.len() <= max_abstract_chars]

## 5. ALGORITHMS
- Word embeddings
- Similarity measures

In [5]:
# Take the first 100 rows as a small working sample.
sample_size = 100
abstract_sample = df_abstract.iloc[:sample_size]

In [6]:
# Check the (rows, columns) dimensions of the sample.
abstract_sample.shape

(100, 2)

In [7]:
from langchain.document_loaders import DataFrameLoader

# Wrap the sampled DataFrame in a loader that takes each document's text
# from the 'abstract' column.
loader_abstracts = DataFrameLoader(
    abstract_sample,
    page_content_column='abstract',
)

In [8]:
# Materialize the loader's documents from the sampled abstracts.
abstract_docs = loader_abstracts.load()

In [9]:
# Inspect the first loaded document (page content plus metadata).
abstract_docs[0]

Document(page_content='Embodiments of techniques for determining the concentrations of one or more acid components of a multiple acid solution are presented herein.', metadata={'publication_number': 20080063564})

In [10]:
from langchain.embeddings import GPT4AllEmbeddings, OpenAIEmbeddings
# Embedding model passed to the Chroma vector store; the GPT4All alternative
# is left commented out.
# gpt4all_embeddings = GPT4AllEmbeddings()
openai_embeddings = OpenAIEmbeddings()
# NOTE(review): the explicit key assignment is disabled; presumably the key is
# picked up from the environment — confirm.
# openai_embeddings.openai_api_key = os.environ['OPENAI_API_KEY']

In [11]:
from langchain.vectorstores import Chroma
# Directory passed to Chroma as `persist_directory` when the store is built.
# NOTE(review): this path is relative to the current working directory, while
# the other data directories are built from notebook_dir/../data — confirm
# that this is the intended location.
persist_directory = 'data/processed/chromadb/'
# IPython shell escape (not plain Python): deletes the directory so a fresh
# database is created on the next build.
!rm -rf ./data/processed/chromadb/  # remove old database files if any

In [12]:
# Build the Chroma vector store from the loaded abstract documents using the
# OpenAI embedding model, with `persist_directory` as its storage location.
abstract_vectordb = Chroma.from_documents(
    documents=abstract_docs,
    # embedding=gpt4all_embeddings,
    embedding=openai_embeddings,
    persist_directory=persist_directory
)

In [None]:
# Print the count reported by the store's underlying collection.
# NOTE(review): `_collection` is a private attribute and may change between
# library versions.
print(abstract_vectordb._collection.count())

In [13]:
# Dump the store's contents for inspection (the result includes an 'ids' list).
abstract_vectordb.get()

{'ids': ['6d69b730-46c9-11ee-90e2-acde48001122',
  '6d69b96a-46c9-11ee-90e2-acde48001122',
  '6d69b9d8-46c9-11ee-90e2-acde48001122',
  '6d69ba32-46c9-11ee-90e2-acde48001122',
  '6d69ba82-46c9-11ee-90e2-acde48001122',
  '6d69bad2-46c9-11ee-90e2-acde48001122',
  '6d69bb22-46c9-11ee-90e2-acde48001122',
  '6d69bb72-46c9-11ee-90e2-acde48001122',
  '6d69bbb8-46c9-11ee-90e2-acde48001122',
  '6d69bbfe-46c9-11ee-90e2-acde48001122',
  '6d69bc4e-46c9-11ee-90e2-acde48001122',
  '6d69bc94-46c9-11ee-90e2-acde48001122',
  '6d69bcda-46c9-11ee-90e2-acde48001122',
  '6d69bd2a-46c9-11ee-90e2-acde48001122',
  '6d69bd7a-46c9-11ee-90e2-acde48001122',
  '6d69bdb6-46c9-11ee-90e2-acde48001122',
  '6d69be06-46c9-11ee-90e2-acde48001122',
  '6d69be4c-46c9-11ee-90e2-acde48001122',
  '6d69be9c-46c9-11ee-90e2-acde48001122',
  '6d69bee2-46c9-11ee-90e2-acde48001122',
  '6d69bf32-46c9-11ee-90e2-acde48001122',
  '6d69bf78-46c9-11ee-90e2-acde48001122',
  '6d69bfbe-46c9-11ee-90e2-acde48001122',
  '6d69c00e-46c9-11ee-90e2-

In [None]:
# Build a second vector store from the same documents using GPT4All embeddings.
# The embedding model is instantiated here because its earlier creation is
# commented out, which left `gpt4all_embeddings` undefined and made this cell
# fail with NameError.
# NOTE(review): this store uses the same `persist_directory` as
# `abstract_vectordb`; confirm that sharing one location between two
# embedding models is intended.
gpt4all_embeddings = GPT4AllEmbeddings()
abstract_vectordb2 = Chroma.from_documents(
    documents=abstract_docs,
    embedding=gpt4all_embeddings,
    # embedding=openai_embeddings,
    persist_directory=persist_directory
)

In [None]:
# Dump the contents of the GPT4All-based store for inspection.
abstract_vectordb2.get()

In [None]:
# Print the vector store object itself (its default string representation).
print(abstract_vectordb)

In [None]:
from langchain.document_loaders import JSONLoader

In [None]:
# Location of the subsector definitions file inside the processed data directory.
json_filename = 'subsector_definitions_adjusted2.json'
json_path = os.path.join(processed_data_dir, json_filename)

In [None]:
# Print the resolved path as a sanity check before reading the file.
print(json_path)

In [None]:
import json
from pathlib import Path
from pprint import pprint

# Parse the subsector definitions file into Python objects.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's default text encoding.
subsectors = json.loads(Path(json_path).read_text(encoding='utf-8'))

In [None]:
# Print the entry at index 1 of the parsed definitions.
# NOTE(review): assumes the JSON top level is a list — confirm.
print(subsectors[1])