## 1. OBJECTIVE
Classify patents into predefined subsectors based on their abstracts. Any patent that doesn't fit these subsectors will be classified as "Other."

## 2. DATA
2.1 Subsector definitions: JSON file containing definitions and keywords
2.2 Patents: Parquet file (`abstract.parquet`) containing patent abstracts

In [5]:
import os
import openai
import pandas as pd

from dotenv import load_dotenv, find_dotenv
# Populate the process environment from a local .env file before reading the key.
_ = load_dotenv(find_dotenv()) # read local .env file

# os.environ[...] raises KeyError if OPENAI_API_KEY is not set, so a missing
# key fails here rather than at the first API call.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [18]:
# Build the path to the raw abstracts file (relative to the current working directory).
raw_data_dir = os.path.join('..', 'data', 'raw')
parquet_filename = 'abstract.parquet'
parquet_path = os.path.join(raw_data_dir, parquet_filename)
# Load the patent abstracts into a DataFrame.
df_abstract = pd.read_parquet(parquet_path)

In [8]:
# Preview the first rows of the loaded table.
df_abstract.head()

Unnamed: 0,publication_number,abstract
0,20080063564,Embodiments of techniques for determining the ...
1,20080025285,A method for supporting frequency hopping of a...
2,20080056857,To correct any positional misalignment of a su...
3,20080031117,A holographic optical accessing system include...
4,20080056179,Transmitting an acknowledgement/negative ackno...


## 3. DATA SPLITTING
Not necessary at this point. We will split the data into train and test sets later on, after validating the approach.

## 4. DATA EXPLORATION

In [9]:
# Number of distinct publication numbers, shown with thousands separators
unique_publications = df_abstract['publication_number'].nunique()
print(f'{unique_publications:,}')

4,184,916


In [10]:
# Min, max and mean number of characters in abstracts.
# The per-abstract lengths are computed once and reused for all three
# statistics instead of re-scanning the whole column for each one.
abstract_lengths = df_abstract['abstract'].str.len()
print('Min characters: ', abstract_lengths.min())
print('Max characters: ', abstract_lengths.max())
print('Mean characters: ', round(abstract_lengths.mean(), 0))

Min characters:  1
Max characters:  11164
Mean characters:  674.0


In [11]:
# Visualizing the abstracts with 10 or fewer characters
df_abstract[df_abstract['abstract'].str.len() <= 10]

Unnamed: 0,publication_number,abstract
421554,20090232753,[A gel
426571,20090217796,[A soft
909470,20110045983,[A gel
986260,20110171940,[A system
1033571,20110190260,[[1
1213859,20120178063,[A simple
1700660,20130306496,A
1907863,20140161655,A pump.
2441796,20160011399,[A compact
2498631,20160078481,[Laws


## 5. ALGORITHMS
- Word embeddings
- Similarity measures

In [12]:
# Work on the first 100 abstracts only. An explicit copy is taken because new
# columns are assigned to this sample later; assigning to a bare slice of
# df_abstract triggers pandas' SettingWithCopyWarning.
abstract_sample = df_abstract.iloc[0:100].copy()

In [13]:
# Check the (rows, columns) shape of the sample.
abstract_sample.shape

(100, 2)

In [3]:
import numpy as np
import pandas as pd
import json
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
# Preview the first rows of the sample (nothing is loaded here)
print(abstract_sample.head())

   publication_number                                           abstract
0         20080063564  Embodiments of techniques for determining the ...
1         20080025285  A method for supporting frequency hopping of a...
2         20080056857  To correct any positional misalignment of a su...
3         20080031117  A holographic optical accessing system include...
4         20080056179  Transmitting an acknowledgement/negative ackno...


In [23]:
# Build the path to the subsector definitions JSON in the processed-data directory.
processed_data_dir = os.path.join('..', 'data', 'processed')
json_filename = 'subsector_definitions_adjusted2.json'
json_path = os.path.join(processed_data_dir, json_filename)

In [24]:
# Display the path that will be opened, as a sanity check.
print(json_path)

../data/processed/subsector_definitions_adjusted2.json


In [35]:
# Load JSON
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
# Later cells iterate `subsectors` as a sequence of dicts and read the
# 'description', 'subsector' and 'subsector_short' keys from each entry.
with open(json_path, 'r', encoding='utf-8') as f:
    subsectors = json.load(f)

In [41]:
# Tokenize abstracts by splitting on whitespace; no lowercasing or punctuation
# stripping is applied, so e.g. "pump" and "pump." are distinct tokens.
abstract_sample['tokenized_abstract'] = abstract_sample['abstract'].apply(lambda x: x.split())

# Tokenize subsector descriptions
# Each entry of `subsectors` gets a new 'tokenized_description' key (in place).
for subsector in subsectors:
    subsector['tokenized_description'] = subsector['description'].split()

In [37]:
# Training corpus: the tokenized sample abstracts plus the tokenized subsector descriptions.
all_text = abstract_sample['tokenized_abstract'].tolist() + [sub['tokenized_description'] for sub in subsectors]
# min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): the corpus is only the 100-abstract sample plus the subsector
# descriptions, and the similarity scores printed further down are all
# ~0.9997, so these vectors may not separate subsectors — confirm whether a
# larger corpus or a pretrained model is intended.
model = Word2Vec(sentences=all_text, vector_size=100, window=5, min_count=1, workers=4)
# Persist the trained model in the current working directory.
model.save("word2vec.model")

In [38]:
def average_word_vectors(words, model, num_features):
    """Return the mean embedding of the in-vocabulary tokens in *words*.

    Args:
        words: Iterable of tokens.
        model: Object exposing ``wv``, which supports ``token in wv`` and
            ``wv[token]`` lookups.
        num_features: Length of the returned vector.

    Returns:
        A float64 array of shape ``(num_features,)``. It is all zeros when
        none of the tokens is present in ``model.wv``.
    """
    vocabulary = model.wv
    known_vectors = [vocabulary[token] for token in words if token in vocabulary]

    # Accumulate in float64, one vector at a time, in token order.
    total = np.zeros((num_features,), dtype="float64")
    for vector in known_vectors:
        total = total + vector

    if not known_vectors:
        return total
    return total / len(known_vectors)

# Calculate average vectors for each abstract and subsector.
# The dimensionality is read from the trained model rather than repeated as a
# literal, so it cannot drift from the vector_size used at training time.
embedding_dim = model.wv.vector_size
abstract_sample['avg_vector'] = abstract_sample['tokenized_abstract'].apply(
    lambda tokens: average_word_vectors(tokens, model, embedding_dim)
)
subsector_vectors = [
    average_word_vectors(sub['tokenized_description'], model, embedding_dim)
    for sub in subsectors
]


In [39]:
# Minimum cosine similarity for the best-matching subsector to be accepted.
SIMILARITY_THRESHOLD = 0.6
# Label used when no subsector reaches the threshold.
# NOTE(review): the objective at the top of this notebook names this class
# "Other"; the literal 'None' is kept so existing consumers of the column are
# unaffected — confirm which label is intended.
UNCLASSIFIED_LABEL = 'None'

# Score every abstract against every subsector in one call instead of one call
# per row: rows are abstracts, columns are subsectors (in `subsectors` order).
abstract_vectors = np.vstack(abstract_sample['avg_vector'].tolist())
similarities = cosine_similarity(abstract_vectors, subsector_vectors)

# Best-matching subsector per abstract, kept only if it clears the threshold.
best_indices = similarities.argmax(axis=1)
primary_labels = [
    subsectors[best]['subsector'] if scores[best] >= SIMILARITY_THRESHOLD else UNCLASSIFIED_LABEL
    for scores, best in zip(similarities, best_indices)
]

# One row per abstract: publication number, one score per subsector, chosen label.
similarity_matrix = [
    [publication_number] + list(scores) + [label]
    for publication_number, scores, label in zip(
        abstract_sample['publication_number'], similarities, primary_labels
    )
]

# Create DataFrame for similarity matrix
columns = ['publication_number'] + [sub['subsector_short'] for sub in subsectors] + ['Primary Subsector']
similarity_df = pd.DataFrame(similarity_matrix, columns=columns)

# Save to CSV
# similarity_df.to_csv('similarity_matrix.csv', index=False)

print(similarity_df)

    publication_number        AI  Robotics  Cleantech   Fintech  Blockchain  \
0          20080063564  0.999708  0.999530   0.999727  0.999607    0.999737   
1          20080025285  0.999783  0.999627   0.999779  0.999632    0.999847   
2          20080056857  0.999758  0.999596   0.999761  0.999620    0.999790   
3          20080031117  0.999802  0.999641   0.999799  0.999630    0.999832   
4          20080056179  0.999790  0.999598   0.999813  0.999638    0.999798   
..                 ...       ...       ...        ...       ...         ...   
95         20080018144  0.999755  0.999646   0.999778  0.999612    0.999759   
96         20080011049  0.999814  0.999648   0.999816  0.999652    0.999824   
97         20080069132  0.999831  0.999608   0.999839  0.999631    0.999811   
98         20080008627  0.999807  0.999625   0.999822  0.999663    0.999822   
99         20080011128  0.999775  0.999638   0.999803  0.999664    0.999806   

    Cybersecurity Primary Subsector  
0        0.99

In [40]:
def normalize_vector(vec):
    """Return *vec* scaled to unit Euclidean (L2) length.

    A zero-norm input is returned as an unchanged float64 copy instead of
    being divided by zero, which would produce a vector of NaNs. Such inputs
    can occur here: ``average_word_vectors`` returns all zeros when none of
    the tokens is in the vocabulary.

    Args:
        vec: Array-like of numbers.

    Returns:
        ``vec / ||vec||`` when the norm is non-zero, otherwise a float64 copy
        of ``vec``.
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        return np.array(vec, dtype="float64")
    return vec / norm