In [85]:
import os
import pandas as pd
from openai import OpenAI
from src.utils.UsefulPaths import Paths
def _format_two_decimals(value):
  """Render a number with exactly two digits after the decimal point."""
  return '%.2f' % value


# Every float shown in DataFrame output goes through the two-decimal formatter.
pd.set_option('display.float_format', _format_two_decimals)

In [86]:
# Project path helper; `paths.data_processed` is the directory the cells below read from and write to.
paths = Paths()

In [87]:
# Load the subsector taxonomy, replace missing cells with empty strings and keep
# only the descriptive columns.
subsector_columns = ['subsector', 'definition', 'keywords', 'does_include', 'does_not_include']
subsectors_csv = os.path.join(paths.data_processed, 'subsector.csv')
df_subsectors = (
  pd.read_csv(subsectors_csv)
  .fillna('')
  .loc[:, subsector_columns]
  .copy()
)
df_subsectors

Unnamed: 0,subsector,definition,keywords,does_include,does_not_include
0,Artificial Intelligence; Big Data and Analytics,Artificial Intelligence companies offer produc...,"Automated intelligence, assisted intelligence,...",,
1,Advanced Manufacturing and Robotics,"In comparison to Traditional Manufacturing, Ad...","3d printing, industrial IoT, internet of thin...","Autonomous driving, Industrial robots, Industr...",Traditional manufacturing machines
2,Clean Technology,Cleantech or clean technology is an umbrella t...,"Clean energy, and other forms of environmental...",,"oil and gas, petrochemicals"
3,Financial Technology,Describes a business that aims at providing fi...,"Insurance Tech, Risk Management, Trading, Port...",,"Brick & Mortar banks, Old brick and mortar Ins..."
4,Blockchain,Companies that develop applications using bloc...,"Distributed ledgers, Digital Mining,Cryptocurr...",,
5,Cybersecurity,"Cybersecurity is the body of technologies, pro...","cyber security, network security, data securit...","Application security, Information security, Ne...",
6,Agriculture Technology,Technologies to help the agriculture industry ...,"agritech, crop optimization, farming automatio...",Precision agriculture - PA is an approach to f...,"Farms, Vineyards, Coffee roasters, Beverages"
7,New Food,New Food includes technologies that can be lev...,"artificial meat, Alternative protein, Plant-ba...",,"alt- proteins based skincare products, cannabi..."
8,Advertising Technology,Advertising technology - different types of an...,"Conversion/optimization, Email marketing, Mobi...",,Companies whose products and services are not ...
9,Blue Economy,"Blue economy is the ""sustainable use of ocean ...","ocean sustainability, aquaculture, seafloor ma...",,


In [88]:
patents_csv = os.path.join(paths.data_processed, 'abstract_patents_sliced_1000.csv')
# read_csv produces a default 0..n-1 index, so the first 100 rows are exactly the
# rows labelled 0 through 99.
df_abstract_patents = pd.read_csv(patents_csv).head(100)[['abstract']].copy()
df_abstract_patents

Unnamed: 0,abstract
0,"A method, and a mobile device adapted thereto,..."
1,A dumbbell with a selectable number of weight ...
2,A virtual assistant AI system that may be conn...
3,Connector assemblies that are separate from me...
4,"An LED tube lamp, comprising a lamp tube, whic..."
...,...
95,A filter mask for use in a flow cytometer incl...
96,A method for controlling a crane component of ...
97,A camera head is provided that includes a firs...
98,An organic light emitting diode display includ...


In [89]:
# Shared prompt header: an introductory line followed by one bullet per subsector
# carrying its definition and keywords.
subsector_bullets = [
  f'\n- {name}: {definition}. The keywords are: {keywords}'
  for name, definition, keywords in zip(
    df_subsectors['subsector'], df_subsectors['definition'], df_subsectors['keywords']
  )
]
pre_prompt = 'Follow are several subsectors and their definitions' + ''.join(subsector_bullets)
print(pre_prompt)

Follow are several subsectors and their definitions
- Artificial Intelligence; Big Data and Analytics: Artificial Intelligence companies offer products and services modeled on computer systems that simulate human cognition. These systems can sense their environment, learn, think, and respond to stimuli in a way similar to humans.Big Data and Analytics as a sub-sector refers to companies that provide solutions with the core function of identifying patterns and trends from large volumes / sets of data that cannot be processed by traditional database and analysis software.. The keywords are: Automated intelligence, assisted intelligence, augmented Intelligence, autonomous intelligence, supervised learning, unsupervised learning, adaptive systems, computer vision, natural language processing, natural language generation, text analytics, speech recognition, semantics technology, decision management, virtual agents, robotic process automation, machine learning, autonomous vehicle, smart robo

In [90]:
# One full prompt per patent: the shared subsector header followed by the task
# instruction, which embeds the abstract and spells out the expected
# "<subsector>: <explanation>" answer shape.
prompts = [
  pre_prompt + f'\nTask: Classify this patent abstract "{patent_abstract}" in one subsector according to the definitions. Your answer need to be the full name of the subsector and the explanation like Artificial Intelligence; Big Data and Analytics: Explanation Here'
  for patent_abstract in df_abstract_patents['abstract']
]

# assign() overwrites an existing `prompt` column and matches the list by position,
# so re-running this cell cannot append a duplicate column the way
# pd.concat(axis=1) does.
df_abstract_patents = df_abstract_patents.assign(prompt=prompts)
df_abstract_patents

Unnamed: 0,abstract,prompt
0,"A method, and a mobile device adapted thereto,...",Follow are several subsectors and their defini...
1,A dumbbell with a selectable number of weight ...,Follow are several subsectors and their defini...
2,A virtual assistant AI system that may be conn...,Follow are several subsectors and their defini...
3,Connector assemblies that are separate from me...,Follow are several subsectors and their defini...
4,"An LED tube lamp, comprising a lamp tube, whic...",Follow are several subsectors and their defini...
...,...,...
95,A filter mask for use in a flow cytometer incl...,Follow are several subsectors and their defini...
96,A method for controlling a crane component of ...,Follow are several subsectors and their defini...
97,A camera head is provided that includes a firs...,Follow are several subsectors and their defini...
98,An organic light emitting diode display includ...,Follow are several subsectors and their defini...


In [91]:
# Credentials are read from the environment so no secret is ever stored in the
# notebook. A missing OPENAI_API_KEY fails here with a KeyError instead of as an
# authentication error on the first request; the organization id is optional.
client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],
  organization=os.environ.get('OPENAI_ORG_ID'),
)

2023-11-10 18:07:26,262 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2023-11-10 18:07:26,262 - DEBUG - load_verify_locations cafile='C:\\Users\\Thiago\\PycharmProjects\\genome\\venv\\Lib\\site-packages\\certifi\\cacert.pem'


In [92]:
# Send every prompt to the chat model and split each answer of the form
# "<subsector>: <explanation>" into its two parts.
patent_class = []
patent_explanation = []
for prompt in df_abstract_patents['prompt']:
  response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": prompt}],
    temperature=0,  # keep answers as repeatable as the API allows
  )
  # `content` can be None, and an answer without a colon would make unpacking
  # split(":", 1) raise ValueError and discard all answers collected so far.
  # partition() never raises: a colon-less answer becomes the label with an
  # empty explanation.
  answer = response.choices[0].message.content or ''
  class_sub, _, explanation = answer.partition(':')
  # Strip surrounding whitespace so labels can be compared with subsector names.
  patent_class.append(class_sub.strip())
  patent_explanation.append(explanation.strip())

# assign() replaces existing columns, so re-running the cell cannot create
# duplicate `class` / `explanation` columns as pd.concat(axis=1) would.
# (`class` is a Python keyword, hence the dict unpacking.)
df_abstract_patents = df_abstract_patents.assign(
  **{'class': patent_class, 'explanation': patent_explanation}
)
df_abstract_patents

2023-11-10 18:07:26,430 - DEBUG - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'Follow are several subsectors and their definitions\n- Artificial Intelligence; Big Data and Analytics: Artificial Intelligence companies offer products and services modeled on computer systems that simulate human cognition. These systems can sense their environment, learn, think, and respond to stimuli in a way similar to humans.Big Data and Analytics as a sub-sector refers to companies that provide solutions with the core function of identifying patterns and trends from large volumes / sets of data that cannot be processed by traditional database and analysis software.. The keywords are: Automated intelligence, assisted intelligence, augmented Intelligence, autonomous intelligence, supervised learning, unsupervised learning, adaptive systems, computer vision, natural language processing, natural language generation,

Unnamed: 0,abstract,prompt,class,explanation
0,"A method, and a mobile device adapted thereto,...",Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,This patent abstract falls under the subsecto...
1,A dumbbell with a selectable number of weight ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a dumbbell wit...
2,A virtual assistant AI system that may be conn...,Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,The patent abstract describes a virtual assis...
3,Connector assemblies that are separate from me...,Follow are several subsectors and their defini...,Medical Technology; Medical devices,This patent abstract describes connector asse...
4,"An LED tube lamp, comprising a lamp tube, whic...",Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an LED tube la...
...,...,...,...,...
95,A filter mask for use in a flow cytometer incl...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a filter mask ...
96,A method for controlling a crane component of ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,The patent abstract describes a method for co...
97,A camera head is provided that includes a firs...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a camera head ...
98,An organic light emitting diode display includ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an organic lig...


In [95]:
# Persist the classified patents twice, as a spreadsheet and as CSV, without the index.
predict_xlsx = os.path.join(paths.data_processed, 'predict.xlsx')
predict_csv = os.path.join(paths.data_processed, 'predict.csv')
df_abstract_patents.to_excel(predict_xlsx, index=False)
df_abstract_patents.to_csv(predict_csv, index=False)

In [96]:
# Keep the model's label as returned in `predict`; a later cell rewrites
# unrecognised `class` values to 'others', and the metrics cell compares the two.
df_abstract_patents['predict'] = df_abstract_patents['class']

In [97]:
# Display the frame, which now carries the `predict` column.
df_abstract_patents

Unnamed: 0,abstract,prompt,class,explanation,predict
0,"A method, and a mobile device adapted thereto,...",Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,This patent abstract falls under the subsecto...,Artificial Intelligence; Big Data and Analytics
1,A dumbbell with a selectable number of weight ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a dumbbell wit...,Advanced Manufacturing and Robotics
2,A virtual assistant AI system that may be conn...,Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,The patent abstract describes a virtual assis...,Artificial Intelligence; Big Data and Analytics
3,Connector assemblies that are separate from me...,Follow are several subsectors and their defini...,Medical Technology; Medical devices,This patent abstract describes connector asse...,Medical Technology; Medical devices
4,"An LED tube lamp, comprising a lamp tube, whic...",Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an LED tube la...,Advanced Manufacturing and Robotics
...,...,...,...,...,...
95,A filter mask for use in a flow cytometer incl...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a filter mask ...,Advanced Manufacturing and Robotics
96,A method for controlling a crane component of ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,The patent abstract describes a method for co...,Advanced Manufacturing and Robotics
97,A camera head is provided that includes a firs...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a camera head ...,Advanced Manufacturing and Robotics
98,An organic light emitting diode display includ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an organic lig...,Advanced Manufacturing and Robotics


In [102]:
# Collapse every label that is not one of the defined subsectors (or is the
# literal 'Others') into the single bucket 'others'.
sub_names = df_subsectors.subsector.unique()
class_labels = df_abstract_patents['class']
is_known_subsector = class_labels.isin(sub_names) & (class_labels != 'Others')
# Address the column by name and the rows by boolean mask: the previous
# iat[index, 2] treated index labels as row positions and hard-coded the
# column position of `class`.
df_abstract_patents.loc[~is_known_subsector, 'class'] = 'others'
df_abstract_patents

Unnamed: 0,abstract,prompt,class,explanation,predict
0,"A method, and a mobile device adapted thereto,...",Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,This patent abstract falls under the subsecto...,Artificial Intelligence; Big Data and Analytics
1,A dumbbell with a selectable number of weight ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a dumbbell wit...,Advanced Manufacturing and Robotics
2,A virtual assistant AI system that may be conn...,Follow are several subsectors and their defini...,Artificial Intelligence; Big Data and Analytics,The patent abstract describes a virtual assis...,Artificial Intelligence; Big Data and Analytics
3,Connector assemblies that are separate from me...,Follow are several subsectors and their defini...,Medical Technology; Medical devices,This patent abstract describes connector asse...,Medical Technology; Medical devices
4,"An LED tube lamp, comprising a lamp tube, whic...",Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an LED tube la...,Advanced Manufacturing and Robotics
...,...,...,...,...,...
95,A filter mask for use in a flow cytometer incl...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a filter mask ...,Advanced Manufacturing and Robotics
96,A method for controlling a crane component of ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,The patent abstract describes a method for co...,Advanced Manufacturing and Robotics
97,A camera head is provided that includes a firs...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes a camera head ...,Advanced Manufacturing and Robotics
98,An organic light emitting diode display includ...,Follow are several subsectors and their defini...,Advanced Manufacturing and Robotics,This patent abstract describes an organic lig...,Advanced Manufacturing and Robotics


In [114]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# NOTE(review): both columns originate from the same model answer. `predict` is a
# copy of `class` taken before unrecognised labels were rewritten to 'others', so
# these scores measure how often the model returned a known subsector name, not
# agreement with independent ground truth.
y = df_abstract_patents['class']
y_pred = df_abstract_patents['predict']

# None, micro, macro, weighted
metric = 'macro'

# Some labels occur in only one of the two columns, which makes their per-class
# precision or recall undefined. zero_division=0 scores them as 0 (the same value
# the default produces) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average=metric, zero_division=0)
recall = recall_score(y, y_pred, average=metric, zero_division=0)
f1 = f1_score(y, y_pred, average=metric, zero_division=0)
cm = confusion_matrix(y, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)

Accuracy: 0.91
Precision: 0.6111111111111112
Recall: 0.6111111111111112
F1 Score: 0.6111111111111112
Confusion Matrix:
 [[61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  8  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
