In [1]:
import os
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from src.utils.UsefulPaths import Paths
from src.utils.SpacyUtils import SpacyUtil
import numpy as np
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
paths = Paths()

In [3]:
df_subsectors = pd.read_csv(os.path.join(paths.data_processed, 'subsector.csv'))
df_subsectors = df_subsectors.fillna('')

df_subsectors = df_subsectors[:-1]

In [4]:
df_subsectors

Unnamed: 0,subsector,definition,keywords,old_name,does_include,does_not_include,definition_preprocessed,keywords_preprocessed,does_include_preprocessed,does_not_include_preprocessed,token_definition,token_keywords,token_does_include,token_does_not_include
0,Artificial Intelligence; Big Data and Analytics,Artificial Intelligence companies offer produc...,"Automated intelligence, assisted intelligence,...","Artificial Intelligence, Big Data and Analytics",,,artificial intelligence company offer product ...,automated intelligence assist intelligence aug...,,,73,62,0,0
1,Advanced Manufacturing and Robotics,"In comparison to Traditional Manufacturing, Ad...","3d printing, industrial IoT, internet of thin...",Advanced Manufacturing and Robotics,"Autonomous driving, Industrial robots, Industr...",Traditional manufacturing machines,in comparison to traditional manufacturing adv...,3d printing industrial iot internet of thing...,autonomous driving industrial robot industrial...,traditional manufacturing machine,29,21,27,3
2,Clean Technology,Cleantech or clean technology is an umbrella t...,"Clean energy, and other forms of environmental...",Cleantech,,"oil and gas, petrochemicals",cleantech or clean technology be an umbrella t...,clean energy and other form of environmental a...,,oil and gas petrochemical,67,30,0,4
3,Financial Technology,Describes a business that aims at providing fi...,"Insurance Tech, Risk Management, Trading, Port...",Fintech,,"Brick & Mortar banks, Old brick and mortar Ins...",describe a business that aim at provide financ...,insurance tech risk management trading portfol...,,brick mortar bank old brick and mortar insuran...,65,21,0,9
4,Blockchain,Companies that develop applications using bloc...,"Distributed ledgers, Digital Mining,Cryptocurr...",Blockchain,,,company that develop application use blockchai...,distribute ledger digital mining cryptocurrenc...,,,52,26,0,0
5,Cybersecurity,"Cybersecurity is the body of technologies, pro...","cyber security, network security, data securit...",Cybersecurity,"Application security, Information security, Ne...",,cybersecurity be the body of technology proces...,cyber security network security data security ...,application security information security netw...,,42,15,17,0
6,Agriculture Technology,Technologies to help the agriculture industry ...,"agritech, crop optimization, farming automatio...",Agtech,Precision agriculture - PA is an approach to f...,"Farms, Vineyards, Coffee roasters, Beverages",technology to help the agriculture industry to...,agritech crop optimization farming automation ...,precision agriculture pa be an approach to far...,farms vineyards coffee roaster beverages,84,9,62,5
7,New Food,New Food includes technologies that can be lev...,"artificial meat, Alternative protein, Plant-ba...",New Food,,"alt- proteins based skincare products, cannabi...",new food include technology that can be levera...,artificial meat alternative protein plant base...,,alt- protein base skincare product cannabis re...,63,52,0,27
8,Advertising Technology,Advertising technology - different types of an...,"Conversion/optimization, Email marketing, Mobi...",Adtech,,Companies whose products and services are not ...,advertising technology different type of analy...,conversion optimization email marketing mobile...,,company whose product and service be not focus...,50,38,0,54
9,Blue Economy,"Blue economy is the ""sustainable use of ocean ...","ocean sustainability, aquaculture, seafloor ma...",Blue Economy,,,blue economy be the sustainable use of ocean r...,ocean sustainability aquaculture seafloor mapp...,,,53,27,0,0


In [5]:
all_subsectors_names = df_subsectors['subsector'].tolist()
all_subsectors_names = ''.join(all_subsectors_names)
all_subsectors_names = all_subsectors_names.replace(';', ',')
all_subsectors_names

'Artificial Intelligence, Big Data and AnalyticsAdvanced Manufacturing and RoboticsClean TechnologyFinancial TechnologyBlockchainCybersecurityAgriculture TechnologyNew FoodAdvertising TechnologyBlue EconomyDigital MediaGamingAugmented Reality, Virtual RealityEducational TechnologyIndustry 4.0Biopharmaceutical, BiotechonologyMedical Technology, Medical devices'

In [6]:
df_abstract_patents = pd.read_csv(os.path.join(paths.data_processed, 'abstract_patents_sliced_15000.csv'))

In [7]:
slice_number = 5
df_sliced = df_abstract_patents.loc[:slice_number, ['abstract']].copy()

spacy_util = SpacyUtil(model='en_core_web_sm', lemma=True, remove_stopwords=True, lower=True, remove_numbers=False)

df_sliced['sentence_abstract'] = df_sliced['abstract'].apply(spacy_util.preprocess_text)
# df_sliced['sentence_abstract'] = 'What subsector definition best describe this patent abstract "' + df_sliced['sentence_abstract'] + '"'

df_sliced['sentence_keywords'] = df_sliced['abstract'].apply(lambda text: spacy_util.extract_keywords(text=text, max_keywords=100, use_noun=True, use_adj=True, use_verb=True))
# df_sliced['sentence_keywords'] = 'This keywords  "' + df_sliced['sentence_keywords'] + '" describe which of these types of subsectors "' + all_subsectors_names + '"'

df_sliced

Unnamed: 0,abstract,sentence_abstract,sentence_keywords
0,"A method, and a mobile device adapted thereto,...",method mobile device adapt thereto verify user...,"user, input, instruction, method, mobile, devi..."
1,A dumbbell with a selectable number of weight ...,dumbbell selectable number weight disk include...,"weight, disk, handle, connect, arrangement, se..."
2,A virtual assistant AI system that may be conn...,virtual assistant ai system connect wide varie...,"account, change, virtual, assistant, user, sys..."
3,Connector assemblies that are separate from me...,connector assembly separate medical lead exten...,"lead, connector, extension, assembly, include,..."
4,"An LED tube lamp, comprising a lamp tube, whic...",led tube lamp comprise lamp tube include light...,"light, portion, lead, include, reinforce, conn..."
5,A packaged semiconductor device includes an in...,packaged semiconductor device include insulati...,"circuit, communication, material, semiconducto..."


In [8]:
# all-mpnet-base-v2
# msmarco-distilbert-dot-v5
# distilbert-base-uncased
# all-MiniLM-L6-v2
# all-MiniLM-L12-v2
# microsoft/mpnet-base
embedder = SentenceTransformer('all-mpnet-base-v2')

2023-11-09 20:21:06,354 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-11-09 20:21:09,126 - INFO - Use pytorch device: cpu


In [9]:
max_tokens = embedder.get_max_seq_length()
print(f"Maximum tokens allowed: {max_tokens}")

Maximum tokens allowed: 384


In [10]:
def get_corpus_id_from_element(semantic_list, i):
    return semantic_list[i].get('corpus_id')

In [11]:
df_semantic_results = pd.DataFrame({'abstract': df_sliced.abstract})

In [12]:
# Definition semantic search
sentences = list(df_sliced.sentence_abstract)
sentences_embeddings = embedder.encode(sentences, normalize_embeddings=True)

corpus = list(df_subsectors['definition_preprocessed'])
corpus_embeddings = embedder.encode(corpus, normalize_embeddings=True)

# Because using normalized vectors, dot_score is a better function instead of cos_sim
df = pd.DataFrame({'definition_preprocessed': util.semantic_search(sentences_embeddings, corpus_embeddings, top_k=2, score_function=util.dot_score)})
    
df['highest_0'] = df['definition_preprocessed'].apply(lambda x: get_corpus_id_from_element(x, 0))
df['highest_1'] = df['definition_preprocessed'].apply(lambda x: get_corpus_id_from_element(x, 1))
    
corpus_id_to_subsector = pd.Series(df_subsectors.subsector.values, index=df_subsectors.index).to_dict()
    
df[f'definition_1'] = df['highest_0'].map(corpus_id_to_subsector)
df[f'definition_2'] = df['highest_1'].map(corpus_id_to_subsector)
    
df = df.drop(['highest_0', 'highest_1'], axis=1)
    
df_semantic_results = pd.concat([df_semantic_results, df], axis=1)

df

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,definition_preprocessed,definition_1,definition_2
0,"[{'corpus_id': 4, 'score': 0.3175324499607086}...",Blockchain,Digital Media
1,"[{'corpus_id': 16, 'score': 0.1903858929872512...",Medical Technology; Medical devices,Blockchain
2,"[{'corpus_id': 0, 'score': 0.5081636905670166}...",Artificial Intelligence; Big Data and Analytics,Industry 4.0
3,"[{'corpus_id': 16, 'score': 0.3097402155399322...",Medical Technology; Medical devices,Advanced Manufacturing and Robotics
4,"[{'corpus_id': 13, 'score': 0.2711642384529114...",Educational Technology,Blockchain
5,"[{'corpus_id': 1, 'score': 0.3054541349411011}...",Advanced Manufacturing and Robotics,Digital Media


In [13]:
# Keywords semantic search
sentences = list(df_sliced.sentence_keywords)
sentences_embeddings = embedder.encode(sentences, normalize_embeddings=True)

corpus = list(df_subsectors['keywords_preprocessed'])
corpus_embeddings = embedder.encode(corpus, normalize_embeddings=True)

# Because using normalized vectors, dot_score is a better function instead of cos_sim
df = pd.DataFrame({'keywords_preprocessed': util.semantic_search(sentences_embeddings, corpus_embeddings, top_k=2, score_function=util.dot_score)})
    
df['highest_0'] = df['keywords_preprocessed'].apply(lambda x: get_corpus_id_from_element(x, 0))
df['highest_1'] = df['keywords_preprocessed'].apply(lambda x: get_corpus_id_from_element(x, 1))
    
corpus_id_to_subsector = pd.Series(df_subsectors.subsector.values, index=df_subsectors.index).to_dict()
    
df[f'keywords_1'] = df['highest_0'].map(corpus_id_to_subsector)
df[f'keywords_2'] = df['highest_1'].map(corpus_id_to_subsector)
    
df = df.drop(['highest_0', 'highest_1'], axis=1)
    
df_semantic_results = pd.concat([df_semantic_results, df], axis=1)

df

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,keywords_preprocessed,keywords_1,keywords_2
0,"[{'corpus_id': 0, 'score': 0.4092501997947693}...",Artificial Intelligence; Big Data and Analytics,Cybersecurity
1,"[{'corpus_id': 7, 'score': 0.3395450711250305}...",New Food,Artificial Intelligence; Big Data and Analytics
2,"[{'corpus_id': 0, 'score': 0.3899620771408081}...",Artificial Intelligence; Big Data and Analytics,New Food
3,"[{'corpus_id': 7, 'score': 0.39835429191589355...",New Food,Medical Technology; Medical devices
4,"[{'corpus_id': 7, 'score': 0.33039551973342896...",New Food,Medical Technology; Medical devices
5,"[{'corpus_id': 7, 'score': 0.34602975845336914...",New Food,Advanced Manufacturing and Robotics


In [14]:
df_semantic_results

Unnamed: 0,abstract,definition_preprocessed,definition_1,definition_2,keywords_preprocessed,keywords_1,keywords_2
0,"A method, and a mobile device adapted thereto,...","[{'corpus_id': 4, 'score': 0.3175324499607086}...",Blockchain,Digital Media,"[{'corpus_id': 0, 'score': 0.4092501997947693}...",Artificial Intelligence; Big Data and Analytics,Cybersecurity
1,A dumbbell with a selectable number of weight ...,"[{'corpus_id': 16, 'score': 0.1903858929872512...",Medical Technology; Medical devices,Blockchain,"[{'corpus_id': 7, 'score': 0.3395450711250305}...",New Food,Artificial Intelligence; Big Data and Analytics
2,A virtual assistant AI system that may be conn...,"[{'corpus_id': 0, 'score': 0.5081636905670166}...",Artificial Intelligence; Big Data and Analytics,Industry 4.0,"[{'corpus_id': 0, 'score': 0.3899620771408081}...",Artificial Intelligence; Big Data and Analytics,New Food
3,Connector assemblies that are separate from me...,"[{'corpus_id': 16, 'score': 0.3097402155399322...",Medical Technology; Medical devices,Advanced Manufacturing and Robotics,"[{'corpus_id': 7, 'score': 0.39835429191589355...",New Food,Medical Technology; Medical devices
4,"An LED tube lamp, comprising a lamp tube, whic...","[{'corpus_id': 13, 'score': 0.2711642384529114...",Educational Technology,Blockchain,"[{'corpus_id': 7, 'score': 0.33039551973342896...",New Food,Medical Technology; Medical devices
5,A packaged semiconductor device includes an in...,"[{'corpus_id': 1, 'score': 0.3054541349411011}...",Advanced Manufacturing and Robotics,Digital Media,"[{'corpus_id': 7, 'score': 0.34602975845336914...",New Food,Advanced Manufacturing and Robotics


In [15]:
predict = []
others_threshold = 0.45
for index, row in df_semantic_results.iterrows():
    definition = {row['definition_1'], row['definition_2']}
    keywords = {row['keywords_1'], row['keywords_2']}
    intersection = definition & keywords
    if len(intersection) > 0:
        predict.append(intersection.pop())
    else:
        definition_best_score = row['definition_preprocessed'][0].get('score')
        keywords_best_score = row['keywords_preprocessed'][0].get('score')
        if definition_best_score < others_threshold and keywords_best_score < others_threshold:
            predict.append('Others')
        else:
            if definition_best_score >= keywords_best_score:
                predict.append(row['definition_1'])
            else:
                predict.append(row['keywords_1'])
    
df_class = pd.DataFrame({'class': predict})

df_semantic_results = pd.concat([df_semantic_results, df_class], axis=1)       
df_semantic_results
        

Unnamed: 0,abstract,definition_preprocessed,definition_1,definition_2,keywords_preprocessed,keywords_1,keywords_2,class
0,"A method, and a mobile device adapted thereto,...","[{'corpus_id': 4, 'score': 0.3175324499607086}...",Blockchain,Digital Media,"[{'corpus_id': 0, 'score': 0.4092501997947693}...",Artificial Intelligence; Big Data and Analytics,Cybersecurity,Others
1,A dumbbell with a selectable number of weight ...,"[{'corpus_id': 16, 'score': 0.1903858929872512...",Medical Technology; Medical devices,Blockchain,"[{'corpus_id': 7, 'score': 0.3395450711250305}...",New Food,Artificial Intelligence; Big Data and Analytics,Others
2,A virtual assistant AI system that may be conn...,"[{'corpus_id': 0, 'score': 0.5081636905670166}...",Artificial Intelligence; Big Data and Analytics,Industry 4.0,"[{'corpus_id': 0, 'score': 0.3899620771408081}...",Artificial Intelligence; Big Data and Analytics,New Food,Artificial Intelligence; Big Data and Analytics
3,Connector assemblies that are separate from me...,"[{'corpus_id': 16, 'score': 0.3097402155399322...",Medical Technology; Medical devices,Advanced Manufacturing and Robotics,"[{'corpus_id': 7, 'score': 0.39835429191589355...",New Food,Medical Technology; Medical devices,Medical Technology; Medical devices
4,"An LED tube lamp, comprising a lamp tube, whic...","[{'corpus_id': 13, 'score': 0.2711642384529114...",Educational Technology,Blockchain,"[{'corpus_id': 7, 'score': 0.33039551973342896...",New Food,Medical Technology; Medical devices,Others
5,A packaged semiconductor device includes an in...,"[{'corpus_id': 1, 'score': 0.3054541349411011}...",Advanced Manufacturing and Robotics,Digital Media,"[{'corpus_id': 7, 'score': 0.34602975845336914...",New Food,Advanced Manufacturing and Robotics,Advanced Manufacturing and Robotics


In [16]:
df_semantic_results.loc[:, ['abstract', 'class']]

Unnamed: 0,abstract,class
0,"A method, and a mobile device adapted thereto,...",Others
1,A dumbbell with a selectable number of weight ...,Others
2,A virtual assistant AI system that may be conn...,Artificial Intelligence; Big Data and Analytics
3,Connector assemblies that are separate from me...,Medical Technology; Medical devices
4,"An LED tube lamp, comprising a lamp tube, whic...",Others
5,A packaged semiconductor device includes an in...,Advanced Manufacturing and Robotics


In [20]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

2023-11-09 20:22:00,944 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json HTTP/1.1" 200 0
2023-11-09 20:22:00,946 - DEBUG - Attempting to acquire lock 1516170758096 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\f4989f072a7f517d01d479eb1685c6e50e014f88.lock
2023-11-09 20:22:00,947 - DEBUG - Lock 1516170758096 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\f4989f072a7f517d01d479eb1685c6e50e014f88.lock
2023-11-09 20:22:01,121 - DEBUG - https://huggingface.co:443 "GET /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json HTTP/1.1" 200 571


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
2023-11-09 20:22:01,129 - DEBUG - Attempting to release lock 1516170758096 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\f4989f072a7f517d01d479eb1685c6e50e014f88.lock
2023-11-09 20:22:01,129 - DEBUG - Lock 1516170758096 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\f4989f072a7f517d01d479eb1685c6e50e014f88.lock
2023-11-09 20:22:01,304 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/model.safetensors HTTP/1.1" 404 0
2023-11-09 20:22:01,475 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/model.safetensors.index.json HTTP/1.1" 404 0
2023-11-09 

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

2023-11-09 20:22:01,994 - DEBUG - Attempting to release lock 1518264021200 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\520ee2fc9a8659d53be1f9f0a4502c151fc24775.lock
2023-11-09 20:22:01,995 - DEBUG - Lock 1518264021200 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\520ee2fc9a8659d53be1f9f0a4502c151fc24775.lock


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

2023-11-09 20:22:02,166 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin HTTP/1.1" 302 0
2023-11-09 20:22:02,168 - DEBUG - Attempting to acquire lock 1518431138128 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\2f237251ac3ecb3bcbd8978b3eb7b55b5e83c06cd5224b276d2d8462773488c8.lock
2023-11-09 20:22:02,168 - DEBUG - Lock 1518431138128 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\2f237251ac3ecb3bcbd8978b3eb7b55b5e83c06cd5224b276d2d8462773488c8.lock
2023-11-09 20:22:02,170 - DEBUG - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443
2023-11-09 20:22:02,929 - DEBUG - https://cdn-lfs.huggingface.co:443 "GET /repos/ea/00/ea00943d992c7851ad9f4f4bd094a0397fb5087e0f7cba4ef003018963ea07e3/2f237251ac3ecb3bcbd8978b3eb7b55b5e83c06cd5224b276d2d8462773488c8?response-content-disposition=attachment%3B+filename*%3DUTF-8%

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

2023-11-09 20:24:09,808 - DEBUG - Attempting to release lock 1518431138128 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\2f237251ac3ecb3bcbd8978b3eb7b55b5e83c06cd5224b276d2d8462773488c8.lock
2023-11-09 20:24:09,808 - DEBUG - Lock 1518431138128 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\2f237251ac3ecb3bcbd8978b3eb7b55b5e83c06cd5224b276d2d8462773488c8.lock
2023-11-09 20:24:09,971 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin HTTP/1.1" 302 0
2023-11-09 20:24:09,972 - DEBUG - Attempting to acquire lock 1518435256848 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\d9e54fbaabfe2c5d00d2a642398c5959c1eb31e9d9d40c80bd5d7c2c4067d3e1.lock
2023-11-09 20:24:09,973 - DEBUG - Lock 1518435256848 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Ins

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

2023-11-09 20:25:14,353 - DEBUG - Attempting to release lock 1518435256848 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\d9e54fbaabfe2c5d00d2a642398c5959c1eb31e9d9d40c80bd5d7c2c4067d3e1.lock
2023-11-09 20:25:14,366 - DEBUG - Lock 1518435256848 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\d9e54fbaabfe2c5d00d2a642398c5959c1eb31e9d9d40c80bd5d7c2c4067d3e1.lock


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023-11-09 20:27:17,091 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2023-11-09 20:27:17,098 - DEBUG - Attempting to acquire lock 1516170766736 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\fae21d778664a82dfdc92143da539f99e7b2309a.lock
2023-11-09 20:27:17,100 - DEBUG - Lock 1516170766736 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\fae21d778664a82dfdc92143da539f99e7b2309a.lock
2023-11-09 20:27:17,273 - DEBUG - https://huggingface.co:443 "GET /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json HTTP/1.

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

2023-11-09 20:27:17,290 - DEBUG - Attempting to release lock 1516170766736 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\fae21d778664a82dfdc92143da539f99e7b2309a.lock
2023-11-09 20:27:17,290 - DEBUG - Lock 1516170766736 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\fae21d778664a82dfdc92143da539f99e7b2309a.lock
2023-11-09 20:27:17,531 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model HTTP/1.1" 302 0
2023-11-09 20:27:17,534 - DEBUG - Attempting to acquire lock 1518187664400 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055.lock
2023-11-09 20:27:17,536 - DEBUG - Lock 1518187664400 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

2023-11-09 20:27:17,704 - DEBUG - Attempting to release lock 1518187664400 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055.lock
2023-11-09 20:27:17,705 - DEBUG - Lock 1518187664400 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055.lock
2023-11-09 20:27:17,876 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json HTTP/1.1" 200 0
2023-11-09 20:27:17,878 - DEBUG - Attempting to acquire lock 1518443550608 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\43e6daf936dc0f953cb867ec864adab78f92d9ce.lock
2023-11-09 20:27:17,879 - DEBUG - Lock 1518443550608 acquired on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\43e6daf936dc0f953cb867ec8

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

2023-11-09 20:27:18,798 - DEBUG - Attempting to release lock 1518443550608 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\43e6daf936dc0f953cb867ec864adab78f92d9ce.lock
2023-11-09 20:27:18,799 - DEBUG - Lock 1518443550608 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\43e6daf936dc0f953cb867ec864adab78f92d9ce.lock
2023-11-09 20:27:18,974 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/added_tokens.json HTTP/1.1" 404 0
2023-11-09 20:27:19,205 - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2023-11-09 20:27:19,209 - DEBUG - Attempting to acquire lock 1518443152912 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\a52c50a199269393cd1548c7e6a77a654bd2001b.lock
2023-11-09 20:27:19,210 - DEBUG - Lock 1518443152912 acquired on C:\

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

2023-11-09 20:27:19,401 - DEBUG - Attempting to release lock 1518443152912 on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\a52c50a199269393cd1548c7e6a77a654bd2001b.lock
2023-11-09 20:27:19,403 - DEBUG - Lock 1518443152912 released on C:\Users\Thiago/.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\blobs\a52c50a199269393cd1548c7e6a77a654bd2001b.lock
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [21]:
class_zeroshot_list = []
class_score_zeroshot_list = []
# labels = df_subsectors['subsector'].values
for index, row in df_semantic_results.iterrows():
    labels = {row['definition_1'], row['definition_2'], row['keywords_1'], row['keywords_2']}
    labels = list(labels)
    
    patent_zero_shot_class = classifier(row['abstract'], labels, multi_label=False)
    class_zeroshot_list.append(patent_zero_shot_class['labels'][0])
    class_score_zeroshot_list.append(patent_zero_shot_class['scores'][0])

df_zero_shot = pd.DataFrame({'class_zeroshot': class_zeroshot_list, 'class_zeroshot_score': class_score_zeroshot_list})
df_zero_shot

Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,class_zeroshot,class_zeroshot_score
0,Digital Media,0.7
1,Blockchain,0.75
2,New Food,0.58
3,Advanced Manufacturing and Robotics,0.67
4,Blockchain,0.53
5,Digital Media,0.66
