# SLR Project
- Author: Ambreen Hanif
- Date: 28/07/2023


In [3]:
import hdbscan
import numpy as np
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Import Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os 
# for visualizations
import plotly.express as px

# import transformers
import torch
import tqdm as notebook_tqdm

from transformers import AutoTokenizer, AutoModel

import logging

In [5]:
os.environ['CURL_CA_BUNDLE'] = ''  # to avoid hugging face ssl error.

In [6]:
logging.basicConfig(level=logging.INFO) # to get the log of the model info

# Read Data File

In [7]:
data = pd.read_csv('my_zotero_library.csv', sep=',')
data.columns= data.columns.str.lower()
data.columns = data.columns.str.replace(' ','_')
data.columns

Index(['key', 'item_type', 'publication_year', 'author', 'title',
       'publication_title', 'isbn', 'issn', 'doi', 'url', 'abstract_note',
       'date', 'date_added', 'date_modified', 'access_date', 'pages',
       'num_pages', 'issue', 'volume', 'number_of_volumes',
       'journal_abbreviation', 'short_title', 'series', 'series_number',
       'series_text', 'series_title', 'publisher', 'place', 'language',
       'rights', 'type', 'archive', 'archive_location', 'library_catalog',
       'call_number', 'extra', 'notes', 'file_attachments', 'link_attachments',
       'manual_tags', 'automatic_tags', 'editor', 'series_editor',
       'translator', 'contributor', 'attorney_agent', 'book_author',
       'cast_member', 'commenter', 'composer', 'cosponsor', 'counsel',
       'interviewer', 'producer', 'recipient', 'reviewed_author',
       'scriptwriter', 'words_by', 'guest', 'number', 'edition',
       'running_time', 'scale', 'medium', 'artwork_size', 'filing_date',
       'applicatio

In [8]:
data['item_type'].unique()
data.shape

(948, 87)

As this is an SLR. 
I will filter 
JournalArticles, conferencepaper, book, thesis and preprints

In [9]:
data['item_type'].unique()

array(['webpage', 'journalArticle', 'preprint', 'conferencePaper', 'book',
       'presentation', 'report', 'newspaperArticle', 'computerProgram',
       'magazineArticle', 'blogPost', 'document', 'bookSection', 'thesis'],
      dtype=object)

In [10]:
filtered_data = data.loc[data['item_type'].isin(['journalArticle','conferencePaper','book','thesis','preprints'])]
filtered_data.shape

(596, 87)

In [11]:
filtered_data.head(5)

Unnamed: 0,key,item_type,publication_year,author,title,publication_title,isbn,issn,doi,url,...,programming_language,version,system,code,code_number,section,session,committee,history,legislative_body
2,ZNHCCZDX,journalArticle,1995.0,"Andrews, Robert; Diederich, Joachim; Tickle, A...",Survey and critique of techniques for extracti...,Knowledge-Based Systems,,0950-7051,10.1016/0950-7051(96)81920-4,https://www.sciencedirect.com/science/article/...,...,,,,,,,,,,
4,9A7JYJD9,conferencePaper,2014.0,"Zeiler, Matthew D.; Fergus, Rob",Visualizing and Understanding Convolutional Ne...,Computer Vision – ECCV 2014,978-3-319-10590-1,,10.1007/978-3-319-10590-1_53,,...,,,,,,,,,,
5,ZJIAEQWK,journalArticle,2022.0,"Zhao, Zhenge; Xu, Panpan; Scheidegger, Carlos;...",Human-in-the-loop Extraction of Interpretable ...,IEEE Transactions on Visualization and Compute...,,1941-0506,10.1109/TVCG.2021.3114837,,...,,,,,,,,,,
8,PSW86ZBQ,journalArticle,2019.0,"Spinner, Thilo; Schlegel, Udo; Schäfer, Hanna;...",explAIner: A Visual Analytics Framework for In...,IEEE Transactions on Visualization and Compute...,,"1077-2626, 1941-0506, 2160-9306",10.1109/TVCG.2019.2934629,http://arxiv.org/abs/1908.00087,...,,,,,,,,,,
10,JQDL5MJY,journalArticle,,"Mueller, Shane T; Hoffman, Robert R; Clancey, ...",Explanation in Human-AI Systems: A Literature ...,,,,,,...,,,,,,,,,,


In [12]:
select_column_data = filtered_data[['item_type',
                                                   'publication_year',
                                                   'author',
                                                   'title',
                                                   'issn',
                                                   'doi',
                                                   'url',
                                                   'abstract_note',
                                                   'date',
                                                   'date_added',
                                                   'pages',
                                                   'num_pages',
                                                   'issue',
                                                   'volume',
                                                   'publisher',
                                                   'library_catalog',
                                                   'extra',
                                                   'file_attachments',
                                                   'conference_name',
                                                   'manual_tags',
                                                   'automatic_tags',
                                                   'editor',
                                                   'book_author',
                                                   ]]

In [13]:
select_column_data.columns

Index(['item_type', 'publication_year', 'author', 'title', 'issn', 'doi',
       'url', 'abstract_note', 'date', 'date_added', 'pages', 'num_pages',
       'issue', 'volume', 'publisher', 'library_catalog', 'extra',
       'file_attachments', 'conference_name', 'manual_tags', 'automatic_tags',
       'editor', 'book_author'],
      dtype='object')

In [14]:
select_column_data['publication_year'].unique()

array([1995., 2014., 2022., 2019.,   nan, 2018., 2021., 2023., 2010.,
       2017., 1994., 2013., 1990., 2020., 2011., 2004., 1953., 2016.,
       2001., 1989., 2003., 2006., 2015., 2008., 2002., 2007., 2009.,
       2012., 1999., 2000., 1993., 1974., 1988.])

In [15]:
select_column_data['publication_year'].fillna(0,inplace=True)
select_column_data['publication_year'].unique()


array([1995., 2014., 2022., 2019.,    0., 2018., 2021., 2023., 2010.,
       2017., 1994., 2013., 1990., 2020., 2011., 2004., 1953., 2016.,
       2001., 1989., 2003., 2006., 2015., 2008., 2002., 2007., 2009.,
       2012., 1999., 2000., 1993., 1974., 1988.])

In [16]:
select_column_data[['publication_year','num_pages']].fillna(0,inplace=True)
select_column_data.astype({'publication_year':'int32'                  
                      })

Unnamed: 0,item_type,publication_year,author,title,issn,doi,url,abstract_note,date,date_added,...,volume,publisher,library_catalog,extra,file_attachments,conference_name,manual_tags,automatic_tags,editor,book_author
2,journalArticle,1995,"Andrews, Robert; Diederich, Joachim; Tickle, A...",Survey and critique of techniques for extracti...,0950-7051,10.1016/0950-7051(96)81920-4,https://www.sciencedirect.com/science/article/...,"It is becoming increasingly apparent that, wit...",1995-12-01,2023-04-07 22:31:38,...,8,,ScienceDirect,726 citations (Crossref) [2023-04-08],C:\Users\ambreen.hanif\Zotero\storage\B49K6PUG...,,notion,,,
4,conferencePaper,2014,"Zeiler, Matthew D.; Fergus, Rob",Visualizing and Understanding Convolutional Ne...,,10.1007/978-3-319-10590-1_53,,Large Convolutional Network models have recent...,2014,2023-04-10 23:20:20,...,,Springer International Publishing,Springer Link,3991 citations (Crossref) [2023-04-11],; C:\Users\ambreen.hanif\Zotero\storage\3E8L2I...,,notion,,"Fleet, David; Pajdla, Tomas; Schiele, Bernt; T...",
5,journalArticle,2022,"Zhao, Zhenge; Xu, Panpan; Scheidegger, Carlos;...",Human-in-the-loop Extraction of Interpretable ...,1941-0506,10.1109/TVCG.2021.3114837,,The interpretation of deep neural networks (DN...,2022-01,2023-05-22 12:50:20,...,28,,IEEE Xplore,8 citations (Crossref) [2023-05-23] Conference...,C:\Users\ambreen.hanif\Zotero\storage\CUI37AB4...,,notion,Analytical models; Computational modeling; Dat...,,
8,journalArticle,2019,"Spinner, Thilo; Schlegel, Udo; Schäfer, Hanna;...",explAIner: A Visual Analytics Framework for In...,"1077-2626, 1941-0506, 2160-9306",10.1109/TVCG.2019.2934629,http://arxiv.org/abs/1908.00087,We propose a framework for interactive and exp...,2019,2023-04-04 00:00:29,...,,,arXiv.org,51 citations (Crossref) [2023-04-04] arXiv:190...,C:\Users\ambreen.hanif\Zotero\storage\M6MRIGJ4...,,notion,,,
10,journalArticle,0,"Mueller, Shane T; Hoffman, Robert R; Clancey, ...",Explanation in Human-AI Systems: A Literature ...,,,,,,2023-04-06 20:14:14,...,,,Zotero,,C:\Users\ambreen.hanif\Zotero\storage\Y829FWPV...,,notion,⛔ No DOI found,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,journalArticle,0,"Touvron, Hugo; Martin, Louis; Stone, Kevin",Llama 2: Open Foundation and Fine-Tuned Chat M...,,,,"In this work, we develop and release Llama 2, ...",,2023-07-19 05:48:26,...,,,Zotero,,C:\Users\ambreen.hanif\Zotero\storage\2YRT47YB...,,,,,
942,journalArticle,2009,"Kostakos, Vassilis",Temporal graphs,03784371,10.1016/j.physa.2008.11.021,https://linkinghub.elsevier.com/retrieve/pii/S...,"We introduce the idea of temporal graphs, a re...",2009-03,2023-07-20 04:47:44,...,388,,DOI.org (Crossref),,C:\Users\ambreen.hanif\Zotero\storage\T6T2GU3C...,,,,,
943,conferencePaper,2022,"Brugman, Simon; Sostak, Tomas; Patil, Pradyot;...",popmon: Analysis Package for Dataset Shift Det...,,10.25080/majora-212e5952-01d,https://conference.scipy.org/proceedings/scipy...,,2022,2023-07-20 23:58:39,...,,,DOI.org (Crossref),,C:\Users\ambreen.hanif\Zotero\storage\E9RL6LFY...,Python in Science Conference,,,,
944,conferencePaper,2020,"Yenicelik, David; Schmidt, Florian; Kilcher, Y...",How does BERT capture semantics? A closer look...,,10.18653/v1/2020.blackboxnlp-1.15,https://aclanthology.org/2020.blackboxnlp-1.15,The recent paradigm shift to contextual word e...,2020-11,2023-07-21 00:34:24,...,,Association for Computational Linguistics,ACLWeb,,C:\Users\ambreen.hanif\Zotero\storage\YTDQW98P...,BlackboxNLP 2020,,,,


In [17]:
select_column_data['first_author'] = select_column_data['author'].str.split(';').str[0]
select_column_data.head(5)

Unnamed: 0,item_type,publication_year,author,title,issn,doi,url,abstract_note,date,date_added,...,publisher,library_catalog,extra,file_attachments,conference_name,manual_tags,automatic_tags,editor,book_author,first_author
2,journalArticle,1995.0,"Andrews, Robert; Diederich, Joachim; Tickle, A...",Survey and critique of techniques for extracti...,0950-7051,10.1016/0950-7051(96)81920-4,https://www.sciencedirect.com/science/article/...,"It is becoming increasingly apparent that, wit...",1995-12-01,2023-04-07 22:31:38,...,,ScienceDirect,726 citations (Crossref) [2023-04-08],C:\Users\ambreen.hanif\Zotero\storage\B49K6PUG...,,notion,,,,"Andrews, Robert"
4,conferencePaper,2014.0,"Zeiler, Matthew D.; Fergus, Rob",Visualizing and Understanding Convolutional Ne...,,10.1007/978-3-319-10590-1_53,,Large Convolutional Network models have recent...,2014,2023-04-10 23:20:20,...,Springer International Publishing,Springer Link,3991 citations (Crossref) [2023-04-11],; C:\Users\ambreen.hanif\Zotero\storage\3E8L2I...,,notion,,"Fleet, David; Pajdla, Tomas; Schiele, Bernt; T...",,"Zeiler, Matthew D."
5,journalArticle,2022.0,"Zhao, Zhenge; Xu, Panpan; Scheidegger, Carlos;...",Human-in-the-loop Extraction of Interpretable ...,1941-0506,10.1109/TVCG.2021.3114837,,The interpretation of deep neural networks (DN...,2022-01,2023-05-22 12:50:20,...,,IEEE Xplore,8 citations (Crossref) [2023-05-23] Conference...,C:\Users\ambreen.hanif\Zotero\storage\CUI37AB4...,,notion,Analytical models; Computational modeling; Dat...,,,"Zhao, Zhenge"
8,journalArticle,2019.0,"Spinner, Thilo; Schlegel, Udo; Schäfer, Hanna;...",explAIner: A Visual Analytics Framework for In...,"1077-2626, 1941-0506, 2160-9306",10.1109/TVCG.2019.2934629,http://arxiv.org/abs/1908.00087,We propose a framework for interactive and exp...,2019,2023-04-04 00:00:29,...,,arXiv.org,51 citations (Crossref) [2023-04-04] arXiv:190...,C:\Users\ambreen.hanif\Zotero\storage\M6MRIGJ4...,,notion,,,,"Spinner, Thilo"
10,journalArticle,0.0,"Mueller, Shane T; Hoffman, Robert R; Clancey, ...",Explanation in Human-AI Systems: A Literature ...,,,,,,2023-04-06 20:14:14,...,,Zotero,,C:\Users\ambreen.hanif\Zotero\storage\Y829FWPV...,,notion,⛔ No DOI found,,,"Mueller, Shane T"


In [18]:
label_encoder = LabelEncoder()
select_column_data['item_type']= label_encoder.fit_transform(select_column_data['item_type'])
  
select_column_data['item_type'].unique()

array([2, 1, 0, 3])

In [19]:
select_column_data['publication_year'].unique()


array([1995., 2014., 2022., 2019.,    0., 2018., 2021., 2023., 2010.,
       2017., 1994., 2013., 1990., 2020., 2011., 2004., 1953., 2016.,
       2001., 1989., 2003., 2006., 2015., 2008., 2002., 2007., 2009.,
       2012., 1999., 2000., 1993., 1974., 1988.])

In [20]:
select_column_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 2 to 945
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   item_type         596 non-null    int32  
 1   publication_year  596 non-null    float64
 2   author            581 non-null    object 
 3   title             595 non-null    object 
 4   issn              211 non-null    object 
 5   doi               408 non-null    object 
 6   url               335 non-null    object 
 7   abstract_note     504 non-null    object 
 8   date              523 non-null    object 
 9   date_added        596 non-null    object 
 10  pages             419 non-null    object 
 11  num_pages         10 non-null     object 
 12  issue             162 non-null    object 
 13  volume            274 non-null    object 
 14  publisher         101 non-null    object 
 15  library_catalog   346 non-null    object 
 16  extra             439 non-null    object 
 17  fi

In [21]:
select_column_data['manual_tags'].unique()

array(['notion', nan, 'Financial Risk', 'Researcher App',
       'notion; read; xai; reinforcement learning; survey',
       'notion; XAI; survey; GNN', 'notion; Interpretable Features',
       'survey', 'notion; XAI', 'notion; first_pass',
       'notion; Explainable; knowledge graphs; crowdgraph; fauxtography',
       'notion; Knowledge Lake; Service', 'survey; first-pass',
       'firstpass'], dtype=object)

In [22]:
_filtered_data = filtered_data.dropna(axis=1)
_filtered_data.shape

(596, 4)

Seperate Numeric Columns

In [23]:
select_column_data.head(5)

Unnamed: 0,item_type,publication_year,author,title,issn,doi,url,abstract_note,date,date_added,...,publisher,library_catalog,extra,file_attachments,conference_name,manual_tags,automatic_tags,editor,book_author,first_author
2,2,1995.0,"Andrews, Robert; Diederich, Joachim; Tickle, A...",Survey and critique of techniques for extracti...,0950-7051,10.1016/0950-7051(96)81920-4,https://www.sciencedirect.com/science/article/...,"It is becoming increasingly apparent that, wit...",1995-12-01,2023-04-07 22:31:38,...,,ScienceDirect,726 citations (Crossref) [2023-04-08],C:\Users\ambreen.hanif\Zotero\storage\B49K6PUG...,,notion,,,,"Andrews, Robert"
4,1,2014.0,"Zeiler, Matthew D.; Fergus, Rob",Visualizing and Understanding Convolutional Ne...,,10.1007/978-3-319-10590-1_53,,Large Convolutional Network models have recent...,2014,2023-04-10 23:20:20,...,Springer International Publishing,Springer Link,3991 citations (Crossref) [2023-04-11],; C:\Users\ambreen.hanif\Zotero\storage\3E8L2I...,,notion,,"Fleet, David; Pajdla, Tomas; Schiele, Bernt; T...",,"Zeiler, Matthew D."
5,2,2022.0,"Zhao, Zhenge; Xu, Panpan; Scheidegger, Carlos;...",Human-in-the-loop Extraction of Interpretable ...,1941-0506,10.1109/TVCG.2021.3114837,,The interpretation of deep neural networks (DN...,2022-01,2023-05-22 12:50:20,...,,IEEE Xplore,8 citations (Crossref) [2023-05-23] Conference...,C:\Users\ambreen.hanif\Zotero\storage\CUI37AB4...,,notion,Analytical models; Computational modeling; Dat...,,,"Zhao, Zhenge"
8,2,2019.0,"Spinner, Thilo; Schlegel, Udo; Schäfer, Hanna;...",explAIner: A Visual Analytics Framework for In...,"1077-2626, 1941-0506, 2160-9306",10.1109/TVCG.2019.2934629,http://arxiv.org/abs/1908.00087,We propose a framework for interactive and exp...,2019,2023-04-04 00:00:29,...,,arXiv.org,51 citations (Crossref) [2023-04-04] arXiv:190...,C:\Users\ambreen.hanif\Zotero\storage\M6MRIGJ4...,,notion,,,,"Spinner, Thilo"
10,2,0.0,"Mueller, Shane T; Hoffman, Robert R; Clancey, ...",Explanation in Human-AI Systems: A Literature ...,,,,,,2023-04-06 20:14:14,...,,Zotero,,C:\Users\ambreen.hanif\Zotero\storage\Y829FWPV...,,notion,⛔ No DOI found,,,"Mueller, Shane T"


In [24]:
numerics = ['int16', 'int32', 'int64']
numeric_data = select_column_data.select_dtypes(include=numerics)
print( "Numeric column in input DataFrame is:\n", numeric_data.columns)

Numeric column in input DataFrame is:
 Index(['item_type'], dtype='object')


In [25]:
select_column_data['text_info']= select_column_data['title']+" "+ select_column_data['abstract_note']
select_column_data['text_info'].head(5)

2     Survey and critique of techniques for extracti...
4     Visualizing and Understanding Convolutional Ne...
5     Human-in-the-loop Extraction of Interpretable ...
8     explAIner: A Visual Analytics Framework for In...
10                                                  NaN
Name: text_info, dtype: object

In [35]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)



In [34]:
select_column_data = select_column_data[select_column_data['text_info'].notna()]
select_column_data['text_info'].head(5)

2     Survey and critique of techniques for extracti...
4     Visualizing and Understanding Convolutional Ne...
5     Human-in-the-loop Extraction of Interpretable ...
8     explAIner: A Visual Analytics Framework for In...
15    Interpretable Machine Learning in Healthcare T...
Name: text_info, dtype: object

In [54]:
print(select_column_data.size)

12600


In [47]:

# Sentences we want sentence embeddings for
#sentences = select_column_data['text_info'].to_numpy()
sentences = [str(i) for i in select_column_data['text_info'].values]
print(sentences)

['Survey and critique of techniques for extracting rules from trained artificial neural networks It is becoming increasingly apparent that, without some form of explanation capability, the full potential of trained artificial neural networks (ANNs) may not be realised. This survey gives an overview of techniques developed to redress this situation. Specifically, the survey focuses on mechanisms, procedures, and algorithms designed to insert knowledge into ANNs (knowledge initialisation), extract rules from trained ANNs (rule extraction), and utilise ANNs to refine existing rule bases (rule refinement). The survey also introduces a new taxonomy for classifying the various techniques, discusses their modus operandi, and delineates criteria for evaluating their efficacy.', 'Visualizing and Understanding Convolutional Networks Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark Krizhevsky et al. [18]. However there i

In [41]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')

Downloading (…)okenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 352kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.66MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.55MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 573/573 [00:00<00:00, 574kB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:36<00:00, 3.69MB/s] 


In [48]:
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')


In [49]:

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 6341787648 bytes.

In [None]:
# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

In [33]:
# create embeddings
#sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
embeddings = sentence_model.encode(select_column_data.text_info, show_progress_bar=True)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


KeyError: 68

# Input Tokenization using Bert


In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



In [29]:
select_column_data = select_column_data[select_column_data['title'].notna()]

In [None]:
# title embeddings 

def addMarks(x):
    return '[CLS] ' + x + ' [SEP]'

select_column_data['marked_title'] = select_column_data['title'].apply(lambda x:addMarks(x))
select_column_data['title_tokens'] = select_column_data['marked_title'].apply(lambda x:tokenizer.tokenize(x))
# Map the token strings to their vocabulary indeces.
select_column_data['indexed_tokens']= select_column_data['title_tokens'].apply(lambda x:tokenizer.convert_tokens_to_ids(x),axis=1)

# # Display the words with their indeces.
# for tup in zip(title, indexed_tokens):
#     print('{:<12} {:>6,}'.format(tup[0], tup[1]))

In [None]:
# create embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(tx_pandas.TransactionInfo, show_progress_bar=True)

In [30]:
def segment(x):
    return len(x) * 1

select_column_data['segment_id'] = select_column_data['title_tokens'].apply(lambda x:segment(x))

KeyError: 'title_tokens'

Running Bert on Text 

In [None]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor(['indexed_tokens'])
segments_tensors = torch.tensor(['segment_id'])



ValueError: too many dimensions 'str'

In [None]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

print (segments_ids)

NameError: name 'tokenized_text' is not defined

In [None]:
select_column_data['title_tokens'].head(5)


2     [Survey, and, critique, of, techniques, for, e...
4     [Visual, ##izing, and, Understanding, Con, ##v...
5     [Human, -, in, -, the, -, loop, Extra, ##ction...
8     [ex, ##p, ##l, ##A, ##I, ##ner, :, A, Visual, ...
10    [Ex, ##p, ##lana, ##tion, in, Human, -, AI, Sy...
Name: title_tokens, dtype: object

In [None]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


## References