### *Survey Paper:* What is the Current Research on the Ethics of AI

#### Step 1: Pull Data From:
 * https://aclanthology.org/
 * https://www.institutional.org/institutional-books

In [32]:
from acl_anthology import Anthology
# Build the Anthology handle used later to iterate over papers.
# NOTE(review): from_repo() presumably fetches/updates a local copy of the
# ACL Anthology data repository on first use, so this cell may need network
# access and take a while -- confirm against the acl_anthology docs.
anthology = Anthology.from_repo()



#### Step 2: Analysis of NLP Papers

##### Step 2 Part A: Load In Specter 2 Model

* https://huggingface.co/allenai/specter2

In [33]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from sklearn.metrics.pairwise import euclidean_distances
from typing import List

class Specter2:
    """Thin wrapper pairing a SPECTER2 tokenizer with its adapter-capable model.

    No adapter is loaded or activated here; callers do that on ``self.model``
    before calling :meth:`embed_input`.
    """

    def __init__(self, model_name='allenai/specter2_base'):
        """Load the tokenizer and the model for ``model_name``.

        Args:
            model_name: Checkpoint identifier handed to ``from_pretrained`` for
                both the tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoAdapterModel.from_pretrained(model_name)

    def embed_input(self, text_batch: List[str], batch_size: int = 32):
        """Embed texts and return one vector per input text.

        The texts are tokenized and run through the model in chunks of
        ``batch_size`` so that large collections do not have to fit in a
        single forward pass. The embedding of each text is the hidden state
        at the first token position.

        Args:
            text_batch: Texts to embed; must contain at least one item.
            batch_size: Maximum number of texts per forward pass.

        Returns:
            A 2-D tensor with one row per input text, in input order. The
            tensor carries no autograd history, so it can be converted to
            NumPy directly.

        Raises:
            ValueError: If ``text_batch`` is empty or ``batch_size`` < 1.
        """
        # Local import: torch is not imported at file level in this notebook.
        import torch

        texts = list(text_batch)
        if not texts:
            raise ValueError("text_batch must contain at least one string")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        chunks = []
        # Inference only: without no_grad() every output keeps its autograd
        # graph alive (memory grows with input size) and the result cannot be
        # handed to NumPy/scikit-learn without an explicit detach().
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = self.tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    return_token_type_ids=False,
                    max_length=512,
                )
                output = self.model(**inputs)
                # First-token hidden state is used as the text embedding.
                chunks.append(output.last_hidden_state[:, 0, :])
        return torch.cat(chunks, dim=0)

##### Step 2 Part B: Encode Query

In [34]:
# Instantiate the wrapper with its default checkpoint.
specter2 = Specter2()

# Attach the ad-hoc query adapter under an explicit identifier (load_as)
# and request that it be activated for subsequent forward passes.
specter2.model.load_adapter(
    "allenai/specter2_adhoc_query",
    source="hf",
    load_as="specter2_adhoc_query",
    set_active=True,
)

# Embed the search query; embed_input expects a list of strings.
query = ["Bidirectional transformers"]
query_embedding = specter2.embed_input(query)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 29279.61it/s]
There are adapters available but none are activated for the forward pass.


##### Step 2 Part C: Encode Papers

In [1]:
from itertools import islice

# Optional cap on the number of papers to embed. None means "all papers";
# set it to a small integer for a quick smoke run.
MAX_PAPERS = None

# Load the proximity adapter under its own identifier and request activation.
specter2.model.load_adapter("allenai/specter2", source="hf", load_as="specter2_proximity", set_active=True)

# Pull papers from the anthology. Materialize them into a list so `papers`
# can be iterated again later, in case anthology.papers() is a one-shot iterator.
papers = list(islice(anthology.papers(), MAX_PAPERS))

# Convert each paper into one string: `{title}{sep_token}{abstract}`.
# A missing field must be checked with `is None`: str(None) is the truthy
# string "None", so `str(x) or ""` would embed the literal text "None".
text_papers_batch = [
    ("" if d.title is None else str(d.title))
    + specter2.tokenizer.sep_token
    + ("" if d.abstract is None else str(d.abstract))
    for d in papers
]
paper_embeddings = specter2.embed_input(text_papers_batch)

# L2 distance between every paper and the query. scikit-learn converts its
# inputs to NumPy, which fails on tensors that still track gradients, so
# detach and convert explicitly.
l2_distance = euclidean_distances(
    paper_embeddings.detach().cpu().numpy(),
    query_embedding.detach().cpu().numpy(),
).flatten()

NameError: name 'specter2' is not defined

##### Step 2 Part D: Analyze and Graph Clusters