In [1]:
import pandas as pd

In [7]:
# Load the documents table; later cells rely on its repo, file_path and raw_text columns.
df = pd.read_csv('docs_with_text.csv')

In [9]:
# Keep only the rows that belong to the bluexp-automation-main repo.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df['repo'] == 'bluexp-automation-main'].copy()

In [10]:
# Preview the first rows of the repo-filtered frame.
df.head()

Unnamed: 0,doc_id,file_path,repo,title,headings,raw_text,summary,keywords,permalink
3540,aa_concepts,./bluexp-automation-main/platform/aa_concepts....,bluexp-automation-main,Basic concepts and terminology,"['OAuth 2.0 authorization framework', 'OpenID ...",sidebar: sidebar You should be familiar with t...,You should be familiar with the basic authenti...,"['bluexp', 'authorization', 'authentication', ...",platform/aa_concepts.html
3541,create_user_token,./bluexp-automation-main/platform/create_user_...,bluexp-automation-main,Create user token,['Create a user token with federated authentic...,sidebar: sidebar You need to generate a bearer...,Create a BlueXP user token by using this workflow,"['generate', 'user', 'token', 'bluexp', 'rest'...",platform/create_user_token.html
3542,http_details,./bluexp-automation-main/platform/http_details...,bluexp-automation-main,HTTP details,"['Request', 'HTTP methods', 'Request headers',...",sidebar: sidebar BlueXP REST APIs are based on...,The BlueXP REST API is based on the HTTP proto...,"['http', 'details', 'bluexp', 'rest', 'api', '...",platform/http_details.html
3543,get_identifiers,./bluexp-automation-main/platform/get_identifi...,bluexp-automation-main,Get required identifiers,"['Get the Connector identifier', 'Get the acco...",sidebar: sidebar You can access the BlueXP web...,You can sign into the BlueXP web user interfac...,"['client', 'id', 'account', 'identifiers', 'cl...",platform/get_identifiers.html
3544,workflows_tasks,./bluexp-automation-main/platform/workflows_ta...,bluexp-automation-main,Workflow processes and tasks,"['Workflows', 'Base URLs and REST endpoint pat...",sidebar: sidebar The BlueXP REST APIs support ...,The BlueXP platform REST APIs support many dif...,"['workflows', 'processes', 'bluexp', 'api', 'c...",platform/workflows_tasks.html


### Basic named entity recognition with spaCy

In [12]:
import spacy

# Load the en_core_web_sm spaCy pipeline once; it is kept in the global `nlp`
# and reused by extract_named_entities for every document.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Entity labels to keep; entities with any other label are discarded.
target_labels = {"ORG", "PRODUCT", "LOC", "GPE"}

def extract_named_entities(text, labels=target_labels):
    """Return the (entity text, label) pairs found in ``text`` by ``nlp``.

    Args:
        text: Document body to analyse. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) and empty strings
            yield an empty list instead of being passed to the pipeline.
        labels: Collection of entity labels to keep; defaults to
            ``target_labels``.

    Returns:
        A list of ``(ent.text, ent.label_)`` tuples, in the order of
        ``doc.ents``, restricted to entities whose label is in ``labels``.
    """
    # Guard before calling the pipeline: spaCy expects string input, so a
    # missing value would otherwise abort the whole DataFrame.apply().
    if not isinstance(text, str) or not text:
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in labels]

In [16]:
# Run the NER helper on every document body; each cell holds a list of (text, label) tuples.
df["named_entities"] = df["raw_text"].apply(extract_named_entities)

In [18]:
from collections import Counter

def is_valid_entity(text):
    """Heuristically decide whether an entity string is worth keeping.

    An entity is rejected when any of the following holds:
      * it is 40 characters or longer,
      * ``str.isupper()`` is true for it (all cased characters upper-case),
      * it starts with "TA" or "eyJ",
      * it contains "=", "." or "/".

    NOTE(review): the "eyJ" / "=" rules presumably drop token-like strings
    and the "." / "/" rules path- or URL-like strings -- confirm the intent.

    Returns:
        True when none of the rejection rules match, False otherwise.
    """
    if len(text) >= 40 or text.isupper():
        return False
    if text.startswith(("TA", "eyJ")):
        return False
    return not any(ch in "=./" for ch in text)

def count_named_entities(entities):
    """Count how often each valid (text, label) pair occurs.

    Args:
        entities: Iterable of ``(text, label)`` tuples.

    Returns:
        A list of ``(text, label, count)`` tuples, in order of first
        occurrence. Entity text is whitespace-stripped; entries that are
        blank after stripping or fail ``is_valid_entity`` are dropped.
    """
    counter = Counter()
    for raw_text, label in entities:
        # Strip BEFORE validating so the filter and the counter see the same
        # string; otherwise leading whitespace would bypass the prefix rules
        # and whitespace-only entities would be counted as "".
        text = raw_text.strip()
        if text and is_valid_entity(text):
            counter[(text, label)] += 1
    return [(text, label, count) for (text, label), count in counter.items()]

# Filter and count each document's entities into (text, label, count) tuples.
df["named_entities_with_counts"] = df["named_entities"].apply(count_named_entities)


In [20]:
# Print the filtered entity counts, one block per document.
# NOTE: the "filename" column is only added to df in a later cell, so on a
# fresh top-to-bottom run the heading falls back to "Document <index>".
for doc_index, record in df.iterrows():
    heading = record.get("filename", f"Document {doc_index}")
    report = [str(heading)]
    report.extend(
        f"  - {name} ({kind}) x{times}"
        for name, kind, times in record["named_entities_with_counts"]
    )
    print("\n".join(report))
    print()  # blank line between docs


Document 3540
  - OpenID (ORG) x1
  - NetApp (ORG) x5
  - Tokens (ORG) x1

Document 3541
  - NetApp (ORG) x2
  - Token (ORG) x1

Document 3542
  - |`202` |Accepted |The request (ORG) x1
  - |`204` |No Content (ORG) x1
  - BlueXP (GPE) x1

Document 3543
  - NetApp (ORG) x2
  - suffix (GPE) x1

Document 3544
  - NetApp (ORG) x1
  - OAuth2 (ORG) x1

Document 3545
  - Federated (ORG) x2
  - |Contains (ORG) x1
  - NetApp (ORG) x1

Document 3546
  - Representational State Transfer (ORG) x1
  - Connectors (ORG) x1
  - BlueXP (GPE) x1

Document 3547
  - NetApp (ORG) x2
  - UI The BlueXP (ORG) x1

Document 3548
  - NetApp Cloud (PRODUCT) x1
  - Select *Create Service Account (ORG) x1

Document 3549
  - the API Explorer (PRODUCT) x1
  - NetApp (ORG) x4
  - Token (ORG) x1
  - Tenancy (ORG) x1
  - API Explorer (ORG) x2
  - NetApp Support Site (PRODUCT) x1

Document 3550
  - Registering (PRODUCT) x1
  - Microsoft (ORG) x1
  - NetApp (ORG) x1
  - Microsoft Azure Active Directory (ORG) x1

Document 3

### Try with model "en_core_web_trf"

In [21]:
# Load the en_core_web_trf pipeline to compare its entities with the en_core_web_sm results.
nlp_trf = spacy.load("en_core_web_trf")

In [22]:
def extract_named_entities_trf(text, labels=target_labels):
    """Return the (entity text, label) pairs found in ``text`` by ``nlp_trf``.

    Args:
        text: Document body to analyse. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) and empty strings
            yield an empty list instead of being passed to the pipeline.
        labels: Collection of entity labels to keep; defaults to
            ``target_labels``.

    Returns:
        A list of ``(ent.text, ent.label_)`` tuples, in the order of
        ``doc.ents``, restricted to entities whose label is in ``labels``.
    """
    # Guard before calling the pipeline: spaCy expects string input, so a
    # missing value would otherwise abort the whole DataFrame.apply().
    if not isinstance(text, str) or not text:
        return []
    doc = nlp_trf(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in labels]

# Run the transformer-based NER helper on every document body.
df["named_entities_trf"] = df["raw_text"].apply(extract_named_entities_trf)


In [23]:
# Apply the same filter-and-count step to the transformer-model entities.
df["named_entities_with_counts_trf"] = df["named_entities_trf"].apply(count_named_entities)


In [24]:
# Print the transformer-model entity counts, one block per document.
# NOTE: the "filename" column is only added to df in a later cell, so on a
# fresh top-to-bottom run the heading falls back to "Document <index>".
for doc_index, record in df.iterrows():
    heading = record.get("filename", f"Document {doc_index}")
    report = [str(heading)]
    report.extend(
        f"  - {name} ({kind}) x{times}"
        for name, kind, times in record["named_entities_with_counts_trf"]
    )
    print("\n".join(report))
    print()  # blank line between docs


Document 3540
  - OpenID Connect (PRODUCT) x2
  - Auth0 (PRODUCT) x5
  - NetApp Cloud (ORG) x2
  - NetApp (ORG) x2
  - NetApp BlueXP (ORG) x1
  - BlueXP (PRODUCT) x1

Document 3541
  - NetApp (ORG) x2

Document 3542
  - BlueXP (PRODUCT) x4
  - Cloud Volumes (PRODUCT) x1

Document 3543
  - BlueXP (PRODUCT) x2
  - Cloud Volumes ONTAP REST API (PRODUCT) x1
  - Connector (PRODUCT) x2
  - BlueXP (ORG) x10
  - NetApp (ORG) x2

Document 3544
  - BlueXP (ORG) x1
  - NetApp (ORG) x1
  - *NetApp BlueXP* service (ORG) x1

Document 3545

Document 3546
  - Cloud Volumes ONTAP (PRODUCT) x1
  - Azure NetApp Files (PRODUCT) x1
  - BlueXP (PRODUCT) x3

Document 3547
  - BlueXP (PRODUCT) x6
  - NetApp (ORG) x2

Document 3548
  - BlueXP (ORG) x5

Document 3549
  - The API Explorer (PRODUCT) x1
  - BlueXP (PRODUCT) x3
  - the API Explorer (PRODUCT) x4
  - NetApp (ORG) x1
  - BlueXP (ORG) x1
  - Tenancy (ORG) x1
  - API Explorer (PRODUCT) x1
  - NetApp BlueXP (ORG) x2
  - NetApp Cloud (ORG) x1
  - NetApp S

In [26]:
# Derive a display filename by removing a leading "./" from file_path
# (the regex is anchored with ^, so only a prefix is stripped).
df["filename"] = df["file_path"].str.replace(r'^\./', '', regex=True)
df.head()


Unnamed: 0,doc_id,file_path,repo,title,headings,raw_text,summary,keywords,permalink,named_entities,named_entities_with_counts,named_entities_trf,named_entities_with_counts_trf,filename
3540,aa_concepts,./bluexp-automation-main/platform/aa_concepts....,bluexp-automation-main,Basic concepts and terminology,"['OAuth 2.0 authorization framework', 'OpenID ...",sidebar: sidebar You should be familiar with t...,You should be familiar with the basic authenti...,"['bluexp', 'authorization', 'authentication', ...",platform/aa_concepts.html,"[(OpenID, ORG), (NetApp, ORG), (NetApp, ORG), ...","[(OpenID, ORG, 1), (NetApp, ORG, 5), (Tokens, ...","[(OpenID Connect, PRODUCT), (Auth0, PRODUCT), ...","[(OpenID Connect, PRODUCT, 2), (Auth0, PRODUCT...",bluexp-automation-main/platform/aa_concepts.adoc
3541,create_user_token,./bluexp-automation-main/platform/create_user_...,bluexp-automation-main,Create user token,['Create a user token with federated authentic...,sidebar: sidebar You need to generate a bearer...,Create a BlueXP user token by using this workflow,"['generate', 'user', 'token', 'bluexp', 'rest'...",platform/create_user_token.html,"[(NetApp, ORG), (Token, ORG), (API, ORG), (Net...","[(NetApp, ORG, 2), (Token, ORG, 1)]","[(NetApp, ORG), (NetApp, ORG)]","[(NetApp, ORG, 2)]",bluexp-automation-main/platform/create_user_to...
3542,http_details,./bluexp-automation-main/platform/http_details...,bluexp-automation-main,HTTP details,"['Request', 'HTTP methods', 'Request headers',...",sidebar: sidebar BlueXP REST APIs are based on...,The BlueXP REST API is based on the HTTP proto...,"['http', 'details', 'bluexp', 'rest', 'api', '...",platform/http_details.html,"[(API, ORG), (JSON, GPE), (|`202` |Accepted |T...","[(|`202` |Accepted |The request, ORG, 1), (|`2...","[(BlueXP, PRODUCT), (BlueXP, PRODUCT), (Cloud ...","[(BlueXP, PRODUCT, 4), (Cloud Volumes, PRODUCT...",bluexp-automation-main/platform/http_details.adoc
3543,get_identifiers,./bluexp-automation-main/platform/get_identifi...,bluexp-automation-main,Get required identifiers,"['Get the Connector identifier', 'Get the acco...",sidebar: sidebar You can access the BlueXP web...,You can sign into the BlueXP web user interfac...,"['client', 'id', 'account', 'identifiers', 'cl...",platform/get_identifiers.html,"[(UI, ORG), (NetApp, ORG), (suffix, GPE), (Net...","[(NetApp, ORG, 2), (suffix, GPE, 1)]","[(BlueXP, PRODUCT), (BlueXP, PRODUCT), (Cloud ...","[(BlueXP, PRODUCT, 2), (Cloud Volumes ONTAP RE...",bluexp-automation-main/platform/get_identifier...
3544,workflows_tasks,./bluexp-automation-main/platform/workflows_ta...,bluexp-automation-main,Workflow processes and tasks,"['Workflows', 'Base URLs and REST endpoint pat...",sidebar: sidebar The BlueXP REST APIs support ...,The BlueXP platform REST APIs support many dif...,"['workflows', 'processes', 'bluexp', 'api', 'c...",platform/workflows_tasks.html,"[(API, ORG), (ONTAP, ORG), (API, ORG), (NetApp...","[(NetApp, ORG, 1), (OAuth2, ORG, 1)]","[(BlueXP, ORG), (NetApp, ORG), (*NetApp BlueXP...","[(BlueXP, ORG, 1), (NetApp, ORG, 1), (*NetApp ...",bluexp-automation-main/platform/workflows_task...


In [37]:
# Spot-check the transformer-model entity counts for the row at position 3.
df.iloc[3]["named_entities_with_counts_trf"]



[('BlueXP', 'PRODUCT', 2),
 ('Cloud Volumes ONTAP REST API', 'PRODUCT', 1),
 ('Connector', 'PRODUCT', 2),
 ('BlueXP', 'ORG', 10),
 ('NetApp', 'ORG', 2)]

In [38]:
def format_entities(entities):
    """Render (text, label, count) triples as one " | "-separated string.

    Each triple becomes ``"<text> (<label>) x<count>"``; an empty input
    produces an empty string.
    """
    parts = []
    for name, kind, times in entities:
        parts.append(f"{name} ({kind}) x{times}")
    return " | ".join(parts)

# Build a one-line, human-readable entity summary per document from the transformer-model counts.
df["entity_summary"] = df["named_entities_with_counts_trf"].apply(format_entities)


In [40]:
# Spot-check the formatted summary for the row at position 3.
df.iloc[3]['entity_summary']

'BlueXP (PRODUCT) x2 | Cloud Volumes ONTAP REST API (PRODUCT) x1 | Connector (PRODUCT) x2 | BlueXP (ORG) x10 | NetApp (ORG) x2'

In [41]:
# Export only filename and entity_summary; index=False leaves the DataFrame index out of the CSV.
df[["filename", "entity_summary"]].to_csv("entity_summary.csv", index=False)
