In [None]:
!pip install transformers torch pandas tqdm



In [2]:
import pandas as pd
import torch
from transformers import pipeline
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("../data/constitution_chunks.csv")
df.head()

Unnamed: 0,sentence
0,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST..."
1,"In this edition, the text of the Constitution ..."
2,The foot notes below the text indicate the Con...
3,The Constitution (One Hundredth Amendment) Act...
4,The Constitution (Application to Jammu and Kas...


NER pipeline (HuggingFace)

In [4]:
device = 0 if torch.cuda.is_available() else -1

ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=device
)





Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [5]:
#Run NER on Constitution sentences (batched)
ner_results = []

for sentence in tqdm(df["sentence"], desc="Running NER"):
    try:
        entities = ner_pipeline(sentence)
        for ent in entities:
            ner_results.append({
                "sentence": sentence,
                "entity": ent["word"],
                "label": ent["entity_group"],
                "score": ent["score"]
            })
    except Exception:
        continue

    #robust to rare tokenizer errors
#safe for long legal sentences


Running NER: 100%|██████████| 3766/3766 [11:09<00:00,  5.62it/s]  


In [7]:

ner_df = pd.DataFrame(ner_results)
ner_df.head(10)

##we convert ner out to df

Unnamed: 0,sentence,entity,label,score
0,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",IN,LOC,0.998268
1,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",##DIA,LOC,0.81135
2,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",IN,LOC,0.997714
3,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",##DIA,LOC,0.771135
4,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",Constitution of India,MISC,0.902658
5,"In this edition, the text of the Constitution ...",Constitution of India,MISC,0.991984
6,"In this edition, the text of the Constitution ...",Constitution,MISC,0.988512
7,"In this edition, the text of the Constitution ...",One Hundred and Sixth Amendment,MISC,0.908322
8,"In this edition, the text of the Constitution ...",Act,MISC,0.933118
9,The foot notes below the text indicate the Con...,Constitution Amendment Acts,MISC,0.996675


In [8]:
ner_df.to_csv("../data/constitution_ner.csv", index=False)

#save to csv


In [9]:
ner_df["label"].value_counts()
ner_df.head()

Unnamed: 0,sentence,entity,label,score
0,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",IN,LOC,0.998268
1,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",##DIA,LOC,0.81135
2,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",IN,LOC,0.997714
3,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",##DIA,LOC,0.771135
4,"£ÉÉ®iÉ BÉEÉ ºÉÆÉÊ´ÉvÉÉxÉ [1 , 2024 ] THE CONST...",Constitution of India,MISC,0.902658


In [10]:
(
    ner_df.groupby(["entity", "label"])
    .size()
    .sort_values(ascending=False)
    .head(15)
)


entity               label
Act                  MISC     803
Constitution         MISC     733
INDIA                LOC      322
Legislature          ORG      256
India                LOC      254
House                ORG      199
Union                ORG      190
Supreme Court        ORG      182
Schedule             MISC     139
House of the People  ORG      127
High Court           ORG      123
Seventh Amendment    MISC     115
State                LOC      113
States               LOC      109
Union                LOC      100
dtype: int64

In [11]:
sample = ner_df.sample(10, random_state=42)
sample

Unnamed: 0,sentence,entity,label,score
1112,"Vacation and resignation of, and removal from,...",House of the People,ORG,0.998629
1683,Transfer of certain cases.—2[(1) Where cases i...,High Courts,ORG,0.874777
4342,(2) Two or more States may agree that there sh...,Public Service Commission,ORG,0.998824
1498,(7) No person who has held office as a Judge o...,Supreme Court,ORG,0.991557
2588,"The bracket and figure ""(1)"" omitted by the Co...",Constitution,MISC,0.967967
5425,(3) Every Proclamation under this article shal...,House of the People,ORG,0.998557
5905,(2) If the concurrence of the Government of th...,State,ORG,0.633074
1479,"Union of Indiain itsjudgment dated 16-10-2015,...",Union of India,ORG,0.998853
8118,Paragraph 20BB has been inserted in its applic...,Tripura,LOC,0.998595
3272,Part not to apply to certain areas.—(1) Nothin...,Scheduled Areas,MISC,0.926576


In [12]:
sample = ner_df.sample(10, random_state=42)
sample
for _, row in sample.iterrows():
    print(f"Sentence: {row['sentence']}")
    print(f"Entity: {row['entity']}, Label: {row['label']}, Score: {row['score']:.4f}")
    print("-" * 50)

Sentence: Vacation and resignation of, and removal from, the offices of Speaker and Deputy Speaker.—A member holding office as Speaker or Deputy Speaker of the House of the People— (a)shall vacate his office if he ceases to be a member of the House of the People; (b)may at any time, by writing under his hand addressed, if such member is the Speaker, to the Deputy Speaker, and if such member is the Deputy Speaker, to the Speaker, resign his office; and (c)may be removed from his office by a resolution of the House of the People passed by a majority of all the then members of the House: Provided that no resolution for the purpose of clause (c)shall be moved unless at least fourteen days’ notice has been given of the intention to move the resolution: Provided further that, whenever the House of the People is dissolved, the Speaker shall not vacate his office until immediately before the first meeting of the House of the People after the dissolution.
Entity: House of the People, Label: ORG

Constitution emphasizes governance structures.

Certain institutions (Parliament, Supreme Court) dominate, indicating institutional focus over individuals.