In [None]:
!pip install pypdf langgraph langchain langchain-google-genai geopy requests langchain_experimental



In [None]:
import os, getpass
def _set_env(var: str):
    # if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")
_set_env("GEMINI_KEY")

GEMINI_KEY: ··········


# Task 1

In [None]:
import pypdf

PDF_FILE = "ukpga_20250022_en.pdf"
OUTPUT_FILE = "pdf_text_output.txt"

reader = pypdf.PdfReader(PDF_FILE)
all_text = ""

for i, page in enumerate(reader.pages[:-1]):  # skip last page
    try:
        text = page.extract_text()
        if text:
            all_text += text + "\n\n"
        print(f"Extracted page {i+1}/{len(reader.pages)-1}")
    except Exception as e:
        print(f"Error on page {i+1}: {e}")
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write(all_text)

print("\nDone! Extracted text saved to:", OUTPUT_FILE)


Extracted page 1/20
Extracted page 2/20
Extracted page 3/20
Extracted page 4/20
Extracted page 5/20
Extracted page 6/20
Extracted page 7/20
Extracted page 8/20
Extracted page 9/20
Extracted page 10/20
Extracted page 11/20
Extracted page 12/20
Extracted page 13/20
Extracted page 14/20
Extracted page 15/20
Extracted page 16/20
Extracted page 17/20
Extracted page 18/20
Extracted page 19/20
Extracted page 20/20

Done! Extracted text saved to: pdf_text_output.txt


In [None]:
import re
cleaned_text = re.sub(r'^\s*\d+\s*$', '', all_text, flags=re.MULTILINE) #lines that contain only numbers
lines = cleaned_text.split('\n')
from collections import Counter
line_counts = Counter(lines)
repeated_lines = [line for line, count in line_counts.items() if count > 10 and line.strip() != '']
lines = [line for line in lines if line not in repeated_lines]
print(repeated_lines)
cleaned_text = '\n'.join(lines)
print(cleaned_text)

[]
Universal Credit Act 

CHAPTER 22 
Explanatory Notes have been produced to assist in the 
understanding of this Act and are available separately 
£8.90

Universal Credit Act 2025 
CHAPTER 22 
CONTENTS 
Universal credit 
1 Standard allowance for tax years 2026-27 to 2029-30 
2 LCWRA element for tax year 2026-27 
3 Freeze of LCWRA and LCW elements for tax years 2026-27 to 2029-30 
4 Protected LCWRA amount for tax years 2026-27 to 2029-30 
5 Legacy employment and support allowance payments 
Corresponding provision for Northern Ireland 
6 Corresponding provision for Northern Ireland 
Short title 
7 Short title 
Amendments to the Universal Credit Regulations 2013 in 
connection with new amounts of the LCWRA element 
Schedule 1 —  
Northern Ireland: corresponding provision Schedule 2 —  

c. 22 CHARLES III 
Universal Credit Act 2025 
2025 CHAPTER 22 
An Act to make provision to alter the rates of the standard allowance, 
limited capability for work element and limited capability for work 

In [None]:
file_path = "task1.txt"
with open(file_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)
print(f"Cleaned text saved to {file_path}")


Cleaned text saved to task1.txt


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
    model= "gemini-2.5-pro",
    temperature=1.0,
    max_retries=1,
    google_api_key=os.environ["GEMINI_KEY"],
)

In [70]:
prompt = f"""Give this in a structure format Make sure no information is lost.

{cleaned_text}
"""
response = llm.invoke(prompt)
print(response)

content='Here is the Universal Credit Act 2025 presented in a structured format, ensuring all information from the provided text is retained.\n\n***\n\n### **Act Identification**\n\n*   **Act Title:** Universal Credit Act 2025\n*   **Chapter:** 22\n*   **Reign:** CHARLES III\n*   **Explanatory Notes:** Available separately to assist in understanding the Act.\n*   **Price:** £8.90\n*   **Date of Royal Assent:** 3rd September 2025\n*   **Long Title:** An Act to make provision to alter the rates of the standard allowance, limited capability for work element and limited capability for work and work-related activity element of universal credit and the rates of income-related employment and support allowance.\n\n### **Enacting Formula**\n\nBE IT ENACTED by the King’s most Excellent Majesty, by and with the advice and consent of the Lords Spiritual and Temporal, and Commons, in this present Parliament assembled, and by the authority of the same, as follows:—\n\n---\n\n### **Table of Contents*

In [72]:
with open("structuredtask1.txt", "w", encoding="utf-8") as f:
    f.write(response.content)

# Task 2

In [None]:
prompt = f"""Summarize the entire Universal Credit Act 2025 in 5–10 bullet points, focusing on points like: Purpose, Key definitions, Eligibility. Obligations, Enforcement elements, etcetera

{cleaned_text}
"""
response = llm.invoke(prompt)

In [None]:
summary_text = response.content
print(summary_text)

Based on the provided text, here is a summary of the Universal Credit Act 2025:

*   **Purpose:** The Act's main purpose is to alter the payment rates for several key components of Universal Credit (UC) and the legacy income-related Employment and Support Allowance (ESA) for the tax years running from 2026-27 to 2029-30.

*   **Standard Allowance Uprating:** It places an obligation on the Secretary of State to increase the UC standard allowance annually. This increase is calculated by taking the previous year's amount, adding the Consumer Price Index (CPI) inflation rate, and then adding a further specific "uplift percentage" which is set in the Act for each of the four years.

*   **Two-Tier Sickness & Disability Element:** The Act fundamentally changes the Limited Capability for Work and Work-Related Activity (LCWRA) element by creating two different payment rates. A new, lower rate (£217.26) is introduced for most claimants who qualify for the LCWRA element on or after April 6, 2026

In [None]:
with open("task2.txt", "w", encoding="utf-8") as file:
    file.write(summary_text)

# Task 3

In [None]:
from pydantic import BaseModel, Field

class LegalExtraction(BaseModel):
    definitions: str = Field(default="", description="Key terms and their meanings")
    obligations: str = Field(default="", description="Duties for beneficiaries or others")
    responsibilities: str = Field(default="", description="Responsibilities of the administering authority")
    eligibility: str = Field(default="", description="Who can receive Universal Credit")
    payments: str = Field(default="", description="Entitlement and calculation rules")
    penalties: str = Field(default="", description="Enforcement measures and penalties")
    record_keeping: str = Field(default="", description="Reporting and documentation requirements")

model_with_structure = llm.with_structured_output(LegalExtraction)

prompt = f"""Extract the required information from the legal text.

If a field is not explicitly supported by information in the text, leave it empty.
Do NOT hallucinate or infer beyond the text.

Text to extract from:
{cleaned_text}
"""

result = model_with_structure.invoke(prompt)

In [None]:
import json
json_output = json.dumps(result.model_dump(), indent=4, ensure_ascii=False)
print(json_output)


{
    "definitions": "\"consumer prices index\": the all items consumer prices index published by the Statistics Board.\n\"pre-2026 claimant\": a claimant with limited capability for work and work-related activity who was entitled to an award of universal credit that included the LCWRA element before 6 April 2026 and has been continuously entitled since.\n\"relevant CPI percentage\": the percentage by which the consumer prices index for the September before the start of the tax year is higher than it was for the September before that (or 0% if it is not higher).\n\"relevant evidence\": evidence to suggest that a determination was made in ignorance of, or based on a mistake as to, some material fact, or that there has been a relevant change of circumstances in relation to the claimant’s physical or mental condition.\n\"relevant power\": the power in section 9(2) of the Welfare Reform Act 2012, or the power in section 150(2)(b) of the Social Security Administration Act 1992.\n\"severe co

In [None]:
json_output = json.dumps(result.model_dump(), indent=4, ensure_ascii=False)
with open("task3.json", "w", encoding="utf-8") as f:
    f.write(json_output)


# Task 4

In [None]:
from pydantic import BaseModel, Field

class RuleCheck(BaseModel):
    rule: str = Field(..., description="The rule being checked")
    status: str = Field(..., description="pass or fail")
    evidence: str = Field(..., description="Short reference/evidence from the Act")
    confidence: int = Field(..., description="Confidence in percentage (0-100)")


In [None]:
from pydantic import BaseModel, Field
from typing import List

class RuleCheck(BaseModel):
    rule: str = Field(..., description="The rule being checked")
    status: str = Field(..., description="pass or fail")
    evidence: str = Field(..., description="Short reference/evidence from the Act")
    confidence: int = Field(..., description="Confidence in percentage (0-100)")

class RuleCheckList(BaseModel):
    rules: List[RuleCheck]

model_with_rules = llm.with_structured_output(RuleCheckList)

rules_to_check = [
    "Act must define key terms",
    "Act must specify eligibility criteria",
    "Act must specify responsibilities of the administering authority",
    "Act must include enforcement or penalties",
    "Act must include payment calculation or entitlement structure",
    "Act must include record-keeping or reporting requirements"
]

prompt = f"""
Check each of the following rules against the provided legal text.

Rules:
{rules_to_check}

Legal text:
{cleaned_text}

For each rule, return:
- rule: The rule text
- status: pass or fail
- evidence: Short reference from the text (section, clause)
- confidence: 0-100, how confident you are

Do NOT hallucinate. If not mentioned, mark as fail.
"""

rules_result = model_with_rules.invoke(prompt)

In [None]:
rules_dict = rules_result.model_dump()
json_file_path = "task4.json"
with open(json_file_path, "w", encoding="utf-8") as f:
    json.dump(rules_dict, f, indent=4, ensure_ascii=False)