In [20]:
# chapter 1 
from bs4 import BeautifulSoup
import requests
import re

In [80]:
def removePageTags(html):
    """Return the visible text of an HTML document as one space-joined string.

    Every <style> and <script> element is removed from the parsed tree first,
    so their contents never appear in the result.
    """
    parsedPage = BeautifulSoup(html, "html.parser")
    unwantedTags = parsedPage.find_all(['style', 'script'])
    for tag in unwantedTags:
        tag.decompose()
    visibleChunks = list(parsedPage.stripped_strings)
    return ' '.join(visibleChunks)

def getChapterNumbers(part):
    """Return the chapter identifiers listed on a Title 18 part page.

    Downloads the uscode.house.gov page for the given part and reads every
    <h3 class="chapter-head"> heading. The identifier is the first run of
    digits in the heading, optionally followed by one letter (e.g. "11A").

    Args:
        part: Part number; converted with str() and spliced into the URL.

    Returns:
        A list of identifier strings in page order. Headings that contain no
        number are skipped.
    """
    partUrl = "https://uscode.house.gov/view.xhtml?req=granuleid%3AUSC-prelim-title18-part" + str(part) + "&saved=L3ByZWxpbUB0aXRsZTE4L3BhcnQxL2NoYXB0ZXIx%7CZ3JhbnVsZWlkOlVTQy1wcmVsaW0tdGl0bGUxOC1jaGFwdGVyMQ%3D%3D%7C%7C%7C0%7Cfalse%7Cprelim&edition=prelim"
    # Same 20 s timeout as the other requests in this notebook; without one,
    # requests.get can block forever on a stalled connection.
    partHtml = requests.get(partUrl, timeout=20)
    soup = BeautifulSoup(partHtml.content, "html.parser")
    chapterNumberList = []
    for chapter in soup.find_all("h3", class_="chapter-head"):
        match = re.search(r'\d+[a-zA-Z]?', chapter.text)
        if match is None:
            # A heading without a number cannot be turned into a chapter URL.
            continue
        chapterNumberList.append(match.group(0))
    return chapterNumberList
        

def getChapterSectionsTitles(soup):
    """Collect the text of every section heading that is not marked repealed.

    Section headings are the <h3 class="section-head"> elements of the parsed
    page. A heading is dropped when its text contains "repealed" in any
    letter case.
    """
    sectionHeadings = soup.find_all("h3", class_="section-head")
    return [
        heading.text
        for heading in sectionHeadings
        if "repealed" not in heading.text.lower()
    ]

def sectionInformation(chapterNumber, sectionNumber, legalSectionInformation):
    """Download one Title 18 section and store its text in the given dict.

    The page's <title> text becomes the key. The value is the text of every
    <p> whose class matches "statutory-body", each followed by a newline.

    Args:
        chapterNumber: Chapter identifier spliced into the URL path.
        sectionNumber: Section identifier spliced into the granule id.
        legalSectionInformation: Dict updated in place with title -> body.

    Returns:
        None. The result is written into legalSectionInformation.
    """
    sectionUrl = "https://uscode.house.gov/view.xhtml?hl=false&edition=prelim&path=%2Fprelim%40title18%2Fpart1%2Fchapter" + str(chapterNumber) + "&req=granuleid%3AUSC-prelim-title18-section" + str(sectionNumber) + "&num=0&saved=L3ByZWxpbUB0aXRsZTE4L3BhcnQxL2NoYXB0ZXIx%7CZ3JhbnVsZWlkOlVTQy1wcmVsaW0tdGl0bGUxOC1jaGFwdGVyMQ%3D%3D%7C%7C%7C0%7Cfalse%7Cprelim"
    sectionHtml = requests.get(sectionUrl, timeout=20)
    soup = BeautifulSoup(sectionHtml.content, "html.parser")

    # soup.title is None when the page has no <title>, and .string is None
    # when the tag is empty or has nested children; both would otherwise
    # crash or produce a None key.
    titleTag = soup.title
    titleText = titleTag.string if titleTag is not None else None
    if titleText:
        # Copy into a plain str: a NavigableString keeps a reference to the
        # whole parse tree, so using it as a long-lived dict key would keep
        # every scraped page in memory.
        sectionTitle = str(titleText)
    else:
        # Synthesized fallback key so the section text is still recorded.
        sectionTitle = f"18 USC {sectionNumber}"

    sectionDescription = soup.find_all("p", class_=re.compile(r'statutory-body'))
    legalSectionInformation[sectionTitle] = "".join(
        item.text + "\n" for item in sectionDescription
    )

def getRelevantChapterContent(chapterNumber, legalSectionInformation):
    """Scrape every non-repealed section of one Title 18 chapter.

    Downloads the chapter page, takes the section headings returned by
    getChapterSectionsTitles, extracts each section number (digits plus an
    optional trailing letter) and passes it to sectionInformation.

    Args:
        chapterNumber: Chapter identifier spliced into the chapter URL.
        legalSectionInformation: Dict handed on to sectionInformation, which
            fills it in place.

    Returns:
        None.
    """
    chapterUrl = "https://uscode.house.gov/view.xhtml?path=/prelim@title18/part1/chapter" + str(chapterNumber) + "&edition=prelim"
    chapterHtml = requests.get(chapterUrl, timeout=20)
    soup = BeautifulSoup(chapterHtml.content, "html.parser")
    for sectionTitle in getChapterSectionsTitles(soup):
        match = re.search(r'\d+[a-zA-Z]?', sectionTitle)
        if match is None:
            # No section number in this heading; skip it rather than letting
            # one odd heading abort the rest of the chapter.
            continue
        sectionInformation(chapterNumber, match.group(0), legalSectionInformation)

In [84]:
# Scrape Title 18, part 1: list its chapters, then fetch every section of
# each chapter. The dict is filled in place (page title -> section text).
legalSectionInformation = {}
chapterNumbers = getChapterNumbers(1)
for i in chapterNumbers:
    # Progress marker: one line per chapter, printed before its sections load.
    print(f"In Chapter: {i}")
    getRelevantChapterContent(i, legalSectionInformation)

In Chapter: 1
In Chapter: 2
In Chapter: 3
In Chapter: 5
In Chapter: 7
In Chapter: 9
In Chapter: 10
In Chapter: 11
In Chapter: 11A
In Chapter: 11B
In Chapter: 12
In Chapter: 13
In Chapter: 15
In Chapter: 17
In Chapter: 17A
In Chapter: 18
In Chapter: 19
In Chapter: 21
In Chapter: 23
In Chapter: 25
In Chapter: 26
In Chapter: 27
In Chapter: 29
In Chapter: 31
In Chapter: 33
In Chapter: 35
In Chapter: 37
In Chapter: 39
In Chapter: 40
In Chapter: 41
In Chapter: 42
In Chapter: 43
In Chapter: 44
In Chapter: 45
In Chapter: 46
In Chapter: 47
In Chapter: 49
In Chapter: 50
In Chapter: 50A
In Chapter: 51
In Chapter: 53
In Chapter: 55
In Chapter: 57
In Chapter: 59
In Chapter: 61
In Chapter: 63
In Chapter: 65
In Chapter: 67
In Chapter: 68
In Chapter: 69
In Chapter: 71
In Chapter: 73
In Chapter: 74
In Chapter: 75
In Chapter: 77
In Chapter: 79
In Chapter: 81
In Chapter: 83
In Chapter: 84
In Chapter: 85
In Chapter: 87
In Chapter: 88
In Chapter: 89
In Chapter: 90
In Chapter: 90A
In Chapter: 91
In Chapter:

In [88]:
# Dump the scraped sections to a text file: each entry is "<title>:" on one
# line, followed by the section body and a blank separator line.
file_path = 'part1Content.txt'
# Explicit UTF-8 so the file does not depend on the platform's default
# encoding; scraped legal text may contain characters outside it.
with open(file_path, 'w', encoding='utf-8') as file:
    for key, value in legalSectionInformation.items():
        file.write(f"{key}:\n{value}\n")

In [137]:
import google.generativeai as genai
import os
from dotenv import load_dotenv, dotenv_values 
from langchain_community.llms import Ollama

def generateGeminiResponse(extractedText, modelName="mistral:7b-instruct-q2_K"):
    """Ask an LLM for question/answer pairs about one section of legal text.

    Despite the name, the Gemini calls are commented out; the prompt is sent
    to a model served through Ollama.

    Args:
        extractedText: Section text inserted into the prompt under "Text:".
        modelName: Ollama model tag; defaults to the model this notebook
            originally hard-coded.

    Returns:
        Whatever the model's invoke() returns for the prompt. The prompt asks
        for a Python-style list of dicts with 'input' and 'output' keys.
    """
    # load_dotenv()
    # genai.configure(api_key = os.getenv("GEMINI_KEY"))

    # The example output must itself be a valid Python literal, because the
    # response is parsed with ast.literal_eval downstream. The apostrophe in
    # "principal's" is therefore backslash-escaped inside its single-quoted
    # string ("\\'" in this f-string renders as "\'" in the prompt).
    prompt = f"""
        Text:
        {extractedText}

        Instruction:
        For the section provided in the text, do the following:
        1. Generate a question based only on the section's content.
        2. Provide the answer to the question, based strictly on the section's information.
        3. Format the response as a list (not JSON), where each item is a dictionary with:
            - A key "input" containing the generated question.
            - A key "output" containing the corresponding answer.

        **Do not include any other formatting** like JSON syntax (i.e., no `"` or `,` at the end of lines) and do not include any additional text or explanations. Only return the list of dictionaries.

        Example output:
        [
            {{
                'input': 'What is the maximum penalty for being an accessory after the fact under Section 3?',
                'output': 'An accessory after the fact can be imprisoned for up to half of the principal\\'s punishment, or up to 15 years if the principal is punishable by life imprisonment or death.'
            }},
            {{
                'input': 'What is defined as "United States" in Section 5 of Title 18?',
                'output': 'The term "United States" includes all places and waters subject to U.S. jurisdiction, except the Canal Zone.'
            }}
        ]
    """

    # model = genai.GenerativeModel("gemini-1.5-flash")
    # llmResponse = model.generate_content(prompt)
    model = Ollama(model=modelName)
    llmResponse = model.invoke(prompt)
    return llmResponse

In [140]:
import ast 
import torch

# Pick a torch device, preferring Apple's MPS backend when it is available.
# NOTE(review): mps_device is not referenced by any other cell visible here.
if torch.backends.mps.is_available():
    print("mps used")
    mps_device = torch.device("mps")
else:
    mps_device = torch.device("cpu")


# Build the instruction-tuning dataset: one model call per scraped section,
# each expected to return a Python-literal list of {'input', 'output'} dicts.
instructionTuneDataset = []
for key, sectionText in legalSectionInformation.items():
    sectionContent = f"Title: {key}\nContent: {sectionText}\n"
    llmResponse = generateGeminiResponse(sectionContent)
    try:
        # literal_eval only accepts Python literals, so model output is never
        # executed; free-form model text is not guaranteed to parse, though.
        parsedResponse = ast.literal_eval(llmResponse.strip())
    except (ValueError, SyntaxError, TypeError) as err:
        # One malformed answer must not throw away the whole run.
        print(f"Skipping {key!r}: unparseable model response ({err})")
        continue
    if not isinstance(parsedResponse, list):
        print(f"Skipping {key!r}: expected a list, got {type(parsedResponse).__name__}")
        continue
    # Keep only dictionary items; anything else is not a question/answer pair.
    instructionTuneDataset.extend(
        item for item in parsedResponse if isinstance(item, dict)
    )

mps used


KeyboardInterrupt: 

In [141]:
# Display the question/answer pairs collected so far.
instructionTuneDataset

[{'input': 'What is the definition of a principal under Section 2 of Title 18?',
  'output': 'A principal under Section 2 of Title 18 is an individual who commits an offense against the United States or assists in its commission, or causes an act to be done which if directly performed by him or another would be an offense against the United States.'},
 {'input': 'Is there a distinction between being an accessory after the fact and aiding and abetting under Section 2 of Title 18?',
  'output': 'No, there is no distinction between being an accessory after the fact and aiding and abetting under Section 2 of Title 18. Both are punishable as accessories.'},
 {'input': 'What is the punishment for being an accessory after the fact under Section 3?',
  'output': "An accessory after the fact can be imprisoned for up to half of the principal's punishment, or up to 15 years if the principal is punishable by life imprisonment or death."},
 {'input': 'What is the penalty for misprision of felony?',