In [20]:
# chapter 1 
from bs4 import BeautifulSoup
import requests
import re

In [21]:
def removePageTags(html):
    """Return the visible text of an HTML document as one space-separated string.

    Every <style> and <script> element is removed from the parsed tree first,
    so their contents never reach the result.
    """
    parsedPage = BeautifulSoup(html, "html.parser")
    # Calling the soup object with a list of tag names returns the matching elements.
    unwantedTags = parsedPage(['style', 'script'])
    for tag in unwantedTags:
        tag.decompose()
    visibleText = ' '.join(parsedPage.stripped_strings)
    return visibleText

def getChapterNumbers(part):
    """Return the chapter numbers listed on a Title 18 part page of uscode.house.gov.

    Args:
        part: Part number; converted with str() and inserted into the page URL.

    Returns:
        list[str]: One entry per ``<h3 class="chapter-head">`` heading that
        contains a number, e.g. "1" or "11A" (digits plus at most one trailing
        letter). Headings without any number are skipped.

    Raises:
        requests.RequestException: On timeout, connection failure, or an HTTP
            error status.
    """
    partUrl = "https://uscode.house.gov/view.xhtml?req=granuleid%3AUSC-prelim-title18-part" + str(part) + "&saved=L3ByZWxpbUB0aXRsZTE4L3BhcnQxL2NoYXB0ZXIx%7CZ3JhbnVsZWlkOlVTQy1wcmVsaW0tdGl0bGUxOC1jaGFwdGVyMQ%3D%3D%7C%7C%7C0%7Cfalse%7Cprelim&edition=prelim"
    # Same 20 s timeout as the chapter/section fetchers, so a stalled
    # connection cannot hang the notebook indefinitely.
    partHtml = requests.get(partUrl, timeout=20)
    # Fail loudly on an error page instead of silently parsing it into an empty list.
    partHtml.raise_for_status()
    soup = BeautifulSoup(partHtml.content, "html.parser")

    chapterNumberList = []
    for chapter in soup.find_all("h3", class_="chapter-head"):
        # First run of digits, optionally followed by one letter ("11A").
        numberMatch = re.search(r'\d+[a-zA-Z]?', chapter.text)
        if numberMatch is not None:
            chapterNumberList.append(numberMatch.group(0))
    return chapterNumberList
        

def getChapterSectionsTitles(soup):
    """Collect the text of every section heading in a parsed chapter page.

    Headings are the ``<h3 class="section-head">`` elements of ``soup``; any
    heading whose text contains "repealed" (case-insensitive) is left out.
    Returns a list of heading strings in document order.
    """
    headings = soup.find_all("h3", class_="section-head")
    return [
        heading.text
        for heading in headings
        if "repealed" not in heading.text.lower()
    ]

def sectionInformation(chapterNumber, sectionNumber, legalSectionInformation):
    """Fetch one Title 18 section page and store its statutory text.

    Args:
        chapterNumber: Chapter identifier inserted into the URL path
            (the path is fixed to part 1 of Title 18).
        sectionNumber: Section identifier inserted into the granule id.
        legalSectionInformation: Dict updated in place. The key is the page's
            <title> text; the value is the text of every ``<p>`` whose class
            matches "statutory-body", each followed by a newline.

    Returns:
        None. The result is written into ``legalSectionInformation``.

    Raises:
        requests.RequestException: On timeout, connection failure, or an HTTP
            error status.
    """
    sectionUrl = "https://uscode.house.gov/view.xhtml?hl=false&edition=prelim&path=%2Fprelim%40title18%2Fpart1%2Fchapter" + str(chapterNumber) + "&req=granuleid%3AUSC-prelim-title18-section" + str(sectionNumber) + "&num=0&saved=L3ByZWxpbUB0aXRsZTE4L3BhcnQxL2NoYXB0ZXIx%7CZ3JhbnVsZWlkOlVTQy1wcmVsaW0tdGl0bGUxOC1jaGFwdGVyMQ%3D%3D%7C%7C%7C0%7Cfalse%7Cprelim"
    sectionHtml = requests.get(sectionUrl, timeout=20)
    # Do not store the body of an error page as if it were statute text.
    sectionHtml.raise_for_status()
    soup = BeautifulSoup(sectionHtml.content, "html.parser")

    # The page may lack a <title>, or the title may not hold a single string;
    # fall back to a key derived from the section number so the entry is
    # neither lost to an AttributeError nor collapsed under the key None.
    titleTag = soup.title
    sectionTitle = titleTag.string if titleTag is not None else None
    if sectionTitle is None:
        sectionTitle = f"Section {sectionNumber}"

    sectionDescription = soup.find_all("p", class_=re.compile(r'statutory-body'))
    # One paragraph per line, built in a single join instead of repeated +=.
    legalSectionDescription = "".join(item.text + "\n" for item in sectionDescription)
    legalSectionInformation[sectionTitle] = legalSectionDescription

def getRelevantChapterContent(chapterNumber, legalSectionInformation):
    """Download one chapter page of Title 18, part 1 and record each of its sections.

    The section headings come from getChapterSectionsTitles; for every heading
    the first number found in it (digits plus an optional letter) is handed to
    sectionInformation, which fills ``legalSectionInformation`` in place.
    Nothing is returned.
    """
    response = requests.get(
        "https://uscode.house.gov/view.xhtml?path=/prelim@title18/part1/chapter"
        + str(chapterNumber)
        + "&edition=prelim",
        timeout=20,
    )
    chapterSoup = BeautifulSoup(response.content, "html.parser")
    sectionNumberPattern = r'\d+[a-zA-Z]?'
    for heading in getChapterSectionsTitles(chapterSoup):
        # Only the first number in the heading identifies the section.
        firstNumber = re.findall(sectionNumberPattern, heading)[0]
        sectionInformation(chapterNumber, firstNumber, legalSectionInformation)

In [22]:
# Scrape part 1 of Title 18: list its chapter numbers, then fill
# legalSectionInformation (page title -> statutory text) chapter by chapter.
# Each chapter costs one HTTP request plus one more per non-repealed section,
# so this cell takes a while; the print shows which chapter is in progress.
legalSectionInformation = {}
chapterNumbers = getChapterNumbers(1)
for i in chapterNumbers:
    print(f"In Chapter: {i}")
    getRelevantChapterContent(i, legalSectionInformation)

In Chapter: 1
In Chapter: 2
In Chapter: 3
In Chapter: 5
In Chapter: 7
In Chapter: 9
In Chapter: 10
In Chapter: 11
In Chapter: 11A
In Chapter: 11B
In Chapter: 12
In Chapter: 13
In Chapter: 15
In Chapter: 17
In Chapter: 17A
In Chapter: 18
In Chapter: 19
In Chapter: 21
In Chapter: 23
In Chapter: 25
In Chapter: 26
In Chapter: 27
In Chapter: 29
In Chapter: 31
In Chapter: 33
In Chapter: 35
In Chapter: 37
In Chapter: 39
In Chapter: 40
In Chapter: 41
In Chapter: 42
In Chapter: 43
In Chapter: 44
In Chapter: 45
In Chapter: 46
In Chapter: 47
In Chapter: 49
In Chapter: 50
In Chapter: 50A
In Chapter: 51
In Chapter: 53
In Chapter: 55
In Chapter: 57
In Chapter: 59
In Chapter: 61
In Chapter: 63
In Chapter: 65
In Chapter: 67
In Chapter: 68
In Chapter: 69
In Chapter: 71
In Chapter: 73
In Chapter: 74
In Chapter: 75
In Chapter: 77
In Chapter: 79
In Chapter: 81
In Chapter: 83
In Chapter: 84
In Chapter: 85
In Chapter: 87
In Chapter: 88
In Chapter: 89
In Chapter: 90
In Chapter: 90A
In Chapter: 91
In Chapter:

In [23]:
# Dump every scraped section to a text file as "<title>:\n<text>\n".
# The encoding is pinned to UTF-8: the scraped text can contain non-ASCII
# characters (for example the section sign), and the platform default
# encoding is not guaranteed to be able to write them.
file_path = 'part1Content.txt'
with open(file_path, 'w', encoding='utf-8') as file:
    for key, value in legalSectionInformation.items():
        file.write(f"{key}:\n{value}\n")

In [27]:
import google.generativeai as genai
import os
from dotenv import load_dotenv, dotenv_values 
from langchain_community.llms import Ollama


def generateInstructionInput(extractedText, modelName="llama3.1:8b-instruct-q2_K"):
    """Ask an Ollama model to write one open-ended question about a section.

    Args:
        extractedText: Section text placed at the top of the prompt.
        modelName: Ollama model tag to query. Defaults to the tag that used to
            be hard-coded in this function, so existing calls are unaffected.

    Returns:
        Whatever ``Ollama.invoke`` returns for the prompt (the generated question).
    """
    question_prompt = f"""
        Text: {extractedText}

        Instruction:
        Generate a question based only on the section's content. Do not give me multiple choice or true/false questions. 
        Please return only the question and no additional text.
        """

    model = Ollama(model=modelName)
    return model.invoke(question_prompt)

def generateInstructionOutput(extractedText, generated_question, modelName="llama3.1:8b-instruct-q2_K"):
    """Ask an Ollama model to answer a question using only the given section text.

    Args:
        extractedText: Section text placed at the top of the prompt.
        generated_question: Question appended to the prompt after the instruction.
        modelName: Ollama model tag to query. Defaults to the tag that used to
            be hard-coded in this function, so existing calls are unaffected.

    Returns:
        Whatever ``Ollama.invoke`` returns for the prompt (the generated answer).
    """
    answer_prompt = f"""
        Text: {extractedText}

        Instruction:
        Answer the following question based only on the section's content.

        Question: {generated_question}
        """

    model = Ollama(model=modelName)
    return model.invoke(answer_prompt)

In [29]:
import ast 
import torch

# Pick the Apple-silicon "mps" backend when torch reports it as available,
# otherwise fall back to the CPU; announce only the mps case.
# NOTE(review): mps_device is not referenced by any of the visible cells below.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
if mps_device.type == "mps":
    print("mps used")
    
    
# Build the instruction-tuning dataset: for each scraped section, generate a
# question from its text, then an answer to that question, and keep the pair.
# Every pair is printed as soon as it is produced.
instructionTuneDataset = list()
for key, sectionText in legalSectionInformation.items():
    sectionContent = f"Name of the Section: {key}\n Definition: {sectionText}\n"
    question = generateInstructionInput(sectionContent)
    answer = generateInstructionOutput(sectionContent, question)
    dataPoint = dict(input=question, output=answer)
    print(dataPoint)
    instructionTuneDataset.append(dataPoint)

mps used


KeyboardInterrupt: 

In [1]:
import pandas as pd

In [2]:
# Load the question/answer pairs from CSV; the next cell reads its "input" and "output" columns.
# NOTE(review): no visible cell writes this file — presumably it was exported from
# instructionTuneDataset in an earlier session; confirm where it comes from.
df = pd.read_csv("instruction tune data/instructionTuneDataset.csv")

In [6]:
# Turn the frame back into a list of {"input": ..., "output": ...} dicts.
# to_dict(orient="records") does this in one call instead of a Python-level
# iterrows() loop; the `result` alias is kept because the original cell defined it.
instructionTuneDataset = result = df[["input", "output"]].to_dict(orient="records")

In [10]:
import lamini
import os
from dotenv import load_dotenv, dotenv_values 

# Load variables from a .env file into the environment, then pass the Lamini key
# to the client library. The key is never written into the notebook itself.
# os.getenv returns None when LAMINI_API_KEY is unset, so a missing key is not reported here.
load_dotenv()
lamini.api_key = os.getenv("LAMINI_API_KEY")

In [13]:
from lamini import Lamini

# Create a Lamini client for the Llama-2 7B base model and submit a tuning job
# on the question/answer pairs. The call's return value is the cell output.
# `data` stays bound at kernel level; get_output() further down searches it.
llm = Lamini(model_name='meta-llama/Llama-2-7b-hf')
data = instructionTuneDataset
llm.tune(data_or_dataset_id=data)

Data pairs uploaded to local.

Your dataset id is: 116007eae54a7aa8d102c3d46a6985afe4488145952ace2caf55b4aaa655bdb6 . Consider using this in the future to train using the same data. 
Eg: llm.train(data_or_dataset_id='116007eae54a7aa8d102c3d46a6985afe4488145952ace2caf55b4aaa655bdb6')
Tuning job submitted! Check status of job 11793 here: https://api.lamini.ai/train/11793


{'job_id': 11793,
 'status': 'CREATED',
 'dataset_id': '116007eae54a7aa8d102c3d46a6985afe4488145952ace2caf55b4aaa655bdb6'}

In [23]:
# Spot-check: ask one question about 18 USC 9 and print the raw completion.
# NOTE(review): llm was constructed with the base model name; whether this call
# reaches the tuned model is not visible from this notebook — confirm.
print(llm.generate("What type of entity can own a vessel that is considered to be a vessel of the United States under 18 USC 9?"))


A corporation organized under the laws of a foreign country.
A corporation organized under the laws of a state of the United States.
A corporation organized under the laws of the United States.
A corporation organized under the laws of the District of Columbia.
A corporation organized under the laws of a foreign country is not considered to be a vessel of the United States under 18 USC 9.
A corporation organized under the laws of a state of the United States is not considered to be a vessel of the United States under 18 USC 9.
A corporation organized under the laws of the United States is not considered to be a vessel of the United States under 18 USC 9.
A corporation organized under the laws of the District of Columbia is not considered to be a vessel of the United States under 18 USC 9.
A corporation organized under the laws of a foreign country is considered to be a vessel of the United States under 18 USC 9.
A corporation organized under the laws of a state of the United States is

In [19]:
def get_output(input_value, entries=None):
    """Look up the stored answer for an exact question string.

    Args:
        input_value: Question text, compared with ``==`` against each entry's
            "input" value.
        entries: Iterable of {"input": ..., "output": ...} dicts to search.
            When omitted, the notebook-level ``data`` list is searched, which
            is what this function always did before the parameter existed.

    Returns:
        The "output" of the first matching entry, or the string
        "Output not found" when no entry matches.
    """
    if entries is None:
        # Backward-compatible default: the kernel-global dataset.
        entries = data
    return next(
        (entry["output"] for entry in entries if entry["input"] == input_value),
        "Output not found",
    )


# Print the stored dataset answer for one known question, looked up by exact
# string match; get_output returns "Output not found" if the text differs at all.
input_value = "What is included within the term \"special maritime and territorial jurisdiction of the United States\"?"
output = get_output(input_value)
print(output) 

The term "special maritime and territorial jurisdiction of the United States" includes:

(1) The high seas, any other waters within the admiralty and maritime jurisdiction of the United States and out of the jurisdiction of any particular State, and any vessel belonging in whole or in part to the United States or any citizen thereof, or to any corporation created by or under the laws of the United States, or of any State, Territory, District, or possession thereof, when such vessel is within the admiralty and maritime jurisdiction of the United States and out of the jurisdiction of any particular State.

(2) Any vessel registered, licensed, or enrolled under the laws of the United States, and being on a voyage upon the waters of any of the Great Lakes, or any of the waters connecting them, or upon the Saint Lawrence River where the same constitutes the International Boundary Line.

(3) Any lands reserved or acquired for the use of the United States, and under the exclusive or concurren