# Scraping Mississippi Appellate Court Opinion PDFs from law.justia.com

Data for the grant are collected from law.justia.com, a repository of court opinions across the United States. <br>
Mississippi Court of Appeals opinions can be found at URLs with the following pattern: https://law.justia.com/cases/mississippi/court-of-appeals/2023/
- /cases/ targets justia's case repository
- /mississippi/ targets the state of Mississippi
- /court-of-appeals/ targets the Court of Appeals in the target state
- /2023/ narrows the cases to a single year, in this case 2023 [this is necessary to ensure all cases are located on the same page, rather than across multiple pages]

When generalizing the code below to collect data from all available years, we will need a 'years' object: a list of all available years. Using this list, you can iterate the code over every version of this URL with a Python 'for' loop. 

Start out by testing this code to better understand how these loops operate:

    years = [2022,2021,2020]
    root_url = "https://law.justia.com/cases/mississippi/court-of-appeals/"
    for year in years:
         print(root_url + str(year) + "/")

## install beautifulsoup4 (and other important libraries if you need them!)

In [None]:
import os
os.getcwd()  # evaluates to the current working directory; relative paths such as '..' are resolved against it

In [None]:
# pip install beautifulsoup4

## import required packages

In [None]:
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm
import re
import pandas as pd
import shutil

## collecting the citations for all cases from a single year (2020 in the example below)
### scrape the source code from the target web page using beautifulsoup4

In [None]:
# NOTE: `url` is first assigned in the cell that follows; run that cell before this one, otherwise this raises NameError.
url

In [None]:
# Choose the year to scrape and build the URL of its listing page.
year = 2020
url = f"https://law.justia.com/cases/mississippi/court-of-appeals/{year}/"

# Request the page with a browser-like User-Agent, then parse the returned HTML.
req = requests.get(
    url,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
)
soup = BeautifulSoup(req.text, "html.parser")

In [None]:
req  # show the response object returned by requests.get for the listing page

In [None]:
# Gather the href of the first link inside every pagination <span>,
# skipping empty values and keeping each target only once, in page order.
pages = []
for span in soup.find_all("span", {"class": "pagination page"}):
    anchors = span.find_all("a", href=True)
    if not anchors:
        continue
    href = anchors[0].get('href')
    if href and href not in pages:
        pages.append(href)

In [None]:
# The first listing page is `url` itself; every pagination href gets the site host prepended.
urls = [url, *("https://law.justia.com" + page for page in pages)]

In [None]:
# Fetch every listing page, sending the same browser-like User-Agent each time.
browser_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
reqs = [requests.get(page_url, headers=browser_headers) for page_url in urls]

### locate the citations in the raw html text by targeting '\<span\>' HTML tags with the class 'justia-citation'

Then use the get_text() function from beautifulsoup4 to clean the HTML code, leaving you with just the citation text.

In [None]:
# Every opinion on the parsed page sits in a <div> carrying this exact class string.
opinions = soup.find_all(
    name="div",
    attrs={"class": "has-padding-content-block-30 -zb"},
)

In [None]:
# Take the text of the first 'justia-citation' <span> found in each opinion block.
citations = [
    opinion.find_all("span", {"class": "justia-citation"})[0].get_text()
    for opinion in opinions
]

### locate the case page URLs by targeting the '\<a\>' tags (with an 'href' attribute) that have the class 'case-name'

In [None]:
# Build the absolute case-page URL from the first 'case-name' anchor of each
# opinion; opinions without such an anchor are recorded as 'No Link'.
anchor_lists = (
    opinion.find_all("a", {"class": "case-name"}, href=True)
    for opinion in opinions
)
links = [
    'https://law.justia.com' + anchors[0].get('href') if anchors else 'No Link'
    for anchors in anchor_lists
]

### now take all of this and loop it through all pages of opinions

In [None]:
# Walk every listing page and record one citation and one link per opinion,
# so the two lists stay index-aligned for the data frame built from them.
citations = []
links = []
print("Parsing " + str(len(reqs)) + " Pages")
for req in reqs:
    soup = BeautifulSoup(req.text, "html.parser")
    opinions = soup.find_all("div", {"class": "has-padding-content-block-30 -zb"})

    for opinion in opinions:
        cite_spans = opinion.find_all("span", {"class": "justia-citation"})
        anchors = opinion.find_all("a", {"class": "case-name"}, href=True)

        # An opinion without a citation span used to raise IndexError here.
        # It is now recorded with an empty citation and marked 'No Link',
        # the same marker used for opinions that have no case link.
        citation = cite_spans[0].get_text() if cite_spans else ''
        if cite_spans and anchors:
            link = 'https://law.justia.com' + anchors[0].get('href')
        else:
            link = 'No Link'

        # append() avoids re-copying the whole list for every page.
        citations.append(citation)
        links.append(link)

### create data frame consisting of two columns: citation and URL of the case page that hosts the opinion PDF
Drop all cases with no link.<br>Print the number of collected and dropped cases.

In [None]:
# Pair each citation with its link. Naming the columns explicitly keeps the
# 'url' lookup below valid even when no opinions were found (an empty list of
# rows would otherwise give a frame with no columns and raise KeyError).
df = pd.DataFrame(
    [{'citation': citation, 'url': link} for citation, link in zip(citations, links)],
    columns=['citation', 'url'],
)

# Separate the rows that have no case link, report them, and drop them.
dropped = df[df['url'] == 'No Link']
r2 = 'Dropped: ' + str(len(dropped))
df = df.drop(dropped.index)
r1 = 'Collected: ' + str(len(df))
print(str(year))
print(r1)
print(r2)

Now that we have scraped all of the metadata, we need to download all available PDFs for these cases. To do this, we need to individually query each case's URL and download the PDF from the associated web page (if one is available).

In [None]:
print(os.getcwd()) # check your current working directory with the 'os' library
out_dir = os.path.join("..", "data", "court_opinions", str(year)) # the leading '..' takes you up a level; os.path.join picks the right separator for your operating system
if os.path.isdir(out_dir):
    shutil.rmtree(out_dir) # delete a previous copy of the folder; skipped on a first run, when there is nothing to delete
os.makedirs(out_dir) # create the empty folder where the PDFs are saved, together with any missing parent folders

In [None]:
# Download the PDF behind every case page listed in df and save it as
# <citation>.pdf inside this year's output folder.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
out_dir = os.path.join("..", "data", "court_opinions", str(year))
os.makedirs(out_dir, exist_ok=True)  # does nothing when the folder already exists

for url, citation in tqdm(zip(df.url, df.citation), total=len(df)):
    # Upper-cased citation used in the error messages below.
    label = re.sub(r"\.pdf$", "", str(citation)).upper()

    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")
    pdf_anchors = soup.find_all("a", {"class": "pdf-icon pull-right has-margin-bottom-20"}, href=True)
    if not pdf_anchors:
        # Not every case page offers a PDF; report it and move on instead of
        # stopping the whole run with an IndexError.
        print('Error: ' + label + ' has no PDF link')
        continue

    link = pdf_anchors[0].get('href')
    if link.startswith("//"):
        link = "https:" + link  # protocol-relative href
    elif link.startswith("/"):
        link = "https://law.justia.com" + link  # site-relative href

    # 'wb' is a file mode, not a request option: passed as the second positional
    # argument it was sent as the query string. Send the usual headers instead.
    response = requests.get(link, headers=headers)
    if response.status_code == 200:
        # Replace characters that are not allowed in file names.
        filename = re.sub(r'[<>:"/\\|?*]', "_", str(citation)) + ".pdf"
        with open(os.path.join(out_dir, filename), "wb") as pdf:
            pdf.write(response.content)
    else:
        print('Error: ' +
              label +
              ' aborted with ' +
              str(response.status_code) +
              ' status')

Next we need to get ALL available years. <br> This will require that we put all of this code into a function, and iterate it through all years.

In [None]:
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm
import re
import pandas as pd
import shutil
from datetime import datetime

# Site root and request headers shared by every request the scraper makes.
_JUSTIA_ROOT = "https://law.justia.com"
_JUSTIA_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def _listing_urls(first_url, soup):
    """Return the URL of every listing page for one year, starting with first_url."""
    hrefs = []
    for span in soup.find_all("span", {"class": "pagination page"}):
        anchors = span.find_all("a", href=True)
        href = anchors[0].get('href') if anchors else ''
        # Keep each pagination target once, in the order it first appears.
        if href and href not in hrefs:
            hrefs.append(href)
    return [first_url] + [_JUSTIA_ROOT + href for href in hrefs]


def _parse_opinions(html):
    """Return a (citation, url) pair for each opinion block in a listing page."""
    soup = BeautifulSoup(html, "html.parser")
    pairs = []
    for opinion in soup.find_all("div", {"class": "has-padding-content-block-30 -zb"}):
        cite_spans = opinion.find_all("span", {"class": "justia-citation"})
        anchors = opinion.find_all("a", {"class": "case-name"}, href=True)
        # The citation becomes the PDF file name, so an opinion without one is
        # marked 'No Link' (and dropped by the caller) instead of raising
        # IndexError.
        if cite_spans and anchors:
            pairs.append((cite_spans[0].get_text(), _JUSTIA_ROOT + anchors[0].get('href')))
        else:
            pairs.append((cite_spans[0].get_text() if cite_spans else '', 'No Link'))
    return pairs


def _pdf_url(case_soup):
    """Return the absolute PDF URL found on a case page, or None if there is none."""
    anchors = case_soup.find_all("a", {"class": "pdf-icon pull-right has-margin-bottom-20"}, href=True)
    if not anchors or not anchors[0].get('href'):
        return None
    href = anchors[0].get('href')
    if href.startswith("//"):
        return "https:" + href  # protocol-relative href
    if href.startswith("/"):
        return _JUSTIA_ROOT + href  # site-relative href
    return href


def justia_scrape(years, state, court):
    """Download the available opinion PDFs of one court from law.justia.com.

    For each year, the listing pages under
    https://law.justia.com/cases/<state>/<court>/<year>/ are parsed for
    citations and case links, and each case's PDF is saved as
    ../data/court_opinions/<year>/<citation>.pdf relative to the current
    working directory. Progress and failures are printed; nothing is returned.

    Args:
        years: iterable of years to scrape.
        state: state name as it appears in the URL, e.g. 'mississippi'.
        court: court name as it appears in the URL, e.g. 'court-of-appeals'.
    """
    print("+++ " + str(datetime.now()) + " +++\n")
    print("//LAWJUSTIASCRAPER")
    print("//"+state.upper()+"/"+court.upper()+"\n")

    for year in years:
        print("+++ " + str(year))

        url = _JUSTIA_ROOT + "/cases/" + state.lower() + "/" + court.lower() + "/" + str(year) + "/"
        req = requests.get(url, headers=_JUSTIA_HEADERS)
        urls = _listing_urls(url, BeautifulSoup(req.text, "html.parser"))
        # The first page is already downloaded; only fetch the remaining ones.
        reqs = [req] + [requests.get(page_url, headers=_JUSTIA_HEADERS) for page_url in urls[1:]]

        print("Parsing: " + str(len(reqs)) + " page(s)")

        pairs = []
        for page in reqs:
            pairs.extend(_parse_opinions(page.text))

        # Naming the columns keeps the 'url' lookup valid when a year has no
        # opinions (a frame built from an empty list would have no columns).
        df = pd.DataFrame(pairs, columns=['citation', 'url'])
        dropped = df[df['url'] == 'No Link']
        df = df.drop(dropped.index)

        print('Collecting: ' + str(len(df)))
        print('Dropping: ' + str(len(dropped)))

        # makedirs creates missing parent folders and tolerates a folder left
        # over from an earlier run; files with the same name are overwritten.
        out_dir = os.path.join("..", "data", "court_opinions", str(year))
        os.makedirs(out_dir, exist_ok=True)

        for case_url, citation in tqdm(zip(df.url, df.citation), total=len(df)):
            # Upper-cased citation used in the error messages below.
            label = re.sub(r"\.pdf$", "", str(citation)).upper()

            case_req = requests.get(case_url, headers=_JUSTIA_HEADERS)
            link = _pdf_url(BeautifulSoup(case_req.text, "html.parser"))
            if link is None:
                # Not every case page offers a PDF; report it and keep going.
                print('Error: ' + label + ' has no PDF link')
                continue

            # The original passed 'wb' (a file mode) as the second positional
            # argument, which requests sends as the query string.
            response = requests.get(link, headers=_JUSTIA_HEADERS)
            if response.status_code == 200:
                # Replace characters that are not allowed in file names.
                filename = re.sub(r'[<>:"/\\|?*]', "_", str(citation)) + ".pdf"
                with open(os.path.join(out_dir, filename), "wb") as pdf:
                    pdf.write(response.content)
            else:
                print('Error: ' +
                    label +
                    ' aborted with ' +
                    str(response.status_code) +
                    ' status')
        print(" ")
                
# Years to scrape: 2004 through 2023 inclusive (range() excludes its stop value).
years = list(range(2004, 2024))
# Any state can be used here, written the way it appears in the Justia URL.
state = 'mississippi'
# Court names depend on the state; for Mississippi the URLs use
# 'court-of-appeals' or 'supreme-court'.
court = 'court-of-appeals'

In [None]:
# Run the scraper for every configured year of the chosen state and court.
justia_scrape(years, state, court)