# Scraping text from Mississippi Appellate Court Opinion PDFs

### install pdfplumber

In [26]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Tom Smith\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


### import required packages

In [27]:
import pdfplumber
import os
import re
import pandas as pd
from itertools import compress
import numpy as np
from tqdm import tqdm
from datetime import datetime

### define important objects

In [28]:
# Opinion years covered by the scrape: 1996 through 2023, inclusive.
years = [*range(1996, 2024)]

# Folder holding the downloaded court opinions. Point this at wherever the
# opinions were stored after running the justia_scrape function; the trailing
# separator is required because paths are built from it by concatenation.
root_dir = "C:\\Users\\Tom Smith\\Box\\Data\\justia_law\\court_opinions\\"

### extract text from PDF documents
- Footnotes are removed.
- Page numbers are removed.

*Tested on Mississippi appellate court opinions; efficacy may vary for other states and courts depending on formatting.*

In [29]:
def _extract_page_text(page):
    """Return the text chunk that one pdfplumber page adds to a document.

    The page is handled in one of three ways:

    * If the page contains at least one rectangle less than 1 unit tall
      (presumably the rule drawn above the footnotes), the page is cropped to
      the region above the last such rectangle, so the text below it is
      dropped. The chunk is prefixed with a newline.
    * Otherwise, on any page after the first, the last extracted line is
      dropped (per the notebook notes, the page number). The chunk is prefixed
      with a newline.
    * Otherwise (first page, no thin rectangle) the text is returned as is.

    A page with no extractable text contributes an empty string rather than
    the literal "None".
    """
    thin_rects = [rect for rect in page.rects if rect['bottom'] - rect['top'] < 1]
    if thin_rects:
        x0, top, x1, _ = page.bbox
        body = page.crop(bbox=(x0, top, x1, thin_rects[-1]['top']))
        return "\n" + (body.extract_text() or "")

    # extract_text() may return None for a page without text.
    raw_text = page.extract_text() or ""
    if page.page_number > 1:
        return "\n" + "\n".join(raw_text.split("\n")[:-1])
    return raw_text


def _extract_opinion_text(pdf_path):
    """Return the concatenated page text of the PDF at ``pdf_path``.

    Extraction is best effort: if the file cannot be opened or a page fails,
    a warning naming the file is written and whatever text was collected
    before the failure is returned (an empty string if nothing was read).
    """
    chunks = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                chunks.append(_extract_page_text(page))
    except Exception as err:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt can still stop a long run; tqdm.write keeps the
        # message from garbling the progress bar.
        tqdm.write("!!! could not fully parse " + str(pdf_path) + ": " + repr(err))
    return "".join(chunks)


def opinion_parser(court, years, root_dir):
    """Extract opinion text for one court and pickle one DataFrame per year.

    For every year, each file in ``<root_dir>/<court>/<year>`` is read with
    pdfplumber and turned into an ``(id, text)`` row, where ``id`` is the
    upper-cased file name without its trailing ``.pdf`` extension. The rows
    are written to ``data/<year>_<court>.pkl`` relative to the current working
    directory; the ``data`` folder is created if it does not exist.

    Parameters
    ----------
    court : str
        Name of the court folder inside ``root_dir``.
    years : iterable
        Year folder names (str or int) to process.
    root_dir : str
        Directory that contains one folder per court.

    Returns
    -------
    None
        Results are written to disk only.

    Raises
    ------
    OSError
        If a year folder cannot be listed or the output cannot be written.
        Failures on individual PDFs are reported and do not stop the run.
    """
    print("+++ " + str(datetime.now()) + " +++\n")
    print("//OPINIONPDFPARSER")

    out_dir = "data"
    os.makedirs(out_dir, exist_ok=True)

    for year in years:
        print("+++ " + str(year))
        year_dir = os.path.join(root_dir, str(court), str(year))
        docs = []
        # Sort so the row order does not depend on the file system.
        for file in tqdm(sorted(os.listdir(year_dir))):
            # Strip only a real trailing ".pdf" extension; the unescaped,
            # unanchored pattern used before removed any character followed
            # by "pdf" anywhere in the name.
            doc_id = re.sub(r"\.pdf$", "", str(file), flags=re.IGNORECASE).upper()
            docs.append((doc_id, _extract_opinion_text(os.path.join(year_dir, str(file)))))
        df = pd.DataFrame(docs, columns=['id', 'text'])
        df.to_pickle(os.path.join(out_dir, str(year) + "_" + str(court) + ".pkl"))

### Create list of all courts that have been scraped by the *justia_law* function

In [30]:
# Keep only the scraped court folders whose name contains 'texas', then
# display the resulting list.
court_list = [name for name in os.listdir(root_dir) if re.search('texas', name) is not None]
court_list

### Sequentially create a list of all available years for each court; this list will be iterated through by the *opinion_parser* function

In [32]:
# Show the year folders present for the first court in court_list.
os.listdir(f"{root_dir}{court_list[0]}")

['2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023']

### Put it all together

In [None]:
# Parse every selected court in turn; the folders found on disk under each
# court are passed as the years to process.
for court in court_list:
    opinion_parser(court, os.listdir(root_dir + court), root_dir)