# Scraping text from Mississippi Appellate Court Opinion PDFs

## install pdfplumber

In [13]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


## import required packages

In [1]:
import pdfplumber
import os
import re
import pandas as pd
from itertools import compress
import numpy as np
from tqdm import tqdm
from datetime import datetime

## define the years to process and the input directory

In [2]:
# Years of opinions to process (1996 through 2023, inclusive).
years = list(range(1996, 2023 + 1))

# Directory that holds one sub-folder of opinion PDFs per year.
# Built with os.path.join so the separator matches the host OS; the final ""
# keeps a trailing separator, so appending a year directly to this string
# still yields a valid path.
root_dir = os.path.join("..", "data", "court_opinions", "")

## extract text from PDF documents
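- Footnotes are removed: when a page contains a thin horizontal rule (a rectangle less than 1 unit tall), only the part of the page above the last such rule is kept.
- Page numbers are removed: on pages after the first that have no such rule, the last line of the extracted text is dropped.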

In [3]:
def _page_body_text(page):
    """Return one page's contribution to the document text.

    Parameters
    ----------
    page : pdfplumber page object
        Must expose ``rects``, ``bbox``, ``page_number``, ``crop`` and
        ``extract_text``.

    Returns
    -------
    str
        * If the page has at least one thin rectangle (height < 1), the page is
          cropped above the ``top`` of the last such rectangle in
          ``page.rects`` order and the text of the cropped region is returned,
          prefixed with a newline. The rule is presumably the footnote
          separator, so this drops the footnotes (and anything below them).
        * Otherwise, for pages after the first, the last extracted line is
          dropped (presumably the page number) and the rest is returned,
          prefixed with a newline.
        * Otherwise (first page, no rule) the extracted text is returned as is.
    """
    # bottom - top is the rectangle's height; a height below 1 is a drawn line.
    thin_rules = [rect for rect in page.rects if rect['bottom'] - rect['top'] < 1]

    if thin_rules:
        cut_at = thin_rules[-1]['top']
        body = page.crop(bbox=(page.bbox[0], page.bbox[1], page.bbox[2], cut_at))
        # extract_text() may yield None for a region without text; treat it as
        # empty instead of letting the literal string "None" leak into the corpus.
        return "\n" + (body.extract_text() or "")

    raw = page.extract_text() or ""
    if page.page_number > 1:
        return "\n" + "\n".join(raw.split("\n")[:-1])
    return raw


def opinion_parser(years, root_dir, out_dir="..\\data\\"):
    """Extract the text of every opinion PDF, one pickled DataFrame per year.

    For each year, every ``*.pdf`` file (case-insensitive) in
    ``<root_dir>/<year>`` is opened with pdfplumber, its pages are reduced to
    body text by ``_page_body_text`` and concatenated, and the result is stored
    as a row ``(id, text)`` where ``id`` is the upper-cased file name without
    its ``.pdf`` extension. The rows of one year are written to
    ``<out_dir>/<year>_Mississippi_Appellate_Court_Decisions.pkl``.

    Parameters
    ----------
    years : iterable
        Years to process; each is converted with ``str`` to form the folder name.
    root_dir : str
        Directory containing one sub-folder per year.
    out_dir : str, optional
        Directory the pickle files are written to. It is created if missing.
        The default is the Windows-style relative path the notebook has always
        used.

    Returns
    -------
    None
        Results are written to disk; progress is printed.

    Notes
    -----
    Files are processed in sorted name order so the row order of the output is
    reproducible. Directory entries that do not end in ``.pdf`` are skipped.
    """
    print("+++ " + str(datetime.now()) + " +++\n")
    print("//OPINIONPDFPARSER")

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for year in years:
        print("+++ " + str(year))
        year_dir = os.path.join(root_dir, str(year))
        pdf_names = sorted(
            name for name in os.listdir(year_dir) if name.lower().endswith(".pdf")
        )

        docs = []
        for name in tqdm(pdf_names):
            with pdfplumber.open(os.path.join(year_dir, name)) as pdf:
                text = "".join(_page_body_text(page) for page in pdf.pages)
            # splitext removes only the trailing extension; the previous
            # unescaped regex ".pdf" also deleted "<any char>pdf" inside the name.
            docs.append((os.path.splitext(name)[0].upper(), text))

        df = pd.DataFrame(docs, columns=['id', 'text'])
        df.to_pickle(
            os.path.join(out_dir, str(year) + "_Mississippi_Appellate_Court_Decisions.pkl")
        )
        
# Run the extraction for every configured year (one pickle file per year).
opinion_parser(years, root_dir)

+++ 2023-11-01 21:07:51.416584 +++

//OPINIONPDFPARSER
+++ 1996


  0%|          | 0/23 [00:00<?, ?it/s]

100%|██████████| 23/23 [00:08<00:00,  2.84it/s]


+++ 1997


100%|██████████| 21/21 [00:07<00:00,  2.76it/s]


+++ 1998


100%|██████████| 31/31 [00:16<00:00,  1.83it/s]


+++ 1999


100%|██████████| 73/73 [00:32<00:00,  2.23it/s]


+++ 2000


100%|██████████| 450/450 [03:14<00:00,  2.31it/s]


+++ 2001


100%|██████████| 502/502 [03:33<00:00,  2.35it/s]


+++ 2002


100%|██████████| 499/499 [02:59<00:00,  2.78it/s]


+++ 2003


100%|██████████| 537/537 [03:28<00:00,  2.58it/s]


+++ 2004


100%|██████████| 576/576 [04:19<00:00,  2.22it/s]


+++ 2005


100%|██████████| 649/649 [05:40<00:00,  1.91it/s]


+++ 2006


100%|██████████| 615/615 [07:01<00:00,  1.46it/s]


+++ 2007


100%|██████████| 563/563 [08:42<00:00,  1.08it/s]


+++ 2008


100%|██████████| 526/526 [07:20<00:00,  1.19it/s]


+++ 2009


100%|██████████| 549/549 [07:55<00:00,  1.16it/s]


+++ 2010


100%|██████████| 443/443 [07:20<00:00,  1.01it/s]


+++ 2011


100%|██████████| 394/394 [07:52<00:00,  1.20s/it]


+++ 2012


100%|██████████| 497/497 [09:55<00:00,  1.20s/it]


+++ 2013


100%|██████████| 590/590 [10:45<00:00,  1.09s/it]


+++ 2014


100%|██████████| 472/472 [08:43<00:00,  1.11s/it]


+++ 2015


100%|██████████| 434/434 [07:37<00:00,  1.05s/it]


+++ 2016


100%|██████████| 493/493 [08:13<00:00,  1.00s/it]


+++ 2017


100%|██████████| 469/469 [07:20<00:00,  1.07it/s]


+++ 2018


100%|██████████| 409/409 [05:18<00:00,  1.29it/s]


+++ 2019


100%|██████████| 409/409 [06:37<00:00,  1.03it/s]


+++ 2020


100%|██████████| 409/409 [07:24<00:00,  1.09s/it]


+++ 2021


100%|██████████| 318/318 [06:14<00:00,  1.18s/it]


+++ 2022


100%|██████████| 303/303 [05:17<00:00,  1.05s/it]


+++ 2023


100%|██████████| 242/242 [03:57<00:00,  1.02it/s]
