# Scraping text from Mississippi Appellate Court Opinion PDFs

## install pdfplumber

In [None]:
pip install pdfplumber

## import required packages

In [None]:
import pdfplumber
import os
import re
import pandas as pd
from itertools import compress
import numpy as np
from tqdm import tqdm
from datetime import datetime

## define the years to process and the input directory

In [None]:
# Years to process. range() excludes its stop value, hence the explicit +1
# so that 2023 is included.
years = list(range(1996, 2023 + 1))

# Folder that holds one sub-folder of opinion PDFs per year. Built with
# os.path.join so the separator matches the host OS; the trailing "" makes
# join() append a final separator, so the value still ends in one.
root_dir = os.path.join("..", "data", "court_opinions", "")

## extract text from PDF documents
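- Footnotes are removed by cropping each page just above the thin horizontal rule that separates them from the body text.
- Page numbers are removed: on pages after the first that have no footnote rule, the last line of extracted text is dropped; on pages with a footnote rule, everything below the rule is cropped away.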

In [None]:
def _extract_page_text(page):
    """Return one page's share of the document text, without footnotes or page number.

    Parameters
    ----------
    page : pdfplumber page object
        Must expose ``rects``, ``bbox``, ``page_number``, ``crop`` and
        ``extract_text``.

    Returns
    -------
    str
        The page text. It is prefixed with a newline unless this is the first
        page and it has no thin rule.
    """
    # Rectangles less than 1 unit tall are thin horizontal rules; the page is
    # cut just above the last one, which drops the footnotes beneath it.
    rules = [rect for rect in page.rects if rect['bottom'] - rect['top'] < 1]
    if rules:
        bb = (page.bbox[0], page.bbox[1], page.bbox[2], rules[-1]['top'])
        # `or ""`: extract_text() may return None for a page without text in
        # some pdfplumber versions; avoid writing the literal string "None".
        return "\n" + (page.crop(bbox=bb).extract_text() or "")

    body = page.extract_text() or ""
    if page.page_number > 1:
        # No rule on this page: the last text line is treated as the page
        # number and discarded.
        return "\n" + "\n".join(body.split("\n")[:-1])
    # First page without a rule is kept verbatim.
    return body


def opinion_parser(years, root_dir):
    """Extract the text of every opinion PDF and pickle one DataFrame per year.

    For each year, every ``.pdf`` file in ``<root_dir>/<year>`` is read with
    pdfplumber, its pages are cleaned of footnotes and page numbers, and the
    result is stored as a two-column DataFrame (``id``, ``text``) in
    ``../data/<year>_Mississippi_Appellate_Court_Decisions.pkl``.

    Parameters
    ----------
    years : iterable of int
        Years to process; each must exist as a sub-folder of ``root_dir``.
    root_dir : str
        Folder containing the per-year sub-folders. A trailing path separator
        is accepted but no longer required.

    Returns
    -------
    None
        Results are written to disk; progress is printed to stdout.
    """
    print("+++ " + str(datetime.now()) + " +++\n")
    print("//OPINIONPDFPARSER")

    for year in years:
        print("+++ " + str(year))
        year_dir = os.path.join(root_dir, str(year))
        # Skip stray non-PDF entries (e.g. OS metadata files) that would make
        # pdfplumber.open() fail.
        pdf_files = [name for name in os.listdir(year_dir)
                     if name.lower().endswith(".pdf")]

        docs = []
        for file in tqdm(pdf_files):
            with pdfplumber.open(os.path.join(year_dir, file)) as pdf:
                # Join once instead of growing a string page by page; no page
                # objects are kept alive after the file is closed.
                text = "".join(_extract_page_text(page) for page in pdf.pages)
            # splitext removes only the real extension; the previous
            # re.sub(".pdf", ...) also ate any "<char>pdf" inside the name.
            docs.append((os.path.splitext(file)[0].upper(), text))

        df = pd.DataFrame(docs, columns=['id', 'text'])
        df.to_pickle(os.path.join(
            "..", "data",
            str(year) + "_Mississippi_Appellate_Court_Decisions.pkl"))
        
# Run the extraction for every configured year.
opinion_parser(years, root_dir)