# Scraping bankruptcy court cases

In [68]:
from bs4 import BeautifulSoup
import requests

# Search page for the query "CAR", asking for up to 200 results per page.
SEARCH_URL = 'https://www.tnwb.uscourts.gov/Search/Search.aspx?zoom_sort=0&zoom_xml=0&zoom_query=CAR&zoom_per_page=200&zoom_and=1&zoom_cat%5B%5D=-1'

# requests has no default timeout, so without one a stalled server would hang
# this cell forever. raise_for_status() stops the notebook on a 4xx/5xx reply
# instead of silently parsing an error page as if it were search results.
response = requests.get(SEARCH_URL, timeout=30)
response.raise_for_status()
doc = BeautifulSoup(response.text, 'html.parser')

doc


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head id="Head1"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>
	TNWB :: Search
</title>
<!-- This style sheet imports the other style sheets -->
<link charset="utf-8" href="../CSS/master.css" rel="stylesheet" type="text/css"/></head>
<body>
<!-- Toggle on/off the grid used for this site simply by uncommenting this div -->
<!-- <div id="grid"><img src="img/grid.png" alt="" width="1090" height="1380"></div> -->
<!-- <div id="grid"><img src="img/grid13col.png" alt="" width="940" height="1380"></div>  -->
<div class="wrapper">
<div id="header">
<div style="clear:both; float:right; margin-top:10px; margin-right:5px">
<form action="https://www.tnwb.uscourts.gov/Search/search.aspx" method="get">
<table cellpadding="0px" cellspacing="0px" style="height:40px; ">
<tr>
<td>
<input maxlength="30" name="

In [69]:
# Collect every element that carries either of the two result classes,
# then show how many were found.
result_selector = '.result_block, .result_altblock'
items = doc.select(result_selector)
len(items)

132

In [70]:
# For each search hit, print the link target, the link text (case name),
# the category label, and the three fields packed into the info line.

for result in items:
    print('------')
    link = result.find('a')
    print(link['href'])  # url
    print(link.text)     # case name
    category_tag = result.find(class_='category')
    print(category_tag.text.strip())
    # The info line contains non-breaking spaces (\xa0); turn them into
    # regular spaces first so the '  -  ' separator can be split on.
    infoline = result.find(class_='infoline').text
    fields = infoline.replace('\xa0', ' ').split('  -  ')
    terms_match, size, pdf_url = fields[0].strip(), fields[1].strip(), fields[2].strip()
    for value in (terms_match, size, pdf_url):
        print(value)

------
https://www.tnwb.uscourts.gov/Opinions/jdl/pdf/jdl20071024nn1.pdf#search=%22car%22
JDL: 04-24318 Jacquelline D. Black
[Judges' Opinions]
Terms matched:  1
102k
URL: https://www.tnwb.uscourts.gov/Opinions/jdl/pdf/jdl20071024nn1.pdf
------
https://www.tnwb.uscourts.gov/Opinions/whb/pdf/whb19950809xn1.pdf#search=%22car%22
WHB: 95-26401 Mary Lucy Cooper
[Judges' Opinions]
Terms matched:  1
27k
URL: https://www.tnwb.uscourts.gov/Opinions/whb/pdf/whb19950809xn1.pdf
------
https://www.tnwb.uscourts.gov/Opinions/ghb/pdf/ghb19980812xn1.pdf#search=%22car%22
GHB: 97-12368 Billy G. Woffard
[Judges' Opinions]
Terms matched:  1
71k
URL: https://www.tnwb.uscourts.gov/Opinions/ghb/pdf/ghb19980812xn1.pdf
------
https://www.tnwb.uscourts.gov/Opinions/jdl/pdf/jdl19970918pn1.pdf#search=%22car%22
JDL: 97-30580 Mary Chrlis Hurst
[Judges' Opinions]
Terms matched:  1
32k
URL: https://www.tnwb.uscourts.gov/Opinions/jdl/pdf/jdl19970918pn1.pdf
------
https://www.tnwb.uscourts.gov/Opinions/mrh/pdf/mrh20220

In [71]:
# Build one dict per search hit; `rows` becomes the input for the DataFrame.
rows = []

for result in items:
    link = result.find('a')
    record = {
        'url': link['href'],
        'name': link.text,
        'category': result.find(class_='category').text.strip(),
    }

    # Non-breaking spaces (\xa0) are normalised to plain spaces so that the
    # '  -  ' separator between the info-line fields can be split on.
    infoline = result.find(class_='infoline').text
    fields = infoline.replace('\xa0', ' ').split('  -  ')
    record['terms matched'] = fields[0].strip()
    record['size'] = fields[1].strip()
    record['pdf url'] = fields[2].strip()

    rows.append(record)

In [72]:
import pandas as pd

# `rows` is a list of flat dicts, so the plain DataFrame constructor yields
# one column per key and one row per dict.
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,url,name,category,terms matched,size,pdf url
0,https://www.tnwb.uscourts.gov/Opinions/jdl/pdf...,JDL: 04-24318 Jacquelline D. Black,[Judges' Opinions],Terms matched: 1,102k,URL: https://www.tnwb.uscourts.gov/Opinions/jd...
1,https://www.tnwb.uscourts.gov/Opinions/whb/pdf...,WHB: 95-26401 Mary Lucy Cooper,[Judges' Opinions],Terms matched: 1,27k,URL: https://www.tnwb.uscourts.gov/Opinions/wh...
2,https://www.tnwb.uscourts.gov/Opinions/ghb/pdf...,GHB: 97-12368 Billy G. Woffard,[Judges' Opinions],Terms matched: 1,71k,URL: https://www.tnwb.uscourts.gov/Opinions/gh...
3,https://www.tnwb.uscourts.gov/Opinions/jdl/pdf...,JDL: 97-30580 Mary Chrlis Hurst,[Judges' Opinions],Terms matched: 1,32k,URL: https://www.tnwb.uscourts.gov/Opinions/jd...
4,https://www.tnwb.uscourts.gov/Opinions/mrh/pdf...,MRH: 20-20967 Jacob Braxton Herring 20-00094,[Judges' Opinions],Terms matched: 1,303k,URL: https://www.tnwb.uscourts.gov/Opinions/mr...


In [73]:
# Write the scraped table to disk; index=False leaves out the row-number column.
output_path = "court-cases.csv"
df.to_csv(output_path, index=False)