# Scraping data from MD Case Search

In [59]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time, datetime, os
from pathlib import Path
import string

## Create folder with today's date
today = datetime.date.today()  
todaystr = today.isoformat()   
dir_path = os.getcwd()
save_dir = dir_path + "/" + todaystr
if not os.path.exists(save_dir):
    os.mkdir(save_dir)


## Set the target save location and Firefox preferences to auto download csv files
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", save_dir)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")

## Create the Firefox session
driver = webdriver.Firefox(firefox_profile=profile)
## Get past the disclaimer splash screen
driver.get("http://casesearch.courts.state.md.us/casesearch/inquiry-index.jsp")
check = driver.find_element_by_name("disclaimer")
check.click()
continue_button = driver.find_element_by_name("action")
continue_button.click()

assert "No results found." not in driver.page_source



#### Remember to insert dates

In [102]:
## The search page won't let you search for all cases, need to do 26 searches (A-Z)
county_list = ['Charles County', "Saint Mary's County", 'Calvert County']

for county in county_list
    for last_initial in string.ascii_lowercase:
        try:
            ## Enter search criteria
            last_name = driver.find_element_by_name("lastName")
            last_name.clear()
            last_name.send_keys(last_initial)
            driver.find_element_by_xpath("//select[@name='partyType']/option[text()='Plaintiff']").click()
            driver.find_element_by_xpath("//input[@name='site' and @value='CIVIL']").click()
            driver.find_element_by_xpath("//input[@name='courtSystem' and @value='D']").click()
            driver.find_element_by_xpath("//select[@name='countyName']/option[text()=" + county + "]").click()
            last_name = driver.find_element_by_name("filingStart")
            last_name.clear()
            last_name.send_keys("6/1/2019")  ## Turn to variable
            last_name = driver.find_element_by_name("filingEnd")
            last_name.clear()
            last_name.send_keys("9/30/2019")  ## Turn to variable
            driver.find_element_by_xpath("//input[@name='action' and @value='Search']").click()

            element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.LINK_TEXT, "CSV"))).click() ## need to add logic for when X isn't found
            driver.back()
            time.sleep(15)
        except:
            print('No cases for plantiffs starting with ', last_initial)
    

TimeoutException: Message: 


# Concatenate csv exports to dataframe

In [6]:
os.chdir(save_dir)
os.getcwd()

'/Users/joshuahogge/Projects/tiredLandlords/2019-09-24'

In [135]:
import csv
import glob

# Adding a header to each file
path = r'/Users/joshuahogge/Projects/tiredLandlords/2019-09-24' # use your path
all_files = glob.glob(path + "/*.csv")

for filename in all_files:
    with open(filename, newline='') as f:
        r = csv.reader(f)
        data = [line for line in r]
    with open(filename,'w',newline='') as f:
        w = csv.writer(f)
        w.writerow(['Case Number', 'Name', 'Date of Birth', 'Party Type', 'Court', 
                        'Case Type', 'Case Status', 'Filing Date', 'Case Caption'])
        w.writerows(data)

In [136]:
import pandas as pd
# Concatenate csv to single dataframe
df = pd.concat([pd.read_csv(f, index_col=0) for f in all_files])
df.shape

(2129, 8)

# Filtering down to the good stuff

In [137]:
df.shape

(2129, 8)

In [138]:
da = df[df['Case Type'] == 'FTPR'] # Failure to Pay Rent
da

Unnamed: 0_level_0,Name,isPerson,Party Type,Court,Case Type,Case Status,Filing Date,Case Caption
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D042LT18007202,"Mason, Christy",,Plaintiff,Charles County District Court,FTPR,Open,08/26/2019,"CHRISTY MASON vs. STEVE MCFARLANE, et al."
D042LT19002704,"Md Property Management, Llc",,Plaintiff,Charles County District Court,FTPR,Open,08/06/2019,"MD PROPERTY MANAGEMENT, LLC vs. BREANNA PARKER..."
D042LT19002670,"Bailey, Vincent",,Plaintiff,Charles County District Court,FTPR,Open,08/05/2019,"VINCENT BAILEY vs. BROOKE GRAY, et al."
D042LT19002042,"Banks, Curtis",,Plaintiff,Charles County District Court,FTPR,Closed,08/02/2019,CURTIS BANKS vs. KENDALL BROOKS
D042LT19002626,"Beckham, Zachriell",,Plaintiff,Charles County District Court,FTPR,Closed,08/08/2019,"ZACHRIELL BECKHAM vs. STEPHANIE VAUGHN, et al."
...,...,...,...,...,...,...,...,...
D042LT19003008,"Lee, Dana",,Plaintiff,Charles County District Court,FTPR,Closed,08/22/2019,DANA LEE vs. JUNITA FRIPP
D042LT19001602,"Legg, Daniel S",,Plaintiff,Charles County District Court,FTPR,Closed,07/30/2019,"DANIEL LEGG vs. AYRA JACKSON, et al."
D042LT19002043,Catholic Charites Of The Archdicese Of Washington,,Plaintiff,Charles County District Court,FTPR,Closed,08/02/2019,CATHOLIC CHARITES OF THE ARCHDICESE OF WASHING...
D042LT19002183,Coachman's Landing,,Plaintiff,Charles County District Court,FTPR,Closed,08/29/2019,"COACHMAN'S LANDING vs. LASHAWN THOMAS, et al."


# Getting case data

In [60]:
# case_data_df = pd.DataFrame()

# for index, row in da.iterrows():
#     print(index, row['Court'])

# Enter case data on query page
# court = row['Court'].split(' ')[0]
court = 'Charles'
driver.find_element_by_xpath("//select[@name='locationCode']/option[contains(text(), '%s')]" % court).click()
case_number = driver.find_element_by_name("caseId")
case_number.clear()
# case_number.send_keys(str(da['Case Number'][0])) ## Change to variable
case_number.send_keys('D042LT18007202')
try:
    driver.find_element_by_xpath("//input[@name='action' and @value='Get Case']").click()
except:
    print('Case number %s not found' % str(da['Case Number'][2])) ## Change to same variable above
    
# print(driver.page_source)



# # Mine data
# case_data = []
# for tr in driver.find_elements_by_xpath('//table//tr'):
#     tds = tr.find_elements_by_tag_name('td')
#     print([td.text for td in tds])
#     case_data.append([td.text for td in tds])
# case_clean = []
# # Cleaning out header info
# [case_clean.append(a) for a in case_data if len(a) == 2]
# # Casting do dict for import into pandas
# case_dict = {d[0]: d[1:] for d in case_clean}
# print(case_dict)



['CaseSearch', 'District Court of Maryland', '']

['Case Information']

['Court System:', 'District Court For Charles County - Civil']

['Location:', 'Charles']

['Case Number:', 'D-042-LT-18-007202']

['Title:', 'CHRISTY MASON vs. STEVE MCFARLANE, et al.']

['Case Type:', 'Failure to Pay Rent']

['Filing Date:', '08/26/2019']

['Case Status:', 'Open']

[]

['Involved Parties Information']

['Name:', 'MASON, CHRISTY']

['Address:', '13535 WAVERLY RD']

['City:', 'NEWBURGState:MDZip Code:20664']

['Name:', 'MCFARLANE, STEVE']

['Address:', '2165 PINEVIEW CT']

['City:', 'WALDORFState:MDZip Code:20602']

['Name:', 'WASHINGTON, DEBBIE']

['Address:', '2165 PINEVIEW CT']

['City:', 'WALDORFState:MDZip Code:20602']

['Document Information']

['File Date:', '11/19/2018']

['Filed By:', '']

['Document Name:', 'Complaint / Petition - Landlord Tenant']

['Comment:', '']

['File Date:', '12/10/2018']

['Filed By:', '']

['Document Name:', 'Landlord Tenant Disposition']

['Comment:', '']
{'Cour

In [93]:
# Selenium is bad at finding data in relation to headers, so this is Beautiful Soup finding all of
# the landlord names, which will help pick out the landlor info after the table data is scraped in the next step.
landlords = []
for h5 in soup.find_all('h5'):
    if h5.text == 'Landlord / Plaintiff':
        landlords.append(h5.find_next('span', 'Value').contents)
print(landlords)

[['MASON, CHRISTY']]


In [147]:
# Mine data
case_data = []
for tr in driver.find_elements_by_xpath('//table//tr'):
    tds = tr.find_elements_by_tag_name('td')
    case_data.append([td.text for td in tds])

case_clean = []
# Cleaning out header info
[case_clean.append(a) for a in case_data if len(a) == 2]
num_names = 0
repeated_info = ['Name:', 'Address:', 'City:']
for i in case_clean:
    if i[0] in repeated_info:
        i[0] = i[0] + str(num_names)
    num_names += 1
print(case_clean)
# Casting do dict for import into pandas
case_dict = {d[0]: d[1:] for d in case_clean}
print(case_dict)

[['Court System:', 'District Court For Charles County - Civil'], ['Location:', 'Charles'], ['Case Number:', 'D-042-LT-18-007202'], ['Title:', 'CHRISTY MASON vs. STEVE MCFARLANE, et al.'], ['Case Type:', 'Failure to Pay Rent'], ['Filing Date:', '08/26/2019'], ['Case Status:', 'Open'], ['Name:7', 'MASON, CHRISTY'], ['Address:8', '13535 WAVERLY RD'], ['City:9', 'NEWBURGState:MDZip Code:20664'], ['Name:10', 'MCFARLANE, STEVE'], ['Address:11', '2165 PINEVIEW CT'], ['City:12', 'WALDORFState:MDZip Code:20602'], ['Name:13', 'WASHINGTON, DEBBIE'], ['Address:14', '2165 PINEVIEW CT'], ['City:15', 'WALDORFState:MDZip Code:20602'], ['File Date:', '11/19/2018'], ['Filed By:', ''], ['Document Name:', 'Complaint / Petition - Landlord Tenant'], ['Comment:', ''], ['File Date:', '12/10/2018'], ['Filed By:', ''], ['Document Name:', 'Landlord Tenant Disposition'], ['Comment:', '']]
{'Court System:': ['District Court For Charles County - Civil'], 'Location:': ['Charles'], 'Case Number:': ['D-042-LT-18-00720

In [173]:
landlords.append(['MCFARLANE, STEVE'])

In [188]:
ll_contacts = {}
num_ll = 0
name_index = 0
list_keys = list(case_dict.keys())
for enum, key in enumerate(list_keys):
    if case_dict[key] in landlords:
        ll_contacts["LL"+str(num_ll)] = [case_dict[list_keys[enum]], case_dict[list_keys[enum+1]],case_dict[list_keys[enum+2]]]
            
print(ll_contacts)

{'LL0': [['MASON, CHRISTY'], ['13535 WAVERLY RD'], ['NEWBURGState:MDZip Code:20664']], 'LL1': [['MCFARLANE, STEVE'], ['2165 PINEVIEW CT'], ['WALDORFState:MDZip Code:20602']]}


In [251]:
ds  = pd.DataFrame([case_dict], columns=case_dict.keys())
case_data_df = pd.concat([case_data_df, ds], axis =0, sort=False)

In [252]:
ds

Unnamed: 0,Court System:,Location:,Case Number:,Title:,Case Type:,Filing Date:,Case Status:,Name:,Address:,City:,File Date:,Filed By:,Document Name:,Comment:
0,[District Court For Charles County - Civil],[Charles],[D-042-LT-19-002670],"[VINCENT BAILEY vs. BROOKE GRAY, et al.]",[Failure to Pay Rent],[08/05/2019],[Open],"[BAILEY, VINCENT]",[1628 PIN OAK DR],[WALDORFState:MDZip Code:20602],[08/05/2019],[],[Complaint / Petition - Landlord Tenant],[]


In [253]:
case_data_df

Unnamed: 0,Court System:,Location:,Case Number:,Title:,Case Type:,Filing Date:,Case Status:,Name:,Address:,City:,File Date:,Filed By:,Document Name:,Comment:
0,[District Court For Charles County - Civil],[Charles],[D-042-LT-18-007202],"[CHRISTY MASON vs. STEVE MCFARLANE, et al.]",[Failure to Pay Rent],[08/26/2019],[Open],"[WASHINGTON, DEBBIE]",[2165 PINEVIEW CT],[WALDORFState:MDZip Code:20602],[12/10/2018],[],[Landlord Tenant Disposition],[]
0,[District Court For Charles County - Civil],[Charles],[D-042-LT-18-007202],"[CHRISTY MASON vs. STEVE MCFARLANE, et al.]",[Failure to Pay Rent],[08/26/2019],[Open],"[WASHINGTON, DEBBIE]",[2165 PINEVIEW CT],[WALDORFState:MDZip Code:20602],[12/10/2018],[],[Landlord Tenant Disposition],[]
0,[District Court For Charles County - Civil],[Charles],[D-042-LT-19-002704],"[MD PROPERTY MANAGEMENT, LLC vs. BREANNA PARKE...",[Failure to Pay Rent],[08/06/2019],[Open],"[MD PROPERTY MANAGEMENT, LLC]",[14340 OLD MARLBORO PIKE],[UPPER MARLBOROState:MDZip Code:20772],[06/06/2019],[],[Petition - For Warrant of Restitution Filed],[]
0,[District Court For Charles County - Civil],[Charles],[D-042-LT-19-002670],"[VINCENT BAILEY vs. BROOKE GRAY, et al.]",[Failure to Pay Rent],[08/05/2019],[Open],"[BAILEY, VINCENT]",[1628 PIN OAK DR],[WALDORFState:MDZip Code:20602],[08/05/2019],[],[Complaint / Petition - Landlord Tenant],[]


Unnamed: 0,Court System:,Location:,Case Number:,Title:,Case Type:,Filing Date:,Case Status:,Name:,Address:,City:,File Date:,Filed By:,Document Name:,Comment:
0,District Court For Charles County - Civil,Charles,D-042-LT-18-007202,"CHRISTY MASON vs. STEVE MCFARLANE, et al.",Failure to Pay Rent,08/26/2019,Open,"WASHINGTON, DEBBIE",2165 PINEVIEW CT,WALDORFState:MDZip Code:20602,12/10/2018,,Landlord Tenant Disposition,


{'Location:': ['Charles'], 'Case Number:': ['D-042-LT-18-007202'], 'Title:': ['CHRISTY MASON vs. STEVE MCFARLANE, et al.'], 'Case Type:': ['Failure to Pay Rent'], 'Filing Date:': ['08/26/2019'], 'Case Status:': ['Open'], 'Name:': ['WASHINGTON, DEBBIE'], 'Address:': ['2165 PINEVIEW CT'], 'City:': ['WALDORFState:MDZip Code:20602'], 'File Date:': ['12/10/2018'], 'Filed By:': [''], 'Document Name:': ['Landlord Tenant Disposition'], 'Comment:': ['']}


In [106]:
d.keys()

dict_keys(['CaseSearch', 'Court System:', 'Location:', 'Case Number:', 'Title:', 'Case Type:', 'Filing Date:', 'Case Status:', 'Name:', 'Address:', 'City:', 'File Date:', 'Filed By:', 'Document Name:', 'Comment:'])

In [126]:
ouput = []
print(output)



['DEBBIE']


## Classify each row as a person (or business)

In [139]:
da.to_csv(os.path.join(save_dir,'for_Classification.csv'))

In [150]:
da = pd.read_csv(os.path.join(save_dir, 'classified1.csv'))

## Writing to order of click2mail template

In [179]:

output = []
# if is Person, parse First and Last name, else populate Organization
# if isPerson == 1:
    # First Name
output.append(d['Name:'][0].split(",")[1][1:])
# Last Name
output.append(d['Name:'][0].split(",")[0])
# else:
# Organization
output.append(d['Name:'][0])
# Address 1
output.append(d['Address:'][0])
# Address 2
# Address 3
# City
city = d['City:'][0].split('State:')[0]
output.append(city)
# State
state = d['City:'][0].split('State:')[1].split('Zip Code:')[0]
output.append(state)
# Zip
mail_zip = d['City:'][0].split('Zip Code:')[1]
output.append(mail_zip)
# Country_non-US
# N/A
print(output)


['DEBBIE', 'WASHINGTON', 'WASHINGTON, DEBBIE', '2165 PINEVIEW CT', 'WALDORF', 'MD', '20602']
