# Scraping data from MD Case Search

In [59]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time, datetime, os
from pathlib import Path
import string

## Create folder with today's date
today = datetime.date.today()  
todaystr = today.isoformat()   
dir_path = os.getcwd()
save_dir = dir_path + "/" + todaystr
if not os.path.exists(save_dir):
    os.mkdir(save_dir)


## Set the target save location and Firefox preferences to auto download csv files
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", save_dir)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")

## Create the Firefox session
driver = webdriver.Firefox(firefox_profile=profile)
## Get past the disclaimer splash screen
driver.get("http://casesearch.courts.state.md.us/casesearch/inquiry-index.jsp")
check = driver.find_element_by_name("disclaimer")
check.click()
continue_button = driver.find_element_by_name("action")
continue_button.click()

assert "No results found." not in driver.page_source



#### Remember to insert dates

In [102]:
## The search page won't let you search for all cases, need to do 26 searches (A-Z)
county_list = ['Charles County', "Saint Mary's County", 'Calvert County']

for county in county_list
    for last_initial in string.ascii_lowercase:
        try:
            ## Enter search criteria
            last_name = driver.find_element_by_name("lastName")
            last_name.clear()
            last_name.send_keys(last_initial)
            driver.find_element_by_xpath("//select[@name='partyType']/option[text()='Plaintiff']").click()
            driver.find_element_by_xpath("//input[@name='site' and @value='CIVIL']").click()
            driver.find_element_by_xpath("//input[@name='courtSystem' and @value='D']").click()
            driver.find_element_by_xpath("//select[@name='countyName']/option[text()=" + county + "]").click()
            last_name = driver.find_element_by_name("filingStart")
            last_name.clear()
            last_name.send_keys("6/1/2019")  ## Turn to variable
            last_name = driver.find_element_by_name("filingEnd")
            last_name.clear()
            last_name.send_keys("9/30/2019")  ## Turn to variable
            driver.find_element_by_xpath("//input[@name='action' and @value='Search']").click()

            element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.LINK_TEXT, "CSV"))).click() ## need to add logic for when X isn't found
            driver.back()
            time.sleep(15)
        except:
            print('No cases for plantiffs starting with ', last_initial)
    

TimeoutException: Message: 


# Concatenate csv exports to dataframe

In [6]:
os.chdir(save_dir)
os.getcwd()

'/Users/joshuahogge/Projects/tiredLandlords/2019-09-24'

In [195]:
import csv
import glob

# Adding a header to each file
path = r'/Users/joshuahogge/Projects/tiredLandlords/2019-09-24' # use your path
all_files = glob.glob(path + "/*.csv")

for filename in all_files:
    with open(filename, newline='') as f:
        r = csv.reader(f)
        data = [line for line in r]
    with open(filename,'w',newline='') as f:
        w = csv.writer(f)
        w.writerow(['Case Number', 'Name', 'Date of Birth', 'Party Type', 'Court', 
                        'Case Type', 'Case Status', 'Filing Date', 'Case Caption'])
        w.writerows(data)

In [196]:
import pandas as pd
# Concatenate csv to single dataframe
df = pd.concat([pd.read_csv(f, index_col=0) for f in all_files])
df.shape

(2151, 8)

# Filtering down to the good stuff

In [197]:
df.shape

(2151, 8)

In [271]:
da = df[df['Case Type'] == 'FTPR'] # Failure to Pay Rent
da.drop_duplicates(subset="Name", inplace=True)
da.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Name,Date of Birth,Party Type,Court,Case Type,Case Status,Filing Date,Case Caption
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D042LT18007202,"Mason, Christy",,Plaintiff,Charles County District Court,FTPR,Open,08/26/2019,"CHRISTY MASON vs. STEVE MCFARLANE, et al."
D042LT19002704,"Md Property Management, Llc",,Plaintiff,Charles County District Court,FTPR,Open,08/06/2019,"MD PROPERTY MANAGEMENT, LLC vs. BREANNA PARKER..."
D042LT19002670,"Bailey, Vincent",,Plaintiff,Charles County District Court,FTPR,Open,08/05/2019,"VINCENT BAILEY vs. BROOKE GRAY, et al."
D042LT19002042,"Banks, Curtis",,Plaintiff,Charles County District Court,FTPR,Closed,08/02/2019,CURTIS BANKS vs. KENDALL BROOKS
D042LT19002626,"Beckham, Zachriell",,Plaintiff,Charles County District Court,FTPR,Closed,08/08/2019,"ZACHRIELL BECKHAM vs. STEPHANIE VAUGHN, et al."
D042LT19002674,Beeren & Barry Investments,,Plaintiff,Charles County District Court,FTPR,Closed,08/05/2019,BEEREN & BARRY INVESTMENTS vs. LADONNA WILLIAM...
D042LT19002940,"Bohrer, Mark",,Plaintiff,Charles County District Court,FTPR,Closed,08/08/2019,MARK BOHRER vs. AMY BROWN
D042LT19002042,"Newman, Lydia",,Plaintiff,Charles County District Court,FTPR,Closed,08/02/2019,CURTIS BANKS vs. KENDALL BROOKS
D042LT19004160,"Scott, Mary Coleman",,Plaintiff,Charles County District Court,FTPR,Closed,08/28/2019,"MARY SCOTT vs. ASHLEY KNIGHT, et al."
D042LT19001594,"Sheffield Greens A Partments, Llc",,Plaintiff,Charles County District Court,FTPR,Closed,07/30/2019,"SHEFFIELD GREENS A PARTMENTS, LLC vs. JELANI H..."


In [272]:
da.shape

(52, 8)

# Getting case data

In [285]:
case_data_df = pd.DataFrame()
ll_df = pd.DataFrame()

for index, row in da[:12].iterrows():
    # Enter case data on query page
    court = row['Court'].split(' ')[0]
    driver.find_element_by_xpath("//select[@name='locationCode']/option[contains(text(), '%s')]" % court).click()
    case_number = driver.find_element_by_name("caseId")
    case_number.clear()
    case_number.send_keys(index) 
    try:
        driver.find_element_by_xpath("//input[@name='action' and @value='Get Case']").click()
    except:
        print('Case number %s not found' % str(da['Case Number'])) 
        
    # Selenium is bad at finding data in relation to headers, so this is Beautiful Soup finding all of
    # the landlord names, which will help pick out the landlor info after the table data is scraped in the next step.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    landlords = []
    for h5 in soup.find_all('h5'):
        if h5.text == 'Landlord / Plaintiff':
            landlords.append(h5.find_next('span', 'Value').contents)
#     print(landlords)
    
    # Mine data
    case_data = []
    for tr in driver.find_elements_by_xpath('//table//tr'):
        tds = tr.find_elements_by_tag_name('td')
        case_data.append([td.text for td in tds])

    case_clean = []
    # Cleaning out website header info
    [case_clean.append(a) for a in case_data if len(a) == 2]
    num_names = 0
    repeated_info = ['Name:', 'Address:', 'City:']
    
    # Giving each item a unique suffix so writing to dictionary doesn't overwrite repeated info
    for i in case_clean:
        if i[0] in repeated_info:
            i[0] = i[0] + str(num_names)
        num_names += 1

    # Casting do dict for import into pandas
    case_dict = {}
    case_dict = {d[0]: d[1:] for d in case_clean}
#     print(case_dict)

    # Grabbing landlord contact data     
    ll_contacts = {}
    num_ll = 0
    name_index = 0
    list_keys = list(case_dict.keys())
    for enum, key in enumerate(list_keys):
        if case_dict[key] in landlords:
            ll_contacts["LL"+str(num_ll)] = [case_dict[list_keys[enum]], case_dict[list_keys[enum+1]],case_dict[list_keys[enum+2]]]
            num_ll +=1    
#     print(ll_contacts)
    
    # Writing contacts to big landlord list
    dl  = pd.DataFrame.from_dict(ll_contacts, orient='index', columns=['name', 'address', 'city'])
    ll_df = pd.concat([ll_df, dl], axis=0, ignore_index=True, sort=False)
    
    
    driver.back()
    time.sleep(1)

## Classify each row as a person (or business)

In [362]:
ll_df.head()
# print(type(ll_df['name'][0][0]))
ll_str = ll_df

ll_str['name'] = ll_str.name.apply(', '.join)
ll_str['address'] = ll_str.address.apply(', '.join)
ll_str['city'] = ll_str.city.apply(', '.join)
ll_str.head()

Unnamed: 0,name,address,city
0,"MASON, CHRISTY",13535 WAVERLY RD,NEWBURGState:MDZip Code:20664
1,"MD PROPERTY MANAGEMENT, LLC",14340 OLD MARLBORO PIKE,UPPER MARLBOROState:MDZip Code:20772
2,"BAILEY, VINCENT",1628 PIN OAK DR,WALDORFState:MDZip Code:20602
3,"BANKS, CURTIS",513 CORAL KEY PL,NEWPORT NEWSState:VAZip Code:23606
4,"NEWMAN, LYDIA",10748 HOLLAWAY DR,UPPER MARLBOROState:MDZip Code:20772


In [363]:
ll_df.to_csv(os.path.join(save_dir,'for_Classification.csv'))


In [364]:
dr = pd.read_csv(os.path.join(save_dir, 'for_Classification.csv'), header=0, index_col=0)

In [365]:
dr.head()

Unnamed: 0,name,isPerson,address,city
0,"MASON, CHRISTY",1,13535 WAVERLY RD,NEWBURGState:MDZip Code:20664
1,"MD PROPERTY MANAGEMENT, LLC",0,14340 OLD MARLBORO PIKE,UPPER MARLBOROState:MDZip Code:20772
2,"BAILEY, VINCENT",1,1628 PIN OAK DR,WALDORFState:MDZip Code:20602
3,"BANKS, CURTIS",1,513 CORAL KEY PL,NEWPORT NEWSState:VAZip Code:23606
4,"NEWMAN, LYDIA",1,10748 HOLLAWAY DR,UPPER MARLBOROState:MDZip Code:20772


In [366]:
dr.to_dict('index')

{0: {'name': 'MASON, CHRISTY',
  'isPerson': 1,
  'address': '13535 WAVERLY RD',
  'city': 'NEWBURGState:MDZip Code:20664'},
 1: {'name': 'MD PROPERTY MANAGEMENT, LLC',
  'isPerson': 0,
  'address': '14340 OLD MARLBORO PIKE',
  'city': 'UPPER MARLBOROState:MDZip Code:20772'},
 2: {'name': 'BAILEY, VINCENT',
  'isPerson': 1,
  'address': '1628 PIN OAK DR',
  'city': 'WALDORFState:MDZip Code:20602'},
 3: {'name': 'BANKS, CURTIS',
  'isPerson': 1,
  'address': '513 CORAL KEY PL',
  'city': 'NEWPORT NEWSState:VAZip Code:23606'},
 4: {'name': 'NEWMAN, LYDIA',
  'isPerson': 1,
  'address': '10748 HOLLAWAY DR',
  'city': 'UPPER MARLBOROState:MDZip Code:20772'},
 5: {'name': 'BECKHAM, ZACHRIELL',
  'isPerson': 1,
  'address': '10727 ESPRET PL',
  'city': 'WHITE PLAINSState:MDZip Code:20695'},
 6: {'name': 'BEEREN & BARRY INVESTMENTS',
  'isPerson': 0,
  'address': '9900-E GREENBLET RD STE 199',
  'city': 'LANHAMState:MDZip Code:20706'},
 7: {'name': 'BOHRER, MARK',
  'isPerson': 1,
  'address'

## Writing to order of click2mail template

In [369]:

output = []
# if is Person, parse First and Last name, else populate Organization
if dr['isPerson'][0] == 1:
    # First Name
    output.append(dr['name'][0].split(",")[1][1:])
    # Last Name
    output.append(dr['name'][0].split(",")[0])
else:
    output.append(dr['name'][0])

# Address 1
output.append(dr['address'][0])
# Address 2
# Address 3
# City
city = dr['city'][0].split('State:')[0]
output.append(city)
# State
state = dr['city'][0].split('State:')[1].split('Zip Code:')[0]
output.append(state)
# Zip
mail_zip = dr['city'][0].split('Zip Code:')[1]
output.append(mail_zip)
# Country_non-US
# N/A
print(output)


['CHRISTY', 'MASON', '13535 WAVERLY RD', 'NEWBURG', 'MD', '20664']
