# Scraping data from MD Case Search

In [404]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time, datetime, os
from pathlib import Path
import string

## Create folder with today's date
today = datetime.date.today()  
todaystr = today.isoformat()   
dir_path = os.getcwd()
save_dir = dir_path + "/" + todaystr
if not os.path.exists(save_dir):
    os.mkdir(save_dir)


## Set the target save location and Firefox preferences to auto download csv files
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", save_dir)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")

## Create the Firefox session
driver = webdriver.Firefox(firefox_profile=profile)
## Get past the disclaimer splash screen
driver.get("http://casesearch.courts.state.md.us/casesearch/inquiry-index.jsp")
check = driver.find_element_by_name("disclaimer")
check.click()
continue_button = driver.find_element_by_name("action")
continue_button.click()

assert "No results found." not in driver.page_source



#### Remember to insert dates

In [409]:
## The search page won't let you search for all cases, need to do 26 searches (A-Z)
county_list = ['Charles', "Saint", 'Calvert']

for county in county_list:
#     for last_initial in string.ascii_lowercase:
    for last_initial in ['y']:#, 'c', 'd']:
        try:
            ## Enter search criteria
            last_name = driver.find_element_by_name("lastName")
            last_name.clear()
            last_name.send_keys(last_initial)
            driver.find_element_by_xpath("//select[@name='partyType']/option[text()='Plaintiff']").click()
            driver.find_element_by_xpath("//input[@name='site' and @value='CIVIL']").click()
            driver.find_element_by_xpath("//input[@name='courtSystem' and @value='D']").click()
            driver.find_element_by_xpath("//select[@name='countyName']/option[contains(text(), '%s')]" % county).click()
            last_name = driver.find_element_by_name("filingStart")
            last_name.clear()
            last_name.send_keys("6/1/2019")  ## Turn to variable
            last_name = driver.find_element_by_name("filingEnd")
            last_name.clear()
            last_name.send_keys("9/30/2019")  ## Turn to variable
            driver.find_element_by_xpath("//input[@name='action' and @value='Search']").click()

            element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.LINK_TEXT, "CSV"))).click() ## need to add logic for when X isn't found
            driver.back()
            time.sleep(6)
        except:
            print('No cases for plantiffs starting with ', last_initial)
    

# Concatenate csv exports to dataframe

In [410]:
os.chdir(save_dir)
os.getcwd()

'/Users/joshuahogge/Projects/tiredLandlords/2019-10-06'

In [414]:
import csv
import glob

# Adding a header to each file
path = save_dir
all_files = glob.glob(path + "/*.csv")

for filename in all_files:
    with open(filename, newline='') as f:
        r = csv.reader(f)
        data = [line for line in r]
    with open(filename,'w',newline='') as f:
        w = csv.writer(f)
        w.writerow(['Case Number', 'Name', 'Date of Birth', 'Party Type', 'Court', 
                        'Case Type', 'Case Status', 'Filing Date', 'Case Caption'])
        w.writerows(data)

In [415]:
import pandas as pd
# Concatenate csv to single dataframe
df = pd.concat([pd.read_csv(f, index_col=0) for f in all_files])
df.shape

(30, 8)

# Filtering down to the good stuff

In [416]:
df.shape

(30, 8)

In [417]:
da = df[df['Case Type'] == 'FTPR'] # Failure to Pay Rent
da.drop_duplicates(subset="Name", inplace=True)
da.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Name,Date of Birth,Party Type,Court,Case Type,Case Status,Filing Date,Case Caption
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D042LT19002926,"Yasmine, Nuzhat",,Plaintiff,Charles County District Court,FTPR,Closed,08/07/2019,"NUZHAT YASMINE vs. VANESSA HARDEN, et al."
D041LT19000404,Yardlely Hills,,Plaintiff,Calvert County District Court,FTPR,Closed,06/05/2019,"YARDLELY HILLS vs. MONICA BROWN, et al."
D041LT19000492,Yardley Hills II,,Plaintiff,Calvert County District Court,FTPR,Closed,07/03/2019,"YARDLEY HILLS II vs. CALVIN BELL, Sr."
D041LT19000491,Yardley Hills I,,Plaintiff,Calvert County District Court,FTPR,Closed,07/03/2019,
D041LT19000493,Yardley Hills II,,Plaintiff,Calvert County District Court,FTPR,Closed,07/03/2019,
D041LT19000490,Yardley Hills Partnership I,,Plaintiff,Calvert County District Court,FTPR,Closed,07/03/2019,
D041LT19000531,Yardley Hills Partnership,,Plaintiff,Calvert County District Court,FTPR,Closed,07/10/2019,
D041LT19000397,Yardley Hills,,Plaintiff,Calvert County District Court,FTPR,Closed,06/05/2019,YARDLEY HILLS vs. SHERVON MACKALL


In [418]:
da.shape

(8, 8)

# Getting case data

In [429]:
case_data_df = pd.DataFrame()
ll_df = pd.DataFrame()

for index, row in da.iterrows():
    # Enter case data on query page
    court = row['Court'].split(' ')[0]
    driver.find_element_by_xpath("//select[@name='locationCode']/option[contains(text(), '%s')]" % court).click()
    case_number = driver.find_element_by_name("caseId")
    case_number.clear()
    case_number.send_keys(index) 
    try:
        driver.find_element_by_xpath("//input[@name='action' and @value='Get Case']").click()
    except:
        print('Case number %s not found' % str(da['Case Number'])) 
        
    # Selenium is bad at finding data in relation to headers, so this is Beautiful Soup finding all of
    # the landlord names, which will help pick out the landlor info after the table data is scraped in the next step.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
#     print('clearing landlords')
    landlords = []
    for h5 in soup.find_all('h5'):
        if h5.text == 'Landlord / Plaintiff' or h5.text == "Landlord's Agent":
            landlords.append(h5.find_next('span', 'Value').contents)
#     print('landlord list')
#     print(landlords)
    
    # Mine data
    # This pulls all the info from the table on the page
    case_data = []
    for tr in driver.find_elements_by_xpath('//table//tr'):
        tds = tr.find_elements_by_tag_name('td')
        case_data.append([td.text for td in tds])

    case_clean = []
    # Cleaning out website header info
    [case_clean.append(a) for a in case_data if len(a) == 2]

    # Giving each item a unique suffix so writing to dictionary doesn't overwrite repeated info
    num_names = 0
    repeated_info = ['Name:', 'Address:', 'City:']
    for i in case_clean:
        if i[0] in repeated_info:
            i[0] = i[0] + str(num_names)
        num_names += 1

    # Casting do dict for import into pandas
    case_dict = {}
    case_dict = {d[0]: d[1:] for d in case_clean}
#     print(case_dict)

    # Grabbing landlord contact data     
    ll_contacts = {}
    num_ll = 0
    name_index = 0
    list_keys = list(case_dict.keys())
    for enum, key in enumerate(list_keys):
        if case_dict[key] in landlords:
            ll_contacts["LL"+str(num_ll)] = [case_dict[list_keys[enum]], case_dict[list_keys[enum+1]],case_dict[list_keys[enum+2]]]
            num_ll +=1    
#     print(ll_contacts)
    
    # Writing contacts to big landlord list
    dl  = pd.DataFrame.from_dict(ll_contacts, orient='index', columns=['name', 'address', 'city'])
#     print(dl)
    ll_df = pd.concat([ll_df, dl], axis=0, ignore_index=True, sort=False)
#     print(ll_df)
    
    driver.back()
    time.sleep(1)

                  name                   address  \
LL0    [MASOOD, AHSAN]  [2207 S SANCTUARY DRIVE]   
LL1  [YASMINE, NUZHAT]     [2207 S SANCTUARY DR]   

                                   city  
LL0  [NEW BERLINState:WIZip Code:53151]  
LL1  [NEW BERLINState:WIZip Code:53151]  
                name                        address  \
LL0  [WRENN, ALYSSA]  [700 YARDLELY DRIVE, SUITE A]   

                                         city  
LL0  [PRINCE FREDERICKState:MDZip Code:20678]  
                name                       address  \
LL0  [WRENN, ALYSSA]  [700 YARDLEY DRIVE, SUITE A]   

                                         city  
LL0  [PRINCE FREDERICKState:MDZip Code:20678]  
                name                       address  \
LL0  [WRENN, ALYSSA]  [700 YARDLEY DRIVE, SUITE A]   

                                         city  
LL0  [PRINCE FREDERICKState:MDZip Code:20678]  
                name                       address  \
LL0  [WRENN, ALYSSA]  [700 YARDLEY DRIVE, SUIT

## Classify each row as a person (or business)

In [430]:
ll_df.head()
ll_str = ll_df

ll_str['name'] = ll_str.name.apply(', '.join)
ll_str['address'] = ll_str.address.apply(', '.join)
ll_str['city'] = ll_str.city.apply(', '.join)
ll_str.drop_duplicates(subset="address", inplace=True)
ll_str


Unnamed: 0,name,address,city
0,"MASOOD, AHSAN",2207 S SANCTUARY DRIVE,NEW BERLINState:WIZip Code:53151
1,"YASMINE, NUZHAT",2207 S SANCTUARY DR,NEW BERLINState:WIZip Code:53151
2,"WRENN, ALYSSA","700 YARDLELY DRIVE, SUITE A",PRINCE FREDERICKState:MDZip Code:20678


In [431]:
ll_str.to_csv(os.path.join(save_dir,'for_Classification.csv'))


Open forClassification.csv, add a column 'isPerson', mark 1 for a person's name, 0 for a business name. Export to csv (with column names) as classified

In [434]:
dr = pd.read_csv(os.path.join(save_dir, 'classified.csv'), header=0, index_col=0)
dr.head()

Unnamed: 0,name,isPerson,address,city
0,"MASOOD, AHSAN",1,2207 S SANCTUARY DRIVE,NEW BERLINState:WIZip Code:53151
1,"YASMINE, NUZHAT",1,2207 S SANCTUARY DR,NEW BERLINState:WIZip Code:53151
2,"WRENN, ALYSSA",1,"700 YARDLELY DRIVE, SUITE A",PRINCE FREDERICKState:MDZip Code:20678


# Formatting and writing to Click2Mail format

In [435]:


def person_first_name(isPerson, name):
    if isPerson == 1:
        return name.split(",")[1][1:]
    else:
        return name
    
def person_last_name(isPerson, name):
    if isPerson == 1:
        return name.split(",")[0]
    else:
        return ''

    
dd = pd.DataFrame()
dd['First_name'] = dr.apply(lambda x: person_first_name(x['isPerson'], x['name']), axis=1) 
dd['Last_name'] = dr.apply(lambda x: person_last_name(x['isPerson'], x['name']), axis=1) 

                                              
# dd['Last_name']
dd['Organization'] = ''
dd['Address1'] = dr['address']
dd['Address2'] = ''
dd['Address3'] = ''
dd['City'] = dr['city'].str.split('State:', expand=True)[0]
dd['State'] = dr['city'].str.split('State:', expand=True)[1].str.split('Zip Code:', expand=True)[0]
dd['Zip'] = dr['city'].str.split('Zip Code:', expand=True)[1]
dd['Country_non-US'] = ''

dd.to_csv(os.path.join(save_dir, 'tiredLandlords.csv'))