In [1]:
import parsel
import re
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# https://docs.scrapy.org/en/xpath-tutorial/topics/xpath-tutorial.html
# https://stackoverflow.com/questions/52677769/extract-text-from-div-class-with-scrapy
# https://stackoverflow.com/questions/293482/how-do-i-fix-wrongly-nested-unclosed-html-tags/293558
# https://stackoverflow.com/questions/614797/xpath-find-a-node-that-has-a-given-attribute-whose-value-contains-a-string
# https://developer.mozilla.org/en-US/docs/Web/XPath/Introduction_to_using_XPath_in_JavaScript
# https://lxml.de/xpathxslt.html
# https://lxml.de/parsing.html
# https://stackoverflow.com/questions/25221023/select-a-node-with-xpath-whose-child-node-contains-a-specific-inner-text
# https://stackoverflow.com/questions/103325/what-is-the-correct-xpath-for-choosing-attributes-that-contain-foo
# https://cssselect.readthedocs.io/en/latest/
# https://www.w3schools.com/jsref/met_document_queryselector.asp

In [3]:
# XPath selecting every <li> that has a direct child <span> whose text()
# contains " v " (the separator seen in case names such as "A v B").
# NOTE(review): the leading "//" makes this an absolute path, so it is
# evaluated against the whole document even when run on a nested selector.
_STR_XPATH_LI_SPAN = '//li[span[contains(text(), " v ")]]'
# Relative XPath: direct child <span> elements of the context node whose
# text() contains " v ".
_STR_CHILD_SPAN_WITH_V = "./span[contains(text(), ' v ')]"
# Namespace URI; it matches the default xmlns declared on the parsed
# document's <html> element. Not referenced by the cells visible here.
_STR_HTML_NS = 'http://www.w3.org/TR/REC-html40'

In [4]:
# Read the source document as raw bytes and parse it with a lenient parser.
# NOTE: the 'html5lib' parser is only available when the `html5lib`
# Python package is installed.
with open("test_out.html", "rb") as fh:
    raw_markup = fh.read()
    soup = BeautifulSoup(raw_markup, 'html5lib')

# Re-serialise the parsed tree as indented, well-formed HTML and hand that
# cleaned-up markup to parsel so it can be queried with XPath.
html = soup.prettify()
doc = parsel.Selector(text=html)

# Keep the first matching <li> as a sample element for the cells below.
li_matches = doc.xpath(_STR_XPATH_LI_SPAN)
first_elem = li_matches[0]

# Preview the start of the cleaned-up document.
print(html[:600])

<html xmlns="http://www.w3.org/TR/REC-html40" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:o="urn:schemas-microsoft-com:office:office">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="OneNote.File" name="ProgId"/>
  <meta content="Microsoft OneNote 15" name="Generator"/>
  <link href="RemediesCribSheet.htm" id="Main-File" rel="Main-File"/>
  <link href="RemediesCribSheet_files/filelist.xml" rel="File-List"/>
 </head>
 <body lang="en-GB" style="font-family:Calibri;font-size:11.0pt">
  <div style="direction:ltr;border-width:100%">
   


In [5]:
def extract_text(elem):
    """Return all text found beneath ``elem`` as one tidy string.

    The text nodes selected by ``.//text()`` are joined with single spaces,
    leading/trailing whitespace is stripped, and every remaining run of
    whitespace (including line breaks) is collapsed to one space.

    Args:
        elem: an object exposing ``.xpath(query)`` whose result has
            ``.extract()`` returning a list of strings (e.g. a parsel
            selector).

    Returns:
        The whitespace-normalised text as a ``str`` (empty when there is
        no text).
    """
    fragments = elem.xpath(".//text()").extract()
    joined = ' '.join(fragments).strip()
    # Collapse the whitespace runs left inside the joined text.
    return re.sub(r'[\r\n\s]+', ' ', joined)

def extract_spans_with_v(elem):
    """Return the cleaned text of each child span matched under ``elem``.

    The spans are selected with ``_STR_CHILD_SPAN_WITH_V`` and each one is
    passed through ``extract_text``.

    Args:
        elem: selector to query.

    Returns:
        A list of strings, in the order the XPath query yields the spans;
        empty when nothing matches.
    """
    span_texts = []
    for span in elem.xpath(_STR_CHILD_SPAN_WITH_V):
        span_texts.append(extract_text(span))
    return span_texts

def extract_li_and_spans_with_v(top_elem):
    """Tabulate every case-name span found beneath ``top_elem``.

    Each ``<li>`` matched by ``_STR_XPATH_LI_SPAN`` contributes one row per
    matching child span: the span's text becomes ``case_name`` and the full
    text of the enclosing ``<li>`` becomes ``details``.

    Args:
        top_elem: selector whose descendants are searched. Passing the
            document root searches the whole document.

    Returns:
        A ``pandas.DataFrame`` with columns ``case_name`` and ``details``;
        it has no rows when nothing matches.
    """
    # An XPath beginning with "//" is resolved against the whole document
    # even when it is run on a nested selector, which would silently ignore
    # `top_elem`. Prefixing "." anchors the search at `top_elem`; for the
    # document root the matches are the same as before.
    li_xpath = _STR_XPATH_LI_SPAN
    if li_xpath.startswith('//'):
        li_xpath = '.' + li_xpath

    rows = []
    for li in top_elem.xpath(li_xpath):
        details = extract_text(li)
        # One row per case name; the <li> text is shared by all of them.
        rows.extend((case_name, details) for case_name in extract_spans_with_v(li))
    return pd.DataFrame(data=rows, columns=["case_name", "details"])

In [6]:
# Sanity check: whitespace-normalised text of the first matching <li>.
extract_text(first_elem)

"By signature L'Estrange v Graucob"

In [7]:
# Sanity check: texts of the " v " spans directly under the first matching <li>.
extract_spans_with_v(first_elem)

["L'Estrange v Graucob"]

In [8]:
# Build and display the (case_name, details) table for the parsed document.
extract_li_and_spans_with_v(doc)

Unnamed: 0,case_name,details
0,L'Estrange v Graucob,By signature L'Estrange v Graucob
1,Parker v South Eastern Railways,By reasonable notice Parker v South Eastern Ra...
2,McCutcheon v MacBrayne,Course of dealing McCutcheon v MacBrayne
3,Cavendish Square Holding v Makdessi,The courts would be guided by the test in the ...
4,Bunge SA v Nidera,"Damages should be compensatory, and when asses..."
5,Bunge SA v Nidera,Bunge SA v Nidera also confirmed the compensat...
6,Golden Strait Corporation v Nippon Yusen Kubis...,Golden Strait Corporation v Nippon Yusen Kubis...
7,Birse Constructions v Eastern Telegraph,Cost of cure is often used to calculate the ex...
8,McGlinn v Waltham Contractors,Cost of cure would not be awarded for defectiv...
9,Ruxley Construction v Forsyth,Cost of cure would not be awarded for defectiv...


In [9]:
# Write the extracted table to an Excel workbook.
# Make sure `openpyxl` is installed before running this cell. Reference:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html
cases_table = extract_li_and_spans_with_v(doc)
cases_table.to_excel("test_output.xlsx")