Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
113 lines (84 sloc) 4.54 KB
#----- Regular expression to match a DOI
# List the lines of file.html that consist of a single left-aligned table cell
# whose whole content is a DOI. The dot after "10" is escaped so that it only
# matches a literal dot, and the suffix class accepts lower-case letters too,
# because DOIs are case-insensitive.
grep -P '^\s*<td align="left">10\.\d{4,9}/[-._;()/:A-Za-z0-9]+</td>\s*$' file.html
#----- sed
The sed option r stands for extended regular expressions, and i means that the replacements are written to the original file. % replaces the default forward slash in the substitute command (easier because it doesn't interfere with the slash in the closing HTML element). Parentheses wrap matching strings into groups which can be back-referenced later on, so \1 back-references the 1st match, \2 the 2nd, etc.
# Two substitutions, applied in place to file.html:
#   1. insert the Altmetric embed script right after <head>;
#   2. append a badge placeholder to every DOI cell. The placeholder repeats
#      the DOI (group \2) in its data-doi attribute, since that attribute is
#      what tells the embed script which article the badge is for.
sed -ri 's%<head>%<head>\n\n\t<script type="text/javascript" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js"></script>%g;s%(<td align="left">)(10\.[0-9]+/[^<]+)(</td>)%\1\2 <div class="altmetric-embed" data-badge-type="donut" data-doi="\2"></div>\3%g' file.html
#----- awk
Compared to sed, awk code looks more verbose because back-references in the replacement string need double backslashes.
# Requires GNU awk: gensub() and \s are gawk extensions.
# Group 1 is the opening DOI cell tag (with leading whitespace), group 2 the
# DOI itself and group 3 the closing tag. The replacement re-emits all three
# and inserts a badge placeholder carrying the DOI before the closing tag.
# Lines that do not match are printed unchanged.
awk '{ print gensub(/^(\s*<td align="left">)(10\.[0-9]{4,9}\/[^<]+)(<\/td>)/, "\\1\\2 <div class=\"altmetric-embed\" data-badge-type=\"donut\" data-doi=\"\\2\"></div>\\3", "g") }' file.html
#----- XSLT
First, remove the DOCTYPE, give every br a closing tag, and add the Altmetric JS:
# Prepare file.html for the XSLT step, editing it in place:
#   1. drop the HTML 4.0 Transitional DOCTYPE declaration,
#   2. give every <br> a closing tag,
#   3. insert the Altmetric embed script right after <head>.
sed -i \
    -e 's%<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">%%g' \
    -e 's%<br>%<br></br>%g' \
    -e 's%<head>%<head>\n\n\t<script type="text/javascript" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js"></script>%g' \
    file.html
parse.xsl contains an identity transformation template and a template that matches the DOI cell (td) of every row except the header row.
<?xml version="1.0"?>
<xsl:stylesheet
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    version="2.0">

  <xsl:output method="html" indent="yes"/>

  <!--
    Position of the DOI column within a row.
    In the first row of the table, find the cell whose text starts with
    'Electronic version'; its position is the number of td siblings that
    precede it, plus one.
  -->
  <xsl:variable name="doi-column" as="xs:integer"
      select="count(/html/body/table/tr[1]//td[starts-with(., 'Electronic version')]/preceding-sibling::td) + 1"/>

  <!-- Identity transformation: copy every attribute and node as it is. -->
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>

  <!--
    DOI cell of every row except the first (header) row: copy the cell and
    its content, then append the Altmetric placeholder div. The string value
    of the cell is used as the data-doi attribute.
  -->
  <xsl:template match="tr[position() &gt; 1]/td[position() = $doi-column]">
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
      <xsl:element name="div">
        <xsl:attribute name="class" select="'altmetric-embed'"/>
        <xsl:attribute name="data-badge-type" select="'donut'"/>
        <xsl:attribute name="data-doi" select="."/>
      </xsl:element>
    </xsl:copy>
  </xsl:template>

</xsl:stylesheet>
Finally, transform with an XSLT engine
# Run the Saxon jar with file.html as the source document and parse.xsl as the
# stylesheet; standard output is redirected into file_transformed.html.
java -jar ~/saxonee/saxon9ee.jar file.html parse.xsl > file_transformed.html
#----- Javascript
When the code below is included in the head of the HTML page inside a script element (and the Altmetric Javascript link is there too), it runs as soon as the page has been loaded: all rows of the first table element are traversed one by one, and a new div element with the needed attributes is appended to the third cell of the row, i.e. the one with the DOI.
// Once the DOM has been parsed, append an Altmetric badge placeholder to the
// DOI cell (the third <td>) of every row of the first table on the page.
document.addEventListener('DOMContentLoaded', () => {
  const DOI_CELL_INDEX = 2;            // zero-based: the third cell of a row
  const DOI_PATTERN = /^10\.\d{4,9}\//; // "10.", a 4-9 digit prefix, a slash

  const table = document.getElementsByTagName("table")[0];
  if (!table) {
    return; // no table on the page, nothing to decorate
  }

  for (const row of table.rows) {
    const cell = row.getElementsByTagName("td")[DOI_CELL_INDEX];
    if (!cell) {
      continue; // row has fewer than three <td> cells (e.g. a <th> header)
    }

    const doi = cell.innerText.trim();
    if (!DOI_PATTERN.test(doi)) {
      continue; // header row or a cell that does not hold a DOI
    }

    const badge = document.createElement("div");
    badge.setAttribute("class", "altmetric-embed");
    badge.setAttribute("data-badge-type", "donut");
    badge.setAttribute("data-doi", doi);
    cell.appendChild(badge);
  }
});
#------- Python
# Append an Altmetric badge placeholder to the DOI cell of every table row of
# file.html and write the result to file_updated.html.

DOI_CELL_OPEN = '<td align="left">'  # opening tag of the cells of interest
CELL_CLOSE = "</td>"
DOI_LINE_IN_ROW = 3  # the DOI cell is the third line after a "<tr>" line


def add_badge_to_cell(line):
    """Return *line* with a badge div inserted just before the cell's ``</td>``.

    The DOI is the text between the opening ``<td align="left">`` tag and the
    next ``<``. The line is returned unchanged when the cell is not closed on
    the same line or when it holds no text, so no markup is ever dropped.
    """
    head, opener, rest = line.partition(DOI_CELL_OPEN)
    content, closing, tail = rest.partition(CELL_CLOSE)
    if not closing:
        return line
    doi = content.split("<", 1)[0].strip()
    if not doi:
        return line
    badge = (
        "<div class='altmetric-embed' data-badge-type='donut' data-doi="
        + '"' + doi + '"' + "></div>"
    )
    # Keep the original cell (tag, attributes and DOI text) and only add the
    # badge in front of the closing tag.
    return head + opener + content + badge + closing + tail


def add_altmetric_badges(lines):
    """Yield every line of *lines*, adding a badge to the DOI cell of each row.

    Lines are counted from the most recent line containing ``<tr>``; the line
    with that count equal to ``DOI_LINE_IN_ROW`` is treated as the DOI cell,
    provided it contains a left-aligned ``<td>``. All other lines are yielded
    unchanged.
    """
    line_in_row = 0
    for line in lines:
        line_in_row += 1
        if "<tr>" in line:
            line_in_row = 0
        if DOI_CELL_OPEN in line and line_in_row == DOI_LINE_IN_ROW:
            yield add_badge_to_cell(line)
        else:
            yield line


if __name__ == "__main__":
    with open("file.html", "r") as fin, open("file_updated.html", "w") as fout:
        fout.writelines(add_altmetric_badges(fin))
You can’t perform that action at this time.