Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1374 lines (1039 sloc) 52.6 KB
#!/bin/bash
export LANG=C.UTF-8
# sed_sentence_chunker.sh
# Created: 2017-Jul-20 | Victoria Stuart | "mail"..@t.."VictoriasJourney.com"
# Last updated: 2017-Dec-30
# ----------------------------------------------------------------------------
# local: /mnt/Vancouver/Programming/scripts/sed_sentence_chunker/sed_sentence_chunker.sh
# GitHub: https://github.com/victoriastuart/biomedical-sentence-splitter
# ============================================================================
# USAGE:
# ======
# ./sed_sentence_chunker.sh
# bash sed_sentence_chunker.sh
# This script processes text files in the "input/" directory, and outputs to the
# "output/" directory.
# ============================================================================
# PYTHON SCRIPT USAGE:
# ====================
# To use this "sed_sentence_chunker.sh" bash script in a Python script; run
# this script in a directory that contains your text/input files in an "input/"
# directory. Note that you must also (manually) create an "output/" directory.
# ============================================================================
# APPROACH:
# =========
# 1. Preprocessing
# 2. Split sentences
# 3. Postprocessing
# ============================================================================
# TWO VARIATIONS OF THIS SCRIPT:
# ==============================
# If desired you can edit this script for alternative runtime options, as
# summarized here.
# ----------------------------------------------------------------------------
# SCRIPT VARIANT 1: specify input, output files on the command line.
# ------------------------------------------------------------------
# Usage:
# ./sed_sentence_chunker.sh <input_file> <output_file>
# bash sed_sentence_chunker.sh <input_file> <output_file>
# Example:
# ./sed_sentence_chunker.sh chunk_test_input.txt chunk_test_output.txt
# 1. Add these at/near top of script (note: cannot have spaces around " = " sign):
# input=$1
# output=$2
# 2. Comment out or delete this code section (after the Technical Notes, below):
# FILES=$(find input -type f -iname "*")
#
# for f in $FILES
# do
# sed -i -e 's/ffi/ffi/g
# s/fi/fi/g
# ... snip ...
# s/x/x/g' $f
# 3. Change "$f" in this line to "$input":
# sed 's/pp\.\s/Cho4Ph/g' $f > tmp_file
# 4. Near the bottom of the script, add these,
# sed 's/Dr,/Dr./g' tmp_file > $output
# rm tmp_file
# and delete these:
# sed -i 's/Dr,/Dr./g' tmp_file
# mv tmp_file output/$outname
# ----------------------------------------------------------------------------
# SCRIPT VARIANT 2: directly pass input text on the command line.
# ------------------------------------------------------------------
# Usage:
# . sed_sentence_chunker.sh <<< "quoted input text / sentences" ## << note: dot space command
# source sed_sentence_chunker.sh <<< "quoted input text / sentences" ## alternative (script sourcing)
# ----------------------------------------
# Examples:
# . sed_sentence_chunker.sh <<< "This is sentence 1. This is sentence 1."
# or:
# S="This is sentence 3. This is sentence 4."
# . sed_sentence_chunker.sh <<< $S
# 1. Add these at/near top of script (note: cannot have spaces around " = " sign):
# input=$1
# outfile="" ## output file
# OUTPUT="" ## output variable
# 2. Comment out or delete this code section (after the Technical Notes, below):
# FILES=$(find input -type f -iname "*")
#
# for f in $FILES
# do
# sed -i -e 's/ffi/ffi/g
# s/fi/fi/g
# ... snip ...
# s/x/x/g' $f
# 3. Change "$f" in this line to "$input":
# sed 's/pp\.\s/Cho4Ph/g' $f > tmp_file
# 4. Near the bottom of the script, add these,
# sed 's/Dr,/Dr./g' tmp_file > out_file
# OUTPUT=$(printf out_file)
# export $OUTPUT
# rm -f tmp*
# and delete these:
# sed -i 's/Dr,/Dr./g' tmp_file
# mv tmp_file output/$outname
# ============================================================================
# TECHNICAL NOTES:
# ================
# ----------------------------------------------------------------------------
# SCRIPT NAME ...:
# ----------------
# If the script name is too long for convenient use, just rename it; e.g.: ssc
# Run this script on my "chunk_test_input.txt" file to get an idea of it's
# capability (or to run your own unit tests).
# If needed you can use the Linux "pwgen" command to generate alphanumeric
# UID: "pwgen 8 2" will generate two (unique) 8-character alphanumeric strings.
# Example: $ pwgen 8 2 >> eej8Ae2p | air4Coo2
# ----------------------------------------------------------------------------
# FIRST SED COMMAND IN THIS SCRIPT:
# ---------------------------------
# After much (!) experimentation, it appears that the first sed command (below),
# outputting to the "tmp_file", MUST involve an "-r" argument (that in turn
# expects a regex expression). To achieve this, it is best to use the first
# command, as shown below. [Otherwise, you end up with blank output.]
# ----------------------------------------------------------------------------
# [a-zA-Z] vs. [A-Za-z] :
# -----------------------
# [a-zA-Z] **also** matches the ASCII characters between z and A: [ \ ] ^ _ `
# [A-Za-z] will only match the alphabet
# https://stackoverflow.com/questions/4923380/difference-between-regex-a-z-and-a-za-z
# http://www.asciitable.com/
# https://en.wikipedia.org/wiki/ASCII#/media/File:USASCII_code_chart.png
# ----------------------------------------------------------------------------
# REGEX EXPRESSIONS:
# ------------------
# I predominantly use two sed expressions -- the second, here, involving regex:
# sed -i 's/foo/bar/g'
# sed -i -r 's/foo\s?/bar/g' ## \s? : 0 or 1 (?) spaces (\s)
# . : any char, including newline (\n)_
# \. : period (literal period)
# -i : --in-place
# Regex "special" characters,
# [\^$.|?*+()
# have special meaning / function, and will thus need to be \-escaped.
# { and } are literal characters, unless they're part of a valid regular
# expression token such as a quantifier, e.g.: {3}.
# https://www.regular-expressions.info/refcharacters.html
# ----------------------------------------
# HERE IS MY (WORKING) EXPERIENCE RE: SED AND REGEX:
# In non-regex sed expressions, those special characters will need to be \-escaped
# to indicate that they are a regex special (not a literal) character.
# in regex (-r) sed expressions, they will be recognized as regex special
# characters, and will not have to be \-escaped.
# Exception: as noted, [ is a special character in regex -- denoting (e.g.) the
# start of a character class / set (https://www.regular-expressions.info/charclass.html).
# HOWEVER, unlike ?{}*^$ etc., in non-regex sed expressions, we need to escape
# it, \[ if we want to match a literal "[" in our expressions. [That applies,
# also, to regex (-r) sed expressions!].
# To match the start (^) or end ($) of a line. don't ever \-escape the ^ or $.
# To match the end of a line (EOL) ending (e.g.) with: ... the end.
#
# sed 's/the end\.$/the end.\n\Period./g' ## \. : literal period; $ EOL
# sed -r 's/the end\.$/the end.\n\Period./g' ## \. : literal period; $ EOL
# sed 's/the end.$/the end.\n\Period./g' ## . : any single characer; $ EOL
# sed -r 's/the end.$/the end.\n\Period./g' ## . : any single characer; $ EOL
# To match a literal $, anywhere in a line / sentence, \-escape the $ ( \$ ):
#
# sed 's/\$/\n/g'
# sed -r 's/\$/\n/g'
# Likewise (viz-a-viz: ^ $), there is no need to ever escape * if you intend it
# to match 0 or more of the preceding expression:
#
# sed 's/foo\s*bar/Foo.\n(Bar!)/g' ## matches 0 or more spaces between foo and bar
# sed -r 's/foo\s*bar/Foo.\n(Bar!)/g' ## ditto
#
# sed -r 's/foo\*bar/Foo.\n(Bar!)/g' ## matches foo*bar
# sed 's/foo*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more o)
# sed -r 's/foo*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more o)
# sed 's/foob*ar/Foo.\n(Bar!)/g' ## matches foobar (0 or more b)
# sed -r 's/foob*ar/Foo.\n(Bar!)/g' ## matches foobar (0 or more b)
# sed 's/fooz*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more z)
# sed -r 's/fooz*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more z)
#
# compare to:
#
# sed 's/foo?bar/Foo.\n(Bar!)/g' ## does NOT match foobar; MATCHES foo?bar (literal ?)
# sed 's/foo\?bar/Foo.\n(Bar!)/g' ## matches foobar (0 or 1 o); does not match foo?bar
# sed -r 's/foo?bar/Foo.\n(Bar!)/g' ## matches foobar (0 or 1 o); does not match foo?bar
# ----------------------------------------
# MORE EXAMPLES:
# model: sed 's/foo/bar/g'
# sed 's/foo\s\?bar/Foo.\nBar!/g'
# sed -r 's/foo\s?bar/Foo.\nBar!/g'
#
## 0 or 1 (?) spaces (\s)
## matches: foobar | foo bar
## does not match: foo bar | foo bar | ...
# sed 's/foo\s\{0,3\}bar/Foo.\nBar!/g'
# sed -r 's/foo\s{0,3}bar/Foo.\nBar!/g'
#
## {0,3} : 0, 1, 2 or 3 of preceding sequence (here: space, \s)
## matches: foobar | foo bar | foo bar | foo bar
## does not match: foo bar | foo bar | ...
# Regarding [ :
# sed 's/foo\s\?\[bar]/Foo.\n(Bar!)/g'
# sed -r 's/foo\s?\[bar]/Foo.\n(Bar!)/g'
#
## \[: match literal [
## matches: foo[bar] | foo [bar]
## does not match: foo [bar] | foo [bar] | foo [bar] | foo [bar] | ...
## does not match: foobar | foo bar | foo bar | ...
# sed 's/foo\s\?[bar]/Foo.\n(Bar!)/g'
# sed -r 's/foo\s?[bar]/Foo.\n(Bar!)/g'
#
## matches: foobar | foo bar
## replacing foo with Foo. and [bar] with (Bar!)ar
## (with a line break, \n, between them)!
## does not match: foo[bar] | foo [bar | ...
#
## Here, even in a non-regex sed expression, [bar] is being processed as a
## character class (like [A-Za-z0-9]), and so will match the b in foobar, but
## not the b in foo[bar]. To match the literal [ in that non-regex sed expression,
## \-escape the [, \[ , as shown further above / here:
# sed 's/foo\s\{0,3\}\[bar]/Foo.\n(Bar!)/g'
# sed -r 's/foo\s{0,3}\[bar]/Foo.\n(Bar!)/g'
#
## matches: foo[bar] | foo [bar] | foo [bar] | foo [bar]
## does not match: foo [bar] | foo [bar] | ...
## does not match: foobar | foo bar | foo bar | ...
# ----------------------------------------
# sed -r 's/\.([A-Z])\.$/.\1Shah7a/g'
#
## \. : literal period; ([A-Z]) : ASCII capitals in character class ();
## $ : end of line, non-escaped; . : period (do not need to escape in
## replace portion of the sed expression; \1 : replace with captured
## characters (class); Shah7a : an alphanumeric "tag" / substitution /
## UID (that I will replace later with the text it represents: .)
# sed -r 's/([[({\s])pp\.\s?([ivx0-9])/\1Cho4Ph\2/g' $f > tmp_file
#
## NOTE: that "[" MUST appear FIRST in the "[...]" character expression);
## i.e., [[...]. Also, if used, escape ] (i.e., \]). Lastly, as this is
## a -r regex expression, the ? is not \?-escaped; ...
# ----------------------------------------
# SED REGEX SUMMARY:
# ==================
# 1. No need to \-escape: ^ (start of line)
# $ (EOL)
# [] (character class / set) ## sed 's/foo[b]ar/foo\nbar/g'
# * (0 or more instances of matches for preceding expression)
#
# in: sed 's///g'
# or: sed -r 's///g'
# 2. \-escape: ? (0 or 1 of preceding expression) ## \?
# * (0 or more of preceding expression) ## \*
# { and } in {i,j} expressions ## \{0,3\}
#
# in: sed 's///g'
# not in: sed -r 's///g'
# ----------------------------------------
# In the script below, I tried to minimize the use of "lookaheads" () in
# my sed ( -r ) expressions, as I found these to increase the runtime.
# That is, where possible / practical, I tended to prefer the simpler
# sed -i 's///g' expressions.
# Expressions of the sort .{1,15}\.s\s* look complicated, but they are pretty
# simple! Basically it says: match any character ( . ), appearing 1-15
# times ( {1,15}, that is followed by a period ( \.) and any space ( \s\s* ) ...
# Likewise: ^[A-Z].{1,5}\. says match any 1..5 preceding characters that are
# not capitals, followed by a period ...
#
# sed -i -r "s/[.](.[^0-9]{1,15})[.]/Shah7a\1./g" tmp_file
#
# likewise translates to: match, in place, a period [.] that is followed by
# any span of 1-15 characters {1,15}, that are not 0 through 9 [^0-9],
# followed by another period [.]. All of that is this bit: .[^0-9]{1,15})[.]
#
# The second ("replace")_half of that regex expression states: replace replace
# THOSE periods (matched as described) with the unique alphanumeric string,
# Shah7a, followed by a period.
#
# sed -i -r "s/.[^.]\{1,15\}.\s\s*/\n\n/g" tmp_file
#
# Match any character ( . ), appearing 1-15 times ( {1,15} that is NOT a
# period ( !. ), but is followed by a period ( \.) and any space ( \s\s* ),
# and split ( \n\n ) at that position.
# https://www.gnu.org/software/sed/manual/html_node/Regular-Expressions.html
# http://www.rexegg.com/regex-quickstart.html
# ----------------------------------------------------------------------------
# ABBREVIATIONS -- JOURNAL TITLES; AUTHORS ...
# --------------------------------------------
# Journal author name initials and journal title abbreviations are a huge
# programmatic, i.e. technical difficulty. While my approach, below, minimizes
# the disruptions of those viz-a-viz bone fide sentence chunking, some issues
# will inevitably remain. E.g., some very short sentences may not get split
# from the others. C'est la vie!
# ----------------------------------------------------------------------------
# THESE COMMENTS:
# ---------------
# I deleted all of these comments from this script, leaving only the commands.
# The runtimes (time ./sed_sentence_chunker.sh) were essentially identical.
# ----------------------------------------------------------------------------
# OLDER NOTES / REFERENCE ...
# ---------------------------
# These notes are no longer relevant viz-a-viz this script, but are useful
# re: my earlier versions -- and general knowledge (preserved here!).
# ----------------------------------------
# ESCAPING SINGLE QUOTES WITHIN SINGLE-QUOTED EXPRESSIONS:
# --------------------------------------------------------
# To escape a single quote within a single-quoted sed expression, you need to
# terminate / chain the single quotes. E.g., to escape an internal ', terminate
# the sed single-quoted expression with another (internal) ', then escape the
# internal single quote inside the sed expression: "'", then add back (chain)
# another single quote ' to "continue / chain" the sed expression. Similarly,
# to escape (e.g.) a bracket [ ] inside the optional match [] pattern within a
# sed expression, chain the sed command, quoting the bracket term: ['"]"'] ...
# https://stackoverflow.com/questions/18370536/sed-or-operator-in-set-of-regex
# https://stackoverflow.com/questions/14813145/boolean-or-in-sed-regex
# https://serverfault.com/questions/466118/using-sed-to-remove-both-an-opening-and-closing-square-bracket-around-a-string
# ... all members of a character class lose special meaning (with a few
# exceptions). And ] loses its meaning if it is placed first.
# That observation is important re: the "([])}])" pattern below (that searches
# for characters ")", "}" and ")"). You MUST list the "]" closing bracket
# (within the "([ ])" character class), with the "]" square bracket listed FIRST:
# "([])}])".
# The following should capture all permutations of two contiguous sentences,
# where the inter-sentence boundary may contain any permutation of terminal
# punctuation (".", "!", "?"), parentheses and brackets ("(", "{, "[", ")", "}",
# "]", and any combination of quotation marks -- and split those sentences!
# sed -i -r 's/([A-Z]\.)\s\s*([A-Z])/\1\n\n\2/g' tmp_file
# To "follow" these, focus on the second part (after the \n break):
# '"'"' = escaped single quotation, used internally in single-quoted sed expression
# Since multiple spaces were converted (above) to single spaces, sentences will
# be separated by 0 or 1 spaces. Hence, the ".?" expression, below, will match
# 0 or 1 characters, between the two parts of these sed regex expressions
# [sentences will be split (\n) at those places].
# Replace -- again -- multiple spaces with single space:
# sed -i 's/ */ /g' tmp_file
# ----------------------------------------
# MORE REGEX EXAMPLES -- QUOTATION MARKS AND BRACKETS:
# ----------------------------------------------------
# bn="ant bat, cat; dog; (eel), [fish]: 'horse - jackal \"kangaroo\" {lemur} / moose | possum \ quail"
# echo $bn
# ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail
# echo $bn; echo $bn | sed 's/[][(){} -,;:\x27"\|/]/./g' ## \x27 : single quote
# ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail
# ant.bat..cat..dog...eel....fish....horse...jackal..kangaroo...lemur....moose...possum...quail
# echo $bn; echo $bn | sed 's/[][(){} -,;:\x27"\|/]/./g ; s/\.\{1,\}/./g' ## \x27 : single quote
# ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail
# ant.bat.cat.dog.eel.fish.horse.jackal.kangaroo.lemur.moose.possum.quail
# NOTES:
# * not a regex (-r) sed expression, so need to escape the {} in {1,} --> \{1,\}
# * to easily escape a single quote ' in a 'single-quoted string', substitute it with: \x27
# * to include literal [] brackets inside a [] character class, they must appear in this order immediately after the leading (character class) [:
# [][...]
# echo 'donkey [horse]' | sed 's/[[]//g'
# donkey horse]
# echo 'donkey [horse]' | sed 's/[]]//g'
# donkey [horse
# echo 'donkey [horse]' | sed 's/[[]]//g' ## << does not work! [[]] ...
# donkey [horse]
# echo 'donkey [horse]' | sed 's/[][]//g' ## << ... use THIS! [][]
# donkey horse
# ----------------------------------------
# UPDATED [2017-11-24]:
# ---------------------
# With my substitution of ' " ( ) [ ] { } I no longer have to worry about
# those when splitting sentences -- this HUGELY simplifies things!! :-D
# [E.g., look at the "main processing loops" in my older
# "sed_sentence_chunker{1|2|3}.sh" scripts!]
# As well, I took the approach that since they will not be especially relevant
# for my BioNLP work, tokenized sentences, etc. of deleting all double quotation
# marks: ". As well, I delete all single quotes around sentences (keeping
# internal single quotes / apostrophes, with the exception that I expand most
# common contractions; e.g. it's --> it is ...). This (also) greatly simplifies
# the processing, i.e. sentence chunking / splitting! :-D
# ============================================================================
# ============================================================================
# PRELIMINARIES:
# ==============
# https://stackoverflow.com/questions/4638874/how-to-loop-through-a-directory-recursively-to-delete-files-with-certain-extensi
# FILES=$(find ./input-z -type f -iname "*")
# ... As a number of people have commented, this will fail if there are spaces in filenames.
# You can work around this by temporarily setting the IFS (internal field separator) to the newline character. ...
IFS=$'\n'; set -f
# be sure to include "unset IFS; set +f" when done, near the bottom of the script
FILES=$(find ./input -type f -iname "*") ## ALL files, recursively
# can also use this, in for loop a few lines below:
# for f in $(find ./input-z -type f -iname "*")
# echo '------------------------------------------------------------------------------'
# echo '$FILES:' ## single-quoted, prints: $FILES:
# echo "$FILES" ## double-quoted, prints path/, filename (one per line)
# echo '------------------------------------------------------------------------------'
for f in $FILES
do
cp "$f" "tmp_file" ## work on a copy so that input file $f is not modified
# ----------------------------------------------------------------------
# Preprocessing step -- replace various annoyances (different types of quotation marks; ligatures; ...):
# https://stackoverflow.com/questions/26568952/how-to-replace-multiple-patterns-at-once-with-sed
# https://stackoverflow.com/questions/24509214/how-to-escape-single-quote-in-sed
# Escape ' within single-quoted sed '...' expressions by substituting those ' with \x27; e.g.:
# s/'/'/g --> s/'/\x27/g
sed -i -e 's/ffi/ffi/g
s/fi/fi/g
s/ff/ff/g
s/fl/fl/g
s/ffl/ffl/g
s/…/.../g
s/�/μ/g
s/␮/μ/g
s/௡/®/g
s/␣/α/g
s/␤/β/g
s/␦/δ/g
s/5Ј-/5\x27-/g
s/-3Ј/-3\x27/g
s/þ/+/g
s/¼/=/g
s/ϭ/=/g
s/Ɛ/=/g
s/Ͻ/</g
s/Ͼ/>/g
s/␥/γ/g
s/␧/ε/g
s/␨/ζ/g
s/Ϫ/-/g
s/À/-/g
s/# OLD:/=/g
s/ ‫؍‬ ./=/g
s/␹/X/g
s/Ն/≥/g
s/Ն/≤/g
s/Յ/+/g
s/Ã/*/g
s/Â/x/g
s/¥/x/g
s///g
s/™//g
s/®//g
s/→/>/g
s/–/-/g
s/Ϯ/±/g
s/؉/+/g
s/ϫ/x/g
s/ϳ/~/g
s/ʽ/\x27/g
s/ʻ/\x27/g
s/“/"/g
s/ˮ/"/g
s/”/"/g
s/״/"/g
s/ʺ/"/g
s/′′/"/g
s/〃/"/g
s/’/\x27/g
s/ʼ/\x27/g
s/‘/\x27/g
s/′/\x27/g
s/`/\x27/g
s/׳/\x27/g
s/ʹ/\x27/g
s/ꞌ/\x27/g
s/ˊ/\x27/g
s/ˋ/\x27/g
s/ˌ/\x27/g
s/—/-/g
s/؊/-/g
s/ϩ/+/g
s/ϫ/x/g' tmp_file
# ============================================================================
# SPECIAL CASES -- COMMON ABBREVIATIONS:
# --------------------------------------
# ----------------------------------------
# PAGE NUMBER ABBREVIATIONS:
# Approach: substitute a unique alphanumeric string for "pp." (we will restore
# it later). Generated via the Linux command: pwgen 6 1
# Page number abbreviation "pp.", followed by a space; unlikely to appear'
# at EOL, so we can do a simple substitution:
sed -i 's/pp\.\s/Cho4Ph/g' tmp_file
# [ in character expression [] must appear first: [[]; -r regex, therefore
# [I will process the "p." abbreviation after I strip the document of
# extraneous whitespace.]
# ============================================================================
# REMOVE URLs
# Here is the approach that I used to remove URLs, etc. from my files
# not sed -r .... therefore \-escape the ? :
# sed -i -e 's/http[s]\?:\/\/\S*//g ; s/www\.\S*//g ; s/ftp:\S*//g ; s/doi:\S*//g' tmp_file
sed -i -e 's/http[s]\?:\/\/\S*//g ; s/www\.\S*//g ; s/ftp:\S*//g ; s/[dD][oO][iI]:\s\?\S*//g' tmp_file
# However, that expression leaves "blank" lines, that this perl expression removes:
perl -i -pe 's/^'`echo "\012"`'${2,}//g' tmp_file ## 012 is the octal form of \n
# Posted to / explained at:
# https://stackoverflow.com/questions/4283344/sed-to-remove-urls-from-a-file/47821796#47821796
# ... includes an alternative to using "branch labels" to deal with newlines, \n, with sed ...
# ============================================================================
# REMOVE (SOME) REFERENCES:
perl -i -pe 's/^Reference:.*$//g;s/^Ref:.*$//g;s/^Citation:.*$//g; s/^'`echo "\012"`'${2,}//g' tmp_file
# The last bit removes the non-printing newlines (\n) that are left behind.
# Test:
#
# Ongoing work in the Black lab seeks to uncover biomarkers of response and toxicity to new immunotherapeutic agents used in the fight against lung cancer.
# Reference: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC.
# Ref: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC.
# Citation: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC.
# Our previous work identified a 13-gene miRNA signature predictive of response to the epidermal growth factor receptor (EGFR) inhibitor, erlotinib, in Non-Small Cell Lung Cancer cell lines.
#
# perl -pe 's/^Reference:.*$//g;s/^Ref:.*$//g;s/^Citation:.*$//g; s/^'`echo "\012"`'${2,}//g' <that text>
# ============================================================================
# WHITESPACE, TABS:
# Remove leading, trailing whitespace and multiple spaces from sentences:
# https://www.cyberciti.biz/tips/delete-leading-spaces-from-front-of-each-word.html
sed -i 's/^[ \t]*//; s/[ \t]*$//' tmp_file ## two (chained) sed expressions
# Replace multiple spaces with single space:
sed -i 's/ */ /g' tmp_file
# ============================================================================
# REMAINING PAGE NUMBER ABBREVIATIONS:
# The page number abbreviation "p." is more complicated than "pp.". We
# needed to process "pp." (above) BEFORE "p.", otherwise substitution
# of the "p." in "pp." will incorrectly get substituted with "Cho4Ph".
sed -i -r 's/([[({\s])p\.\s?([ivx0-9])/\1Eiph2T\2/g' tmp_file
# [ in character class [] must appear first: [[...]
# ============================================================================
# BIOCHEMICAL TEXT -- AMINO ACIDS:
# Need to do these before processing periods, as (e.g.) the p. ("protein")
# in p.Arg62His (an amino acid substitution / variant) will be processed
# as an abbreviation, and/or split into a sentence at that period ...
sed -i 's/p.Ala/HieN7uuP/g' tmp_file ## Ala Alanine (A)
sed -i 's/p.Arg/Nae0RaeZ/g' tmp_file ## Arg Arginine (R)
sed -i 's/p.Asn/see7AuK6/g' tmp_file ## Asn Asparagine (N)
sed -i 's/p.Asp/chaeJeu1/g' tmp_file ## Asp Aspartic Acid (D)
sed -i 's/p.Cys/EiV6Gaix/g' tmp_file ## Cys Cysteine (C)
sed -i 's/p.Gln/Ufaiph2b/g' tmp_file ## Gln Glutamine (Q)
sed -i 's/p.Glu/Goh8eish/g' tmp_file ## Glu Glutamic Acid (E)
sed -i 's/p.Gly/xei1Phei/g' tmp_file ## Gly Glycine (G)
sed -i 's/p.His/aak0eVei/g' tmp_file ## His Histidine (H)
sed -i 's/p.Ile/vai9aeS3/g' tmp_file ## Ile Isoleucine (I)
sed -i 's/p.Leu/ohzah5Ei/g' tmp_file ## Leu Leucine (L)
sed -i 's/p.Lys/Oa4Aequo/g' tmp_file ## Lys Lysine (K)
sed -i 's/p.Met/TheeWie7/g' tmp_file ## Met Methionine (M)
sed -i 's/p.Phe/ohNa9pe0/g' tmp_file ## Phe Phenylalanine (F)
sed -i 's/p.Pro/Eetaib7k/g' tmp_file ## Pro Proline (P)
sed -i 's/p.Trp/ga3yeeGh/g' tmp_file ## Trp Tryptophan (W)
sed -i 's/p.Tyr/DuY2Gub7/g' tmp_file ## Tyr Tyrosine (Y)
sed -i 's/p.Ser/oezoo9Ca/g' tmp_file ## Ser Serine (S)
sed -i 's/p.Thr/wahRoo7E/g' tmp_file ## Thr Threonine (T)
sed -i 's/p.Val/ieKai4oo/g' tmp_file ## Val Valine (V)
# ----------------------------------------------------------------------------
# GENOMIC VARIANTS:
# ... a letter prefix should be used to indicate the type of reference sequence used.
# Accepted prefixes are;
# "g." for a genomic reference sequence
# "c." for a coding DNA reference sequence
# "n." for a non-coding DNA reference sequence
# "r." for an RNA reference sequence (transcript)
# "p." for a protein reference sequence
# ============================================================================
# PERIODS:
# To better deal with the many complications associated with periods,
# first delete all spaces preceding and proceeding periods. This will
# take care of, e.g.: U. S. A. | The end . | V. A. Stuart |
# J. Am. Soc. Chem. ...
sed -i 's/\s*\././g' tmp_file
sed -i 's/\.\s*/./g' tmp_file
# ----------------------------------------
# Ellipses (ellipsis: ...) -- convert 3 or more periods (.) to an ellipsis:
sed -i 's/\.\{3,\}/.../g' tmp_file
# .. then store those ellipses as a UID:
sed -i 's/\.\.\./Iet1auki/g' tmp_file
# ... and finally convert remaining tandem periods (..) to a single period:
sed -i 's/\.\././g' tmp_file
# ----------------------------------------
# version (v.) abbreviation (v. + 0 or 1 character + any number):
sed -i -r 's/v\.\s?([0-9])/Eegh5eel\1/g' tmp_file
# ----------------------------------------
# versus (vs.) abbreviation:
sed -i 's/vs\./Air5ah/g' tmp_file
# ----------------------------------------
# "E.g.", "e.g.", "I.e." or "i.e.":
sed -i 's/[eE]\.g\./Va1Eed/g' tmp_file
sed -i 's/[iI]\.e\./Uchee4/g' tmp_file
# ----------------------------------------
# "cc.", "CC." or "cf.":
# This also captures "Hcc" (hepatocellular carcinoma) at the end of a sentence: Hcc.
# sed -i 's/[cC]\.\?[cC]\./Ri9Ohk/g' tmp_file
# Here is a workaround:
sed -i 's/[^Hh][cC]\.\?[cC]\./Ri9Ohk/g' tmp_file
# " cc " or " CC ":
sed -i 's/\s[cC][cC]\s/ Ri9Ohk /g' tmp_file
sed -i 's/c\.\?f\./Tig8shei/g' tmp_file
sed -i 's/\scf\s/ Tig8shei /g' tmp_file
# ----------------------------------------
# "et al." abbreviation (will restore, with period, later):
sed -i 's/et al\./et al/g' tmp_file
# ----------------------------------------
# "Fig.", "fig.", "Figs.", "figs.":
# As I don't otherwise process commas, I can simply use them as a facile
# substitution for periods (later swapping , for . in post-processing):
sed -i -r 's/([fF]ig[s])\./\1,/g' tmp_file
# ----------------------------------------
# Personal titles (again, temporarily replace '.' with ','):
sed -i 's/Dr\./Dr,/g' tmp_file
sed -i 's/Drs\./Drs,/g' tmp_file
sed -i 's/Mr\./Mr,/g' tmp_file
sed -i 's/Mrs\./Mrs,/g' tmp_file
sed -i 's/Ms\./Ms,/g' tmp_file
sed -i 's/St\./St,/g' tmp_file
# ============================================================================
# OTHER BIOCHEMICAL TEXT:
# ----------------------------------------
# SINGLE QUOTATIONS:
# Note that some single quotes (i.e. apostrophes), e.g., 5'-, 3'-, ...
# are important in biochemistry / chemistry. To be safe, we'll proactively
# capture / protect these:
sed -i "s/3'/tho6Si2o/g" tmp_file ## e.g.: 3'-end
sed -i "s/5'/oochie8P/g" tmp_file ## e.g.: 5'-ATGGCTCGATCTTA...
sed -i "s/A's/ohph5AN6/g" tmp_file ## e.g.: (multiple adenines) multiple A's precede
sed -i "s/C's/Ji4oopow/g" tmp_file ## e.g.: (multiple adenines) multiple C's precede
sed -i "s/G's/Aeyahk4A/g" tmp_file ## e.g.: (multiple adenines) multiple G's precede
sed -i "s/T's/oogeel3W/g" tmp_file ## e.g.: (multiple adenines) multiple T's precede
# ----------------------------------------
# BIOCHEMICAL, CHEMICAL PRIMES:
sed -i "s/1'/hooPhil4/g" tmp_file
sed -i "s/2'/He5EiS1Z/g" tmp_file
sed -i "s/3'/IeghuP3V/g" tmp_file
sed -i "s/4'/Loh4aeri/g" tmp_file
sed -i "s/5'/Aht9Vohs/g" tmp_file
sed -i "s/6'/ReiR5zee/g" tmp_file
sed -i "s/7'/eiTei4ri/g" tmp_file
sed -i "s/8'/ay0ePicu/g" tmp_file
sed -i "s/9'/seeHush2/g" tmp_file
# ============================================================================
# REMAINING SINGLE, DOUBLE QUOTATIONS:
# Delete all double quotations: not particularly needed in NLP, e.g. tokenized text:
sed -i 's/"//g' tmp_file
# ----------------------------------------------------------------------------
# CONTRACTIONS:
# Deal with common contractions, before dealing with single quotes / apostrophes.
# ----------------------------------------
# First, expand common contractions:
sed -i -r "s/([a-z])'d/\1 did/g" tmp_file ## otherwise, 'd* becomes did*
sed -i -r "s/([a-z])'m/\1 am/g" tmp_file ## otherwise, 'm* becomes am*; e.g. to 'mess' with >> to amess' with
sed -i "s/won't/will not/g" tmp_file ## do this rule before the following rule
sed -i "s/n't/ not/g" tmp_file ## isn't | shouldn't | wouldn't | wouldn't | ...
sed -i "s/'ll/ will/g" tmp_file
sed -i "s/'re/ are/g" tmp_file
sed -i "s/'ve/ have/g" tmp_file
sed -i "s/here's/here is/g" tmp_file ## here's | Here's | there's | There's | where's | Where's ...
sed -i "s/I'd/I would/g" tmp_file
sed -i "s/It's/It is/g" tmp_file
sed -i "s/\sit's/ it is/g" tmp_file
sed -i "s/That's/That is/g" tmp_file
sed -i "s/that's/that is/g" tmp_file
sed -i "s/What's/What is/g" tmp_file
sed -i "s/\swhat's/ what is/g" tmp_file
# ----------------------------------------
# Next, substitute remaining contractions with UID (restore in post-processing):
sed -i -r "s/([a-zI])'d/\1chaSaib7/g" tmp_file ## e.g.: I'd | how'd | who'd | why'd | ...
sed -i -r "s/([a-zI])'ll/\1UivahJ5e/g" tmp_file ## e.g.: I'll
sed -i -r "s/([a-zI])'m/\1chahei1O/g" tmp_file ## e.g.: I'm
sed -i -r "s/([a-z])'t/\1Zeep7Auy/g" tmp_file
sed -i -r "s/([a-z])'nt/\1Zeep7Auy/g" tmp_file ## e.g.: is'nt [grammatical (spelling) error]
sed -i -r "s/([a-z])'re/\1Phoh5eil/g" tmp_file ## e.g.: you're | We're responsible ...
# ------------------
sed -i "s/'six/eKu6eech/g" tmp_file ## e.g.: escape 'six
sed -i "s/'seven/pahl8Avu/g" tmp_file ## e.g.: escape 'seven
sed -i -r "s/([a-z])'s/\1zaoGii5p/g" tmp_file ## e.g.: there's | various possessives: Victoria's | women's | ...
# ------------------
# UPDATE: the following expression left (when apostrophes restored) artefacts like this:
# 'mess'[orig text] >> [processing: this script] >> mess' [output]:
#
# sed -i -r "s/([a-z])'\s/\1ueKek3oh/g" tmp_file ## e.g.: plural noun possessives ending in "s": girls' dresses | Wilsons' house | ...
#
# It is not needed, with the inclusion of the "final" rule, below: sed -i "s/'//g" tmp_file
# ------------------
sed -i -r "s/([a-z])'t/\1iCuRahb6/g" tmp_file ## e.g.: isn't
sed -i -r "s/([a-zI])'ve/\1Roopes5f/g" tmp_file ## e.g.: I've' | (+)'ve
# less common / archaic:
sed -i "s/ma'am/Quei2Eex/g" tmp_file
sed -i "s/ne'er/IeDae7Lu/g" tmp_file ## e.g.: ne'er-do-well
sed -i -r "s/o'([a-z])/Xahc3Iel\1/g" tmp_file ## e.g.: o'clock
sed -i "s/'twas/uph4aida/g" tmp_file ## e.g. 'twas the night; escapes: 'two | 'twenty ...
# Finally, delete all remaining single quotations, apostrophes:
sed -i "s/'//g" tmp_file
# WITH THE EXPRESSION ABOVE, THIS SHOULD **NOT** BE NEEDED:
# Delete single quotations, apostrophes at end of words:
# sed -i "s/'\s/ /g" tmp_file ## e.g.: missed' that
# sed -i "s/'\././g" tmp_file ## e.g.: missed.' That
# sed -i "s/\.'/./g" tmp_file ## e.g.: missed'. That
# ============================================================================
# PREPROCESSING MISCELLANY:
# ----------------------------------------
# Delete tandem commas, semicolons:
sed -i 's/,,/,/g' tmp_file
sed -i 's/;;/;/g' tmp_file
# ----------------------------------------
# Clean up improperly-terminated sentences (e.g. ?!!?!?!??!):
# ------------------
# Tandem question, exclamation marks:
for i in {1..8}
do
sed -i 's/??/?/g' tmp_file ## not regex (-r), so those those are
sed -i 's/!!/!/g' tmp_file ## literal ? ! character substitutions
done
# ------------------
# Remaining [.!?] permutations:
sed -i 's/!?/?/g' tmp_file
sed -i 's/?!/?/g' tmp_file
sed -i 's/?\./?/g' tmp_file
sed -i 's/!\./!/g' tmp_file
sed -i 's/\.?/?/g' tmp_file
sed -i 's/\.!/!/g' tmp_file
# ============================================================================
# BRACKETS:
# These can be annoying, especially re: processing. They are important in
# chemistry / biochemistry, however (e.g. chemical / biochemical names), so
# for now just do the usual substitute / replace later approach.
# The order of these steps is important: do ( [ {, then ) ] } associated
# with periods (to split at those), then do left-over ( ) [ ] { }.
# ----------------------------------------
# Simplify [{ as ( ; simplify ]} as ) :
sed -i 's/\[/(/g' tmp_file ## \-escape the [ : \[
sed -i 's/]/)/g' tmp_file
sed -i 's/{/(/g' tmp_file
sed -i 's/}/)/g' tmp_file
# ----------------------------------------
# Angle brackets { < | > }:
# Deal with these first: (angle brackets used as mathematical inequalities);
# include "p" to capture (e.g.) "p < 0.001" or "p > 0.001 :
sed -i -r 's/([0-9p])\s?<\s?([0-9])/\1Woxoh4ph\2/g' tmp_file
sed -i -r 's/([0-9p])\s?>\s?([0-9])/\1aeja8ohM\2/g' tmp_file
# not "sed -r", therefore \-escape "?" (regex 0 or 1 modifier) -- \? :
sed -i 's/\s\?<\s\?=\s\?/aev3Shoo/g' tmp_file
sed -i 's/\s\?>\s\?=\s\?/iez7ieVi/g' tmp_file
# ... then remove all other angle brackets:
sed -i 's/</(/g' tmp_file
sed -i 's/>/)/g' tmp_file
# ----------------------------------------
# Delete spaces following leading parentheses; delete spaces preceding lagging parentheses:
sed -i 's/(\s\?/(/g' tmp_file
sed -i 's/\s\?)/)/g' tmp_file
# ----------------------------------------
# Delete empty and multiple parentheses:
sed -i 's/(\s\?)//g' tmp_file
sed -i 's/(\{2,\}/(/g' tmp_file
sed -i 's/)\{2,\}/)/g' tmp_file
# ----------------------------------------
# Split parentheses associated with punctuation (.?!) at the ends of sentences:
sed -i 's/\.)\s\?/.)\n/g' tmp_file
sed -i 's/\.\s\?(/.\n(/g' tmp_file
sed -i 's/?)\s\?/?)\n/g' tmp_file
sed -i 's/?\s\?(/?\n(/g' tmp_file
sed -i 's/!)\s\?/!)\n/g' tmp_file
sed -i 's/!\s\?(/!\n(/g' tmp_file
# ----------------------------------------
# Split lines on ") (", only if first parenthesized expression is at the end of a sentence:
sed -i 's/[.!?]\s\?)\s\?(/.)\n(/g' tmp_file
# ----------------------------------------
# Clean up: remove parentheses at start or end of lines:
# First, (again) remove all leading and trailing whitespace from sentences, as well as multiple spaces:
sed -i 's/^[ \t]*//; s/[ \t]*$//' tmp_file ## two (chained) sed expressions
sed -i 's/^(//g' tmp_file
sed -i 's/)$//g' tmp_file
# ============================================================================
# AUTHOR INITIALS; JOURNAL TITLE ABBREVIATIONS:
# =============================================
sed -i -r 's/(\.[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file
# (Proc.NatlShah7aAcad.SciShah7aUShah7aS.AShah7a104, 9346
sed -i -r 's/(Shah7a[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file
# (Proc.NatlShah7aAcadShah7aSciShah7aUShah7aSShah7aAShah7a104, 9346
# ----------------------------------------------------------------------------
# Match abbreviations at the start of a line.
sed -i -r 's/(^[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file
# ----------------------------------------------------------------------------
# Capture the first abbreviation inside a parenthesis ( ( ):
sed -i -r 's/(\([A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file ## \-escaped, literal ( inside () character substitution
# ----------------------------------------------------------------------------
# Authors' names -- additional processing:
# ------------------
# Match hyphenated names abbreviations (e.g. Chen A.-B. Jiang):
sed -i -r 's/\.-([A-Z])\./Shah7a-\1Shah7a/g' tmp_file
# ------------------
# Clean up { space Cap(range 1:4 Caps) dot Cap | space Cap dash Cap dot } patterns:
# William F.JShah7aMcLeod | A.BCD.Smith | ABCD.Smith | Chen A-B.JiangShah7a | ...
sed -i -r 's/\s([A-Z]{1,4})\.([A-Z])/ \1Shah7a\2/g' tmp_file
# Rule above prevents " HCC. More", etc., from being split (HCC: hepatocellular carcinoma).
# I deal with it via custom splits, in "post-processing."
sed -i -r 's/\s([A-Z]-[A-Z])\./ \1Shah7a/g' tmp_file
# ============================================================================
# SPLIT SENTENCES ONTO SEPARATE LINES:
# ------------------------------------
# Here we want to process the remaining periods to split sentences onto
# separate lines, with the caveats (i) that we do not want to split decimal
# numbers (3.1; ... i.e. [0-9].[0-9]), and (ii) we do not want to (as much
# as practically possible) split abbreviations (journal titles; authors; ...).
# sed -i -r 's/([\.?!])\s?([A-Z][A-Za-z0-9 ,-]{4,})/\1\n\2/g' tmp_file ## literal space, [ ] inside that [A-Za-z ,-] character class
# Expression above failed to split: .TGFβ -- corrected here:
sed -i -r 's/([\.?!])\s?([A-Z][A-Za-z0-9αβγδεζηθικλμνξοπρςστυφχψω ,-]{3,})/\1\n\2/g' tmp_file
# ============================================================================
# RESTORATIONS:
# =============
# ----------------------------------------
# (Re-)delete leading and trailing whitespace from sentences, as well as
# multiple spaces (if present / inadvertently reintroduced):
sed -i 's/^[ \t]*//;s/[ \t]*$//' tmp_file
# Replace multiple spaces with single space:
sed -i 's/ */ /g' tmp_file
# ----------------------------------------------------------------------------
# Restorations -- amino acids (e.g.: p.Arg in p.Arg62His):
sed -i 's/HieN7uuP/p.Ala/g' tmp_file
sed -i 's/Nae0RaeZ/p.Arg/g' tmp_file
sed -i 's/see7AuK6/p.Asn/g' tmp_file
sed -i 's/chaeJeu1/p.Asp/g' tmp_file
sed -i 's/EiV6Gaix/p.Cys/g' tmp_file
sed -i 's/Ufaiph2b/p.Gln/g' tmp_file
sed -i 's/Goh8eish/p.Glu/g' tmp_file
sed -i 's/xei1Phei/p.Gly/g' tmp_file
sed -i 's/aak0eVei/p.His/g' tmp_file
sed -i 's/vai9aeS3/p.Ile/g' tmp_file
sed -i 's/ohzah5Ei/p.Leu/g' tmp_file
sed -i 's/Oa4Aequo/p.Lys/g' tmp_file
sed -i 's/TheeWie7/p.Met/g' tmp_file
sed -i 's/ohNa9pe0/p.Phe/g' tmp_file
sed -i 's/Eetaib7k/p.Pro/g' tmp_file
sed -i 's/ga3yeeGh/p.Trp/g' tmp_file
sed -i 's/DuY2Gub7/p.Tyr/g' tmp_file
sed -i 's/oezoo9Ca/p.Ser/g' tmp_file
sed -i 's/wahRoo7E/p.Thr/g' tmp_file
sed -i 's/ieKai4oo/p.Val/g' tmp_file
# ----------------------------------------
# Restore single quotations:
sed -i "s/tho6Si2o/3'/" tmp_file
sed -i "s/oochie8P/5'/g" tmp_file
sed -i "s/ohph5AN6/A's/g" tmp_file
sed -i "s/Ji4oopow/C's/g" tmp_file
sed -i "s/Aeyahk4A/G's/g" tmp_file
sed -i "s/oogeel3W/T's/g" tmp_file
# Restore angle brackets used as mathematical inequalities:
sed -i 's/Woxoh4ph/ < /g' tmp_file
sed -i 's/aeja8ohM/ > /g' tmp_file
sed -i 's/aev3Shoo/ <= /g' tmp_file
sed -i 's/iez7ieVi/ >= /g' tmp_file
# ----------------------------------------
# Restore common contractions:
sed -i "s/chaSaib7/'d/g" tmp_file
sed -i "s/UivahJ5e/'ll/g" tmp_file
sed -i "s/chahei1O/'m/g" tmp_file
sed -i "s/Zeep7Auy/'t/g" tmp_file
sed -i "s/Zeep7Auy/'nt/g" tmp_file
sed -i "s/Phoh5eil/'re/g" tmp_file
# ------------------
sed -i "s/eKu6eech/six/g" tmp_file
sed -i "s/pahl8Avu/seven/g" tmp_file
sed -i "s/zaoGii5p/'s/g" tmp_file
sed -i "s/ueKek3oh/' /g" tmp_file
# ------------------
sed -i "s/iCuRahb6/'t/g" tmp_file
sed -i "s/Roopes5f/'ve/g" tmp_file
# less common / archaic:
sed -i "s/Quei2Eex/ma'am/g" tmp_file
sed -i "s/IeDae7Lu/ne'er/g" tmp_file
sed -i "s/Xahc3Iel/o'/g" tmp_file
sed -i "s/uph4aida/'twas/g" tmp_file
# ----------------------------------------
# Restore biochemical, chemical primes:
sed -i "s/hooPhil4/1'/g" tmp_file
sed -i "s/He5EiS1Z/2'/g" tmp_file
sed -i "s/IeghuP3V/3'/g" tmp_file
sed -i "s/Loh4aeri/4'/g" tmp_file
sed -i "s/Aht9Vohs/5'/g" tmp_file
sed -i "s/ReiR5zee/6'/g" tmp_file
sed -i "s/eiTei4ri/7'/g" tmp_file
sed -i "s/ay0ePicu/8'/g" tmp_file
sed -i "s/seeHush2/9'/g" tmp_file
# ----------------------------------------
# Restore version (v.):
sed -i -r 's/Eegh5eel/v./g' tmp_file
# ----------------------------------------
# Restore versus ("vs."):
sed -i 's/Air5ah/vs. /g' tmp_file
# ----------------------------------------
# Restore "e.g." and "i.e.":
sed -i 's/Va1Eed/e.g. /g' tmp_file
sed -i 's/Uchee4/i.e. /g' tmp_file
# Capitalize restored "e.g.", "i.e.", "c.f." present at the start of a sentence:
sed -i 's/^c\.f\./Cf. /g' tmp_file
sed -i 's/^e\.g\./E.g. /g' tmp_file
sed -i 's/^i\.e\./I.e. /g' tmp_file
# ----------------------------------------
# Restore "c.c." and "cf.":
sed -i 's/Ri9Ohk/ cc. /g' tmp_file
sed -i 's/Tig8shei/cf. /g' tmp_file
# ----------------------------------------
# Restore page number abbreviations {pp. | p.}:
sed -i 's/Cho4Ph/pp./g' tmp_file
sed -i 's/Eiph2T/ p./g' tmp_file
# ----------------------------------------
# Restore ellipses (...):
sed -i 's/Iet1auki/ ... /g' tmp_file ## add space before ...
# ... and split line if following character is a Capital letter:
sed -i -r 's/\.\.\.(\s?[A-Z])/...\n\1/g' tmp_file
# ... and delete line is it consists solely of an ellipsis ("...") [optionally with spaces]:
sed -i -r 's/^\s{0,}\.\.\.\s{0,}$//g' tmp_file
# ----------------------------------------
# Restore et al. :
sed -i 's/et al/et al. /g' tmp_file
# ----------------------------------------
# Restore "Fig.", "fig.", "Figs.", "figs.":
sed -i -r 's/([fF]ig[s]),/\1\. /g' tmp_file
# ----------------------------------------
# Restore personal titles (replace ',' with '.'):
sed -i 's/St,/St. /g' tmp_file
sed -i 's/Ms,/Ms. /g' tmp_file
sed -i 's/Mrs,/Mrs. /g' tmp_file
sed -i 's/Mr,/Mr. /g' tmp_file
sed -i 's/Drs,/Drs. /g' tmp_file
# ----------------------------------------------------------------------------
# Miscellany: split St. at end of sentence:
sed -i -r 's/\sSt.\s?([A-Z])/ St.\n\1/g' tmp_file
# ----------------------------------------------------------------------------
# Lastly , restore author initials, journal title abbreviations:
sed -i 's/Shah7a/./g' tmp_file
# ============================================================================
# POSTPROCESSING:
# ===============
# ----------------------------------------------------------------------------
# Delete { ---------- | ========== }-type lines:
# [I often use these to delimiter sections of text.]
sed -i '/^[-=]*$/d' tmp_file
# Deletes all of these:
# ---------------------
# =====================
# --------=====--------
# =====----------=====
# ----------------------------------------
# Remove unterminated lines (no terminal ".!?", often due to citations):
# sed -i -r 's/^.*[^.!?]$//g' tmp_file
# Ack! Expression above ** appears** to delete lines ending in ellipsis (...)
# (closer inspection: those ellipses were followed by spaces).
# Workaround -- first remove spaces at EOL:
sed -i 's/\s\s*$//g' tmp_file
# .. THEN remove unterminated lines (excluding also those ending in an ellipsis):
sed -i -r 's/^.*[^.{1,3}?!]$//g' tmp_file
# Converts:
#
# DIR: Cellular Signaling - TGFβ Signaling Pathway.
# SUBJ: Cell signaling interaction may prevent key step in lung cancer progression.
# Date: November 9, 2017
# Source: University of Kentucky
# Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer!
# Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer?
# A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC
# Madeline Krentz Gober, James P. Collard, Katherine Thompson & Esther P. Black
# Scientific Reports 7, Article number: 4202 (2017
# ABSTRACT
# Our previous work identified a miRNA signature in Non-Small Cell Lung Cancer cell lines.
#
# to
#
# DIR: Cellular Signaling - TGFβ Signaling Pathway.
# SUBJ: Cell signaling interaction may prevent key step in lung cancer progression.
# Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer.
# Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer!
# Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer?
# Our previous work identified a miRNA signature in Non-Small Cell Lung Cancer cell lines.
# ----------------------------------------
# "unsplit" sentences:
# As a consequence of processing abbreviations, some existing abbreviations
# get captured; e.g. [original text] "... in PD.Overall ...". That PD
# (Parkinson's Disease) abbreviation gets preprocessed by his script as ah
# abbreviation [PDShah7a], and so it is not present during the sentence
# splitting step; the period (hence unsplit sentence) is added when "Shah7a"
# is replaced with a period.
sed -i -r 's/\s([A-Z]{2,4})\.([A-Z])/ \1.\n\2/g' tmp_file
# Also, some citations at the ends of sentences do not get split:
sed -i -r 's/([ a-z])([0-9]{1,3})\.([A-Z])/\1\2.\n\3/g' tmp_file
# Matches a space or lowercase letter followed by 1-3 numbers followed by a
# period followed by a capital letter; e.g. "response17.TGFβ" or "response 17.TGFβ".
# ----------------------------------------
# "Run-on" names, e.g. "Esther P.Black" (will get tokenized as "p.black" ...:
# sed -i -r 's/([A-Z])\.([A-Z])/\1. \2/g' tmp_file
# Problem -- expression above also (e.g.) converts "Sci.U.S.A." to "Sci.U. S.A."
# Workaround:
sed -i -r 's/([a-z]\s?[A-Z])\.([A-Z])/\1. \2/g' tmp_file
# Authors initials with spaces (remove spaces) -- e.g. " A. B. Charles" >> " A.B.Charles" :
sed -i -r 's/\s([A-Z])\.\s([A-Z])\./ \1.\2./g' tmp_file
# ----------------------------------------
# Dot space:
# sed -i 's/\.\s\?/./g' tmp_file
# ----------------------------------------
# Miscellaneous unsplit:
# As mentioned above, some abbreviations (e.g. hepatocellular carcinoma: Hcc | Hcc)
# must be processed via "one-of" rules:
sed -i 's/\s\?[Hh][Cc][Cc]\./HCC.\n/g' tmp_file
# ------------------
sed -i -r 's/([a-z]{2,})\.\s?([0-9])([A-Z])/\1.\n\2\3/g' tmp_file ## e.g.: lines.2OH-BNPP1
# Oops: this splits "invasion. km23" but also splits "pp. iii"; "vs. that"; ...
# sed -i -r 's/([a-z]{2,})\.\s?([a-z]{2,})/\1.\n\2/g' tmp_file ## e.g.: invasion. km23-1
# Facile solution -- extend match length:
sed -i -r 's/([a-z]{4,})\.\s?([a-z]{2,})/\1.\n\2/g' tmp_file
sed -i -r 's/([A-Za-z0-9])\.\s?([αβγδεζηθικλμνξοπρςστυφχψω])/\1.\n\2/g' tmp_file ## e.g.: CBX7. β3 | manner. β3
sed -i -r 's/([a-z]{2,})\.\s?([0-9])([A-Z])/\1.\n\2\3/g' tmp_file ## e.g.: lines.2OH-BNPP1
# ----------------------------------------
# Delete any remaining empty / blank lines (if they exist):
sed -i '/^\s*$/d' tmp_file ## * : 0 or more instances (here, of spaces: \s)
perl -i -pe 's/^'`echo "\012"`'${2,}//g' tmp_file ## 012 is the octal form of \n
# ----------------------------------------
# Delete any remaining multiple spaces:
sed -i 's/\s\s\?/ /g' tmp_file
# ----------------------------------------
# Delete space comma:
sed -i 's/\s,/,/g' tmp_file
# ----------------------------------------
# Delete spaces at beginning of lines:
sed -i 's/^\s\s\?//g' tmp_file
# ============================================================================
# FINAL SED OPERATION:
# ====================
# ----------------------------------------------------------------------------
# Final sed operation; output to file:
sed -i 's/Dr,/Dr. /g' tmp_file
# ----------------------------------------------------------------------------
# Create output files, into PREEXISTING ./output directory:
# http://pubs.opengroup.org/onlinepubs/007908799/xcu/basename.html
# https://stackoverflow.com/questions/15803227/getting-permission-denied-on-dirname-and-basename
# https://stackoverflow.com/questions/7194192/basename-with-spaces-in-a-bash-script
outname=$(basename "$f")
mv "tmp_file" "output/$outname"
done
# https://stackoverflow.com/questions/4638874/how-to-loop-through-a-directory-recursively-to-delete-files-with-certain-extensi
# At top of script: IFS=$'\n'; set -f
# Unset here:
unset IFS; set +f
# ----------------------------------------------------------------------------
# SIGNAL END OF SCRIPT EXECUTION:
# for i in 1 2 3 4 5
for i in 1 2 3
do
{
#aplay alarm-frenzy.mp3 ## << aplay cannot play MP3 files; use WAV
#aplay beep.wav
#aplay ding.wav
aplay /mnt/Vancouver/Programming/scripts/PHASER.WAV
#aplay /mnt/Vancouver/Programming/scripts/KenbeepLoud.wav
sleep 0.25
#echo "Welcome $i times"
} &> /dev/null
## re: above - suppresses aplay echo in terminal, per:
## http://stackoverflow.com/questions/18062778/how-to-hide-command-output-in-bash
done
# ============================================================================
# Q.E.D.! :-D
# ============================================================================