# Feature Extraction :

## I - Using Weak Labels :

In [None]:
import re

def generate_weak_labels(text):
    """Split ``text`` on whitespace and give every token a heuristic label.

    Labels, checked in this order:

    * ``"B-NAME"`` / ``"I-NAME"`` -- two consecutive tokens that together look
      like ``Firstname Lastname`` (a capital followed by lowercase letters,
      twice). The name regex needs a space, so it is applied to the token
      joined with its successor; a single token can never contain one.
    * ``"B-VAR"`` -- the token starts with an identifier-like word.
    * ``"B-VAL"`` -- the token starts with an (optionally signed) integer or
      decimal number.
    * ``"O"`` -- anything else.

    Args:
        text: Raw text to label.

    Returns:
        A ``(tokens, labels)`` tuple of two lists with the same length.
    """
    variable_pattern = re.compile(r'\b[A-Za-z_][A-Za-z0-9_]*\b')
    name_pattern = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b')
    # Previously referenced but never defined, which raised NameError on the
    # first token that was neither a name nor identifier-like.
    value_pattern = re.compile(r'[-+]?\d+(?:\.\d+)?')

    tokens = text.split()
    labels = []
    index = 0
    while index < len(tokens):
        token = tokens[index]
        # Look at the token together with its successor so a two-word name
        # can actually be recognised.
        pair = " ".join(tokens[index:index + 2])
        if name_pattern.match(pair):
            labels.extend(("B-NAME", "I-NAME"))
            index += 2
            continue
        if variable_pattern.match(token):
            labels.append("B-VAR")
        elif value_pattern.match(token):
            labels.append("B-VAL")
        else:
            labels.append("O")
        index += 1
    return tokens, labels

# Weakly label the text of every extracted PDF and keep tokens and labels together.
annotated_data = []
for pdf in pdf_texts:
    tokens, labels = generate_weak_labels(pdf['text'])
    annotated_data += [{"tokens": tokens, "labels": labels}]

In [None]:
# Building blocks for sentences of the form
# "<variable> is/are ... called <definition>".

# variable match: a word starting with a Latin or Greek letter.
# This is a plain raw string, so the quantifier is written {1}; doubled
# braces are only an escape inside f-strings and would be literal here.
var_match = r"\b[A-Za-zα-ωΑ-Ω]{1}.*?\b"
# linking verbs
lin_ver = r"(is|are)\b"
# passive constructions (passive verbs)
pass_cons = r".*?(called|defined as|used for|termed)\s"
# definition verbs (no trailing empty alternative, which would match "")
def_ver = r"(represents|denotes|stands for|refers to|corresponds to)"
# start/end of phrase: everything up to the next newline, comma or period
eop = r"(.*?)(?=\n|,|\.)"


text2 = "The sample space S is an event, called the certain event, and the empty"
# Compose the pattern from the fragments above instead of re-typing them.
pattern = fr"{var_match}{lin_ver}{pass_cons}{eop}"
matches = re.findall(pattern, text2)
matches

[('is', 'called', 'the certain event')]

In [2]:
import re
# Define variables for each part of the regex pattern
math_opp = "ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅"
# A run of letters, digits or mathematical symbols (used for both the
# variable and the one-word definition)
variable = fr'([A-Za-zα-ωΑ-Ω0-9{math_opp}]+)'
verb = r'\s+(is|are)\s+'
action = r'(called|defined as|used for|termed)\s+'
# The definition must be followed by a comma, whitespace or a period
end = r'(?=,|\s|\.|\n)'

# Construct the regex pattern using an f-string
pattern = fr"{variable}{verb}{action}{variable}{end}"

# Sample text (defined here so the cell runs on a fresh kernel instead of
# relying on a `text` variable left over from another cell)
text = """
v is called velocity, which is a vector quantity.
s are termed Displacement, and it measures the change in position.
a is called Acceleration, and it measures the rate of change of velocity.
"""

# Find all matches; each match is (variable, verb, action, definition)
matches = re.findall(pattern, text)

# Print out the matches
for match in matches:
    print(f"Variable: {match[0]}, Definition: {match[3]}")

In [None]:
# Regex for a run of variable characters: Latin/Greek letters, digits and
# mathematical symbols. Stored as a raw string so the cell is valid Python.
symbol_run_pattern = r"([A-Za-zα-ωΑ-Ω0-9ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]+)"

In [None]:
# Regex for a single-character symbol standing alone after a word, e.g. the
# "x" in "position x contained". Stored as a raw string so the cell is valid
# Python.
isolated_symbol_pattern = r"[A-Za-z]+\s+([A-Za-zα-ωΑ-ΩℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1})\s+"

In [None]:
# variable-value, e.g. "x = 32 " or "x = 3.5 ".
# Raw string: in a normal string literal "\b" is a backspace character, not
# a regex word boundary.
variable_value_pattern = r"\b[A-Za-zα-ωΑ-ΩℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1,2}\s+=(\s+\d+\s+|\s+\d+\.\d+\s+)"
# a word followed by a one- or two-character token, e.g. "space S is"
short_token_pattern = r"\w+\s+(\w{1}|\w{1}(.))\s+"

In [None]:
# variables in the form of "The Wigner distribution W(x, k, t)"
#(?:[.,]\s*)?\b\w+\s+\w+\s+\w+\s+\w+\s*\(\s*\b\w\s*(?:\s*,\s*\b\w\s*)*\)

# matches of the form "where x = something is ..."
# where\s+([A-Za-zα-ωΑ-Ω0-9ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1,2}.+)\s+is\s+(.[^.,]+)



## II - Using custom NER :

In [56]:
# testing basic NER
import spacy

# Sample sentence containing a Greek symbol and a numeric value
text = "The value of epsilon is known as ε = 32 "

# Run the stock small English pipeline over the sample
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Print every entity the stock pipeline detects, together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

32 CARDINAL


In [64]:
# Define symbols to be tagged with custom POS tags
# Lookup tables used by `parsing_math_entities`: a token whose text is in
# `mathematical_symbols` is labelled MATH_SYMBOL, one in
# `mathematical_operations` is labelled MATH_OPP.
# Rows, in order: lowercase Greek, uppercase Greek, script / double-struck
# letters and other symbols.
# NOTE(review): several uppercase Greek letters (Γ, Δ, Λ, Ξ, Π, Φ, Ψ, Ω) appear
# to be listed twice; harmless for membership tests -- confirm before deduplicating.
mathematical_symbols = [
    'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
    'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω',
    'ℰ', 'ℓ', 'ℒ', 'ℳ', 'ø', 'ℂ', 'ℕ', 'ℙ', 'ℚ', 'ℝ', 'ℤ','Γ', 'Δ', 'Λ', 'Ξ', 'Π', 'Φ', 'Ψ', 'Ω','Å', 'ℏ', '∞',
    '∂', '∮', '∯', '∇','∅']

# Operator characters: integral/sum/product/root signs, arithmetic and
# comparison operators, and set-relation symbols.
mathematical_operations = ['∫', '∑', '∏', '√', '+', '-', '*', '/', '=', '^', '%','∩', '∪', '⊂', '⊆', '∈', '∉','∘']

In [73]:
from spacy.tokens import Span

# defining custom NER component
@spacy.Language.component("parsing_math_entities")
def parsing_math_entities(doc):
    """Add math-symbol and math-operator entities to ``doc``.

    Each token whose text is listed in ``mathematical_symbols`` becomes a
    one-token ``MATH_SYMBOL`` span, and each token listed in
    ``mathematical_operations`` becomes a one-token ``MATH_OPP`` span. The new
    spans are merged with the entities already on the document, passed through
    ``spacy.util.filter_spans`` and written back to ``doc.ents``.

    Returns the same ``doc`` object, as spaCy pipeline components must.
    """
    spans = [*doc.ents]
    for tok in doc:
        if tok.text in mathematical_symbols:
            tag = "MATH_SYMBOL"
        elif tok.text in mathematical_operations:
            tag = "MATH_OPP"
        else:
            # Not a math token: nothing to add for it.
            continue
        spans.append(Span(doc, tok.i, tok.i + 1, label=tag))

    # Filter the combined spans before assigning them as the document's entities.
    doc.ents = spacy.util.filter_spans(spans)
    return doc

In [77]:
# pandas builds the results table below; it is imported here because no
# other cell in the notebook imports it, so `pd` would be undefined on a
# fresh kernel.
import pandas as pd

# Example text
text = "The value of epsilon is known as ε = 32 "

# Loading the spaCy model (a fresh pipeline, so re-running this cell does not
# try to add the custom component twice)
nlp = spacy.load('en_core_web_sm')

# Adding the custom NER component to the pipeline
nlp.add_pipe("parsing_math_entities")

# Process the text with the updated pipeline
doc = nlp(text)

# One (text, label) row per detected entity
entities = [(ent.text, ent.label_) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Entity', 'Label'])
df

Unnamed: 0,Entity,Label
0,ε,MATH_SYMBOL
1,=,MATH_OPP
2,32,CARDINAL


In [78]:
def read(src, encoding="utf-8"):
    """Return the entire text content of the file at ``src``.

    Args:
        src: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters decode the same way on every platform
            instead of depending on the locale.

    Returns:
        The file content as a single string.
    """
    # The context manager closes the handle even if reading fails.
    with open(src, "r", encoding=encoding) as handle:
        return handle.read()

# Smoke-test the reader on one paper from the dataset and show its content
text = read(
    "/content/drive/MyDrive/UH - Final Year Project/Dataset/text/0001001v1.txt"
)
print(text)

0
0
0
2

n
a
J

1

]
n
y
d
-
u
l
f
.
s
c
i
s
y
h
p
[

1
v
1
0
0
1
0
0
0
/
s
c
i
s
y
h
p
:
v
i
X
r
a

Under consideration for publication in J. Fluid Mech.

1

Capillary-gravity wave transport over
spatially random drift

By G U I L L A U M E B A L∗ and T O M C H O U†
∗ Department of Mathematics, University of Chicago, Chicago, IL 60637
†Department of Mathematics, Stanford University, Stanford, CA 94305

(Received 2 February 2008)

We derive transport equations for the propagation of water wave action in the pres-
ence of a static, spatially random surface drift. Using the Wigner distribution W(x, k, t)
to represent the envelope of the wave amplitude at position x contained in waves with
wavevector k, we describe surface wave transport over static ﬂows consisting of two length
scales; one varying smoothly on the wavelength scale, the other varying on a scale com-
parable to the wavelength. The spatially rapidly varying but weak surface ﬂows augment
the characteristic equations with scat

In [89]:
# Run the pipeline (including the custom math component) over the paper text
doc = nlp(text)

# Tabulate every detected entity as an (Entity, Label) row
entities = list(map(lambda span: (span.text, span.label_), doc.ents))
df = pd.DataFrame(entities, columns=['Entity', 'Label'])
df

Unnamed: 0,Entity,Label
0,0,CARDINAL
1,0,CARDINAL
2,0,CARDINAL
3,2,CARDINAL
4,1,CARDINAL
...,...,...
2018,1992,DATE
2019,Kolgomorov Spectra,PERSON
2020,Wave Turbulence,PERSON
2021,Springer-Verlag,PERSON
