# Feature Extraction :

## I - Using Weak Labels :

In [None]:
import re

# Patterns are compiled once when the cell runs instead of on every call.
_VARIABLE_PATTERN = re.compile(r'\b[A-Za-z_][A-Za-z0-9_]*\b')
# Two capitalised words separated by a single whitespace character. It is
# applied to a pair of adjacent tokens, because a whitespace-split token can
# never contain the whitespace this pattern requires.
_NAME_PATTERN = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b')
# Optionally signed integer or decimal number.
_VALUE_PATTERN = re.compile(r'[-+]?\d+(?:\.\d+)?\b')


def generate_weak_labels(text):
    """Split ``text`` on whitespace and assign a weak label to each token.

    Labels, checked in this order:

    * ``B-NAME`` / ``I-NAME`` - the token and the one after it together look
      like two capitalised words (e.g. ``Isaac Newton``); the first gets
      ``B-NAME`` and the second ``I-NAME``.
    * ``B-VAR`` - the token starts with an identifier-like word.
    * ``B-VAL`` - the token starts with a number.
    * ``O`` - anything else.

    Args:
        text: Raw text to label.

    Returns:
        A ``(tokens, labels)`` tuple of two lists of equal length.
    """
    tokens = text.split()
    labels = []
    index = 0
    while index < len(tokens):
        token = tokens[index]
        has_next = index + 1 < len(tokens)
        if has_next and _NAME_PATTERN.match(f"{token} {tokens[index + 1]}"):
            # Consume both words of the name so the second one is not
            # re-labelled as a variable on the next iteration.
            labels.extend(["B-NAME", "I-NAME"])
            index += 2
            continue
        if _VARIABLE_PATTERN.match(token):
            labels.append("B-VAR")
        elif _VALUE_PATTERN.match(token):
            labels.append("B-VAL")
        else:
            labels.append("O")
        index += 1
    return tokens, labels

# Weakly label every extracted PDF text and collect one record per document.
annotated_data = []
for pdf in pdf_texts:
    tokens, labels = generate_weak_labels(pdf['text'])
    annotated_data.append(dict(tokens=tokens, labels=labels))

In [None]:
# Building blocks for sentences of the form
# "<symbol> is/are ... called <definition>".
# They are plain raw strings, so quantifiers use single braces, and they are
# joined by concatenation below.

# variable match: a Latin or Greek letter at a word boundary, lazily extended
# to the next word boundary
var_match = r"\b[A-Za-zα-ωΑ-Ω]{1}.*?\b"
# linking verbs
lin_ver = r"\b(is|are)\b"
# passive constructions (passive verbs)
pass_cons = r".*?(called|defined as|used for|termed)\s"
# definition verbs (kept for later experiments; not part of `pattern` below)
def_ver = r"(represents|denotes|stands for|refers to|corresponds to)"
# start/end of phrase: capture lazily up to the next newline, comma or period
eop = r"(.*?)(?=\n|,|\.)"


text2 = "The sample space S is an event, called the certain event, and the empty"
# Compose the pattern from the parts above so the two cannot drift apart.
pattern = var_match + lin_ver + pass_cons + eop
matches = re.findall(pattern, text2)
matches

[('is', 'called', 'the certain event')]

In [2]:
import re
# Define variables for each part of the regex pattern
math_opp = "ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅"
# One or more Latin/Greek letters, digits or math symbols (captured).
variable = fr'([A-Za-zα-ωΑ-Ω0-9{math_opp}]+)'
verb = r'\s+(is|are)\s+'
action = r'(called|defined as|used for|termed)\s+'
# The definition must be followed by a comma, whitespace, period or newline.
end = r'(?=,|\s|\.|\n)'

# Construct the regex pattern using an f-string
pattern = fr"{variable}{verb}{action}{variable}{end}"

# Sample text. It is defined in this cell so the cell runs on a fresh kernel
# instead of depending on a `text` left over from another cell.
text = """
v is called velocity, which is a vector quantity.
s are termed Displacement, and it measures the change in position.
a is called Acceleration, and it measures the rate of change of velocity.
"""

# Find all matches; each match is (variable, verb, action, definition)
matches = re.findall(pattern, text)

# Print out the matches
for match in matches:
    print(f"Variable: {match[0]}, Definition: {match[3]}")

In [None]:
# Capture group for a symbol token: one or more Latin/Greek letters, digits
# or math symbols. Stored as a raw string so the cell is valid Python.
VARIABLE_TOKEN_RE = r"([A-Za-zα-ωΑ-Ω0-9ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]+)"

In [None]:
# A Latin word, whitespace, then a single-character symbol (captured) that is
# followed by whitespace, e.g. "velocity v ". Stored as a raw string so the
# cell is valid Python.
WORD_THEN_SYMBOL_RE = r"[A-Za-z]+\s+([A-Za-zα-ωΑ-ΩℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1})\s+"

In [None]:
# variable-value: a one- or two-character symbol, "=", then an integer or
# decimal number surrounded by whitespace (the number and its whitespace are
# captured). Raw string, so "\b" is a word boundary rather than a backspace.
VARIABLE_VALUE_RE = r"\b[A-Za-zα-ωΑ-ΩℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1,2}\s+=(\s+\d+\s+|\s+\d+\.\d+\s+)"
# NOTE(review): the notebook does not say what this one is for; it reads as
# "a word, then a one- or two-character token, then whitespace" - confirm.
WORD_THEN_SHORT_TOKEN_RE = r"\w+\s+(\w{1}|\w{1}(.))\s+"

In [None]:
# variables in the form of "The Wigner distribution W(x, k, t)"
#(?:[.,]\s*)?\b\w+\s+\w+\s+\w+\s+\w+\s*\(\s*\b\w\s*(?:\s*,\s*\b\w\s*)*\)

# matches phrases of the form "where x is something"
# where\s+([A-Za-zα-ωΑ-Ω0-9ℰℓℒℳøℂℕℙℚℝℤΓΔΛΞΠΣΦΨΩÅℏ∞∘∂∫∮∯∇αβγ∅]{1,2}.+)\s+is\s+(.[^.,]+)



## II - Using POS Tagging :

In [15]:
import spacy
import pandas as pd
# Loaded spaCy pipelines keyed by model name, so repeated calls do not pay
# the model-loading cost again.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def process_text(text, nlp=None):
    """Parse ``text`` and tabulate each token's dependency relation.

    Args:
        text: The text to parse.
        nlp: Optional callable that turns ``text`` into an iterable of tokens
            exposing ``text``, ``dep_`` and ``head.text``. When omitted, the
            ``en_core_web_sm`` spaCy pipeline is used (loaded once and reused).

    Returns:
        A pandas DataFrame with one row per token and the columns
        ``Token``, ``Dependency`` and ``Head`` (the head token's text).
    """
    if nlp is None:
        nlp = _get_pipeline()

    # Process the text with the pipeline
    doc = nlp(text)

    # Materialise once so the three columns are built from the same tokens
    parsed = list(doc)

    # Create a DataFrame with one column per token attribute
    df = pd.DataFrame({
        'Token': [token.text for token in parsed],
        'Dependency': [token.dep_ for token in parsed],
        'Head': [token.head.text for token in parsed]
    })

    return df

In [19]:
# Sample phrase mixing prose with a function-style symbol and a numeric value.
text = "the velocity f(x) = 20"

# Tabulate each token with its dependency label and the text of its head token;
# the bare `df` on the last line displays the table as the cell output.
df = process_text(text)
df

Unnamed: 0,Token,Dependency,Head
0,the,det,f(x
1,velocity,compound,f(x
2,f(x,nsubj,=
3,),punct,f(x
4,=,ROOT,=
5,20,attr,=


In [26]:
import spacy

# Load the English NLP pipeline
nlp = spacy.load('en_core_web_sm')

# Example sentence: a minimal equation, used to see how the tagger treats
# single-letter symbols and "="
sentence = "a = b"

# Process the sentence with spaCy
doc = nlp(sentence)

# Initialize a set to store identified variables
# NOTE(review): nothing in this cell adds to this set.
variables = set()

# Iterate over tokens in the processed document and print each token next to
# its coarse part-of-speech tag
for token in doc:
    print(token , token.pos_ ,)

a DET
= PROPN
b NOUN


In [30]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK resources if not already downloaded
# ('punkt' and 'averaged_perceptron_tagger' are the two packages fetched here)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Example text: a short sentence containing a math operator ("/")
text = "we have a / b"

# Tokenize the text into sentences and words
# Only the first sentence is tagged below.
sentences = sent_tokenize(text)
words = word_tokenize(sentences[0])

# Perform POS tagging on the whole word list at once
tagged_words = nltk.pos_tag(words)

# Print the tagged words as a list of (word, tag) pairs
print(tagged_words)

[('we', 'PRP'), ('have', 'VBP'), ('a', 'DT'), ('/', 'NN'), ('b', 'NN')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [39]:
import nltk
from nltk.tokenize import word_tokenize


# Characters that receive the custom MATH_SYMBOL tag, one row per family.
# Several capital Greek letters appear twice (once in the alphabet row and
# once in the shorter row after the script/blackboard letters); the repeats
# are kept so the list is unchanged.
symbols = [
    # lowercase Greek
    'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
    'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
    # uppercase Greek
    'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ',
    'Ν', 'Ξ', 'Ο', 'Π', 'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω',
    # script / blackboard letters
    'ℰ', 'ℓ', 'ℒ', 'ℳ', 'ø', 'ℂ', 'ℕ', 'ℙ', 'ℚ', 'ℝ', 'ℤ',
    # frequently used capital Greek letters (repeats)
    'Γ', 'Δ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω',
    # constants and calculus symbols
    'Å', 'ℏ', '∞', '∘', '∂', '∫', '∮', '∯', '∇',
    # empty set
    '∅',
]

# Operator characters that receive the custom MATH_OPP tag.
operations = ['+', '-', '*', '/', '=', '^', '%']

# Lookup tables from token to custom POS tag.
mathematical_pos_tags = dict.fromkeys(symbols, 'MATH_SYMBOL')
opp_pos_tag = dict.fromkeys(operations, 'MATH_OPP')

In [40]:
def math_pos_tagger(text):
    """Tag the tokens of ``text``, using custom tags for math symbols and operators.

    Tokens found in ``mathematical_pos_tags`` or ``opp_pos_tag`` get the tag
    stored there; every other token is tagged by ``nltk.pos_tag``. Each such
    token is passed to the tagger on its own, so it is tagged without the
    surrounding sentence.

    Returns:
        A list of ``(token, tag)`` tuples in token order.
    """
    result = []
    for word in word_tokenize(text):
        if word in mathematical_pos_tags:
            tag = mathematical_pos_tags[word]
        elif word in opp_pos_tag:
            tag = opp_pos_tag[word]
        else:
            # Single-item call: take the tag of the only (word, tag) pair.
            tag = nltk.pos_tag([word])[0][1]
        result.append((word, tag))

    return result

In [41]:
# Perform custom POS tagging on the example text
# (the sentence contains a Greek symbol and the "=" operator)
text = "And we have ε(x) = 5"
tagged_words = math_pos_tagger(text)

# Print the tagged words as a list of (token, tag) pairs
print(tagged_words)

[('And', 'CC'), ('we', 'PRP'), ('have', 'VB'), ('ε', 'MATH_SYMBOL'), ('(', '('), ('x', 'NN'), (')', ')'), ('=', 'MATH_OPP'), ('5', 'CD')]
