In [1]:
from pregex.core.quantifiers import Optional, OneOrMore, AtLeastAtMost
from pregex.core.classes import AnyFrom, AnyDigit, AnyWhitespace, AnyButWhitespace, Any
from pregex.core.tokens import Backslash
from pregex.core.operators import Either
from pregex.core.pre import Pregex

#### Capture URL

In [11]:
# use regex
import re

text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# Raw string: in a normal literal, "\s", "\." etc. are invalid *string* escapes
# (DeprecationWarning, and SyntaxWarning on Python 3.12+). ":" and "/" are not
# regex metacharacters, so they need no backslash at all.
# Pattern: optional http(s) scheme, non-whitespace run, ".com" or ".org",
# then at least one more non-whitespace character.
url_pattern = r"(?:https?://)?[^\s]+(?:\.com|\.org)[^\s]+"
re.findall(url_pattern, text)

['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

In [4]:
# use pregex
text = "You can find me through GitHub https://github.com/khuyentran1401"

# Build the same URL pattern from readable parts: a literal "https://",
# one or more non-whitespace characters, a ".com"/".org" suffix, and then
# one or more further non-whitespace characters.
non_space_run = OneOrMore(AnyButWhitespace())
tld = Either(".com", ".org")

pre = "https://" + non_space_run + tld + non_space_run
pre.get_matches(text)

['https://github.com/khuyentran1401']

In [5]:
# `Optional` is already imported in the notebook's first cell, so it is not
# re-imported here.
text = "You can find me through GitHub http://github.com/khuyentran1401"

# Optional("s") makes the pattern accept both "http://" and "https://".
pre = (
    "http"
    + Optional("s")
    + "://"
    + OneOrMore(AnyButWhitespace())
    + Either(".com", ".org")
    + OneOrMore(AnyButWhitespace())
)
pre.get_matches(text)

['http://github.com/khuyentran1401']

#### Match URL without a Scheme

In [6]:
text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# Wrapping the whole scheme in Optional lets URLs without "http(s)://"
# match as well.
scheme = "http" + Optional("s") + "://"
non_whitespace_chars = OneOrMore(AnyButWhitespace())

pre = Optional(scheme) + non_whitespace_chars + Either(".com", ".org") + non_whitespace_chars
pre.get_matches(text)

['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

In [7]:
# Show the regular-expression string that the composed `pre` object stands for.
url_regex = pre.get_pattern()
url_regex

'(?:https?:\\/\\/)?\\S+(?:\\.com|\\.org)\\S+'

#### Capture Time

In [12]:
# A lone AnyDigit matches one digit at a time, so every digit in the text
# comes back as its own match.
text = "It is 6:00 pm now"
pre = AnyDigit()
pre.get_matches(text)

['6', '0', '0']

In [13]:
# One digit, a literal colon, one digit: only a single digit is allowed on
# each side of the colon.
single_digit = AnyDigit()
pre = single_digit + ":" + single_digit
pre.get_matches(text)

['6:0']

In [14]:
# OneOrMore lets each side of the colon span several digits.
digits = OneOrMore(AnyDigit())
pre = digits + AnyFrom(":") + digits
pre.get_matches(text)

['6:00']

#### Capture Phone Numbers

In [15]:
text = "My phone number is 3452352312 or 345-235-2312 or 345 235 2312 or 345.235.2312"

# Digit groups may be joined by a dash, a space, a dot, or nothing at all.
separator = Optional(AnyFrom("-", " ", "."))
digit_group = OneOrMore(AnyDigit())

pre = digit_group + separator + digit_group + separator + digit_group
pre.get_matches(text)

['3452352312', '345-235-2312', '345 235 2312', '345.235.2312']

In [16]:
text = "My phone number is 3452352312 or 345-235-2312 or (345) 235-2312 or 345 235 2312 or 345.235.2312"

separator = Optional(AnyFrom("-", " ", "."))
digit_group = OneOrMore(AnyDigit())

# Optional parentheses around the first group allow the "(345) 235-2312" form.
first_group = Optional("(") + digit_group + Optional(")")

pre = first_group + separator + digit_group + separator + digit_group
pre.get_matches(text)

['3452352312',
 '345-235-2312',
 '(345) 235-2312',
 '345 235 2312',
 '345.235.2312']

#### Capture Email Address

In [18]:
text = "My email is abcd@gmail.com"

# Both sides of "@" are limited to non-whitespace characters. Any() would also
# match spaces, so in a longer sentence the domain part could greedily run on
# to a later ".com"/".org"/".io"/".net" and swallow the words in between.
pre = (
    OneOrMore(AnyButWhitespace())
    + "@"
    + OneOrMore(AnyButWhitespace())
    + Either(".com", ".org", ".io", ".net")
)

pre.get_matches(text)

['abcd@gmail.com']