# Regular Expressions

In [5]:
import re
# This notebook uses three of the re module's matching functions:
# (1) re.match matches the pattern only at the beginning of the string.
# (2) re.search matches the pattern anywhere in the string, but only 
# retrieves the first occurrence. 
# (3) re.findall finds all non-overlapping matches in the string and returns them as a list.

In [6]:
# re.match tests the pattern only at the start of each string; here the
# pattern is a single lowercase letter or digit.
# On success, the printed Match object shows where the match occurred (span)
# and the matched substring (match).
l = ['data science', '1datascine123', '123', '*3az']
for e in l:
    match = re.match("[a-z0-9]", e)
    if match is None:
        # re.match returns None when the first character does not fit the pattern
        print(e, "does not match")
        continue
    print('STRING:', e, 'MATCH:', match)

STRING: data science MATCH: <re.Match object; span=(0, 1), match='d'>
STRING: 1datascine123 MATCH: <re.Match object; span=(0, 1), match='1'>
STRING: 123 MATCH: <re.Match object; span=(0, 1), match='1'>
*3az does not match


In [None]:
# Anchored matching with re.match: does the string begin with a lowercase
# letter or a digit?
# When it does, the Match object's repr reports the position of the match
# (span) and the text that matched (match).
l = ['data science', '1datascine123', '123', '*3az']
for e in l:
    match = re.match("[a-z0-9]", e)
    if not match:
        print(e, "does not match")
    else:
        print('STRING:', e, 'MATCH:', match)

In [3]:
# using search: re.search scans the whole string and reports the first
# position where the pattern fits, so a string whose first character does not
# fit (such as '*3az') can still produce a match further in.
l = ['data science', '1datascine123', '123', '*3az']
for e in l:
    match = re.search("[a-z0-9]", e)
    if not match:
        print(e, "does not match")
    else:
        print('STRING:', e, 'MATCH:', match)

STRING: data science MATCH: <re.Match object; span=(0, 1), match='d'>
STRING: 1datascine123 MATCH: <re.Match object; span=(0, 1), match='1'>
STRING: 123 MATCH: <re.Match object; span=(0, 1), match='1'>
STRING: *3az MATCH: <re.Match object; span=(1, 2), match='3'>


In [4]:
# Cleaner output: print only the matched text, via group(0), instead of the
# whole Match object.
# Different pattern: a lowercase letter immediately followed by a digit,
# found anywhere in the string (only the first occurrence is returned).
l = ['data sc2ienc5e', '2datascine123', '123']
for e in l:
    match = re.search("[a-z][0-9]", e)
    if match is None:
        print(e, "does not match")
        continue
    print('STRING:', e, 'MATCH:', match.group(0))

STRING: data sc2ienc5e MATCH: c2
STRING: 2datascine123 MATCH: e1
123 does not match


In [22]:
# using findall: every occurrence of "lowercase letter followed by a digit"
# is collected into a list; a string with no occurrence yields an empty list.
l = ['data sc2ienc5e', '1datascine123', '123']
for e in l:
    match = re.findall("[a-z][0-9]", e)
    print('STRING:', e, 'MATCH:', match)

STRING: data sc2ienc5e MATCH: ['c2', 'c5']
STRING: 1datascine123 MATCH: ['e1']
STRING: 123 MATCH: []


In [23]:
# Pattern: a 'B' at the start, then ':', then one or more non-whitespace
# characters (\S+), then a closing ':'.
# + is greedy: \S+ takes as many characters as it can and gives back only
# what is needed for the final ':' to match.
# The r prefix makes this a raw string, so Python passes the backslash in \S
# through to the regex engine untouched.
l2 = ['Boy is ', 'B:ddd:', 'C:ddd:', 'B:ddd ', 'B:ddd:xxx']
for e in l2:
    match = re.match(r'^B:\S+:', e)
    if match is None:
        print(e, 'no match')
        continue
    print('STRING:', e, 'MATCH:', match.group(0))

Boy is  no match
STRING: B:ddd: MATCH: B:ddd:
C:ddd: no match
B:ddd  no match
STRING: B:ddd:xxx MATCH: B:ddd:


In [24]:
# Non-greedy match: the ? after + makes \S+ take as few characters as it can.
# This pattern has no trailing colon, so nothing forces \S+? to extend and it
# stops after a single character.
l2 = ['Boy is ', 'B:ddd:', 'C:ddd:', 'B:ddd ']
for e in l2:
    match = re.match(r'^B:\S+?', e)
    if not match:
        print(e, 'no match')
    else:
        print('STRING:', e, 'MATCH:', match.group(0))

Boy is  no match
STRING: B:ddd: MATCH: B:d
C:ddd: no match
STRING: B:ddd  MATCH: B:d


In [80]:
# with files
# Read the entire file into one string. The with-block closes the file handle
# as soon as the read finishes, even if reading raises an exception.
with open('../data/beatles_biography.txt', 'r') as f:
    filestring = f.read()

# Feed the file text into findall(); it returns a list of all the found strings.
# '.' does not match a newline by default, so each hit runs from 'India' to
# the end of that line.
strings = re.findall(r'India.*', filestring)
print(strings)
print()

# find occurrences of a year (e.g. 1974): any four consecutive digits
datestrings = re.findall(r'[0-9]{4}', filestring)
print(datestrings)
print()

# find years with some context before and after:
# a word, one whitespace character, four digits, then '.', ',' or whitespace,
# optional further whitespace, and an optional following word
# note this pattern is not able to pull up all instances of years
# can you change the pattern to include more years?
datestrings = re.findall(r'[A-Za-z]+\s[0-9]{4}[.,\s]\s*[A-Za-z]*', filestring)
print(datestrings)

['India, the Beatles visited the Maharishi Mahesh Yogi in India.']

['1960', '1943', '2001', '1940', '1980', '1942', '1940', '1990', '1935', '1977', '1936', '1959', '1932', '1956', '1957', '1960', '1962', '1963', '1964', '1964', '1964', '1965', '1965', '1966', '1967', '1967', '1968', '1968', '1968', '1969', '1933', '1970', '1969', '1940', '1970', '1971', '1970', '1971', '1970', '1980', '1990', '2001', '1988']

['in 1956. Paul', 'in 1957. Fourteen', 'in 1960.\nWhen', 'January 1964, hit', 'in 1964 the', 'July 1965. It', 'and 1966 albums', 'In 1968 they', 'July 1968. A', 'of 1968 and', 'before 1970 impossible', 'in 1970. The', 'in 1971, got', 'of 1970 all', 'In 1971 McCartney', 'in 1988. Lennon']


# Tokenizing and Stemming

In [2]:
# Calling nltk.download() with no arguments will open a separate window where
# you have to select the packages to download.
# Select all.
# NOTE(review): because this call is interactive, it presumably stalls an
# unattended "Restart & Run All" until the window is closed. Consider passing
# explicit package names instead (e.g. nltk.download('punkt')) -- TODO confirm
# which packages the rest of the notebook needs before narrowing it.
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Split a two-line sentence into word and punctuation tokens.
# The second line of the string must not start with '... ': that is the
# interactive interpreter's continuation prompt, not part of the sentence, and
# the token list displayed for this cell contains no '...' entry.
sentence = """At eight o'clock on Thursday morning
Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
# bare expression so the notebook displays the token list
tokens

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [21]:
# Tokenizing with a regular expression: each run of one or more ASCII letters
# becomes a token, so the spaces and the final period are left out.
pattern = r'[A-Za-z]+'
text = 'Rutgers offers an MS in Data Science.'
nltk.regexp_tokenize(text, pattern)

['Rutgers', 'offers', 'an', 'MS', 'in', 'Data', 'Science']

In [30]:
# Reduce each word to its stem with the Porter stemmer.
# Import only the class that is used rather than `import *`, so it is clear
# where PorterStemmer comes from and no other names are pulled into the notebook.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# note: a list literal can span multiple lines on its own -- the [ ] brackets
# already allow the line breaks, so no extra ( ) are needed around it
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
           'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer', 'plotted']
singles = [stemmer.stem(plural) for plural in plurals]
# print all stems on one line, separated by single spaces
print(' '.join(singles))

caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot
