-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCh11Notes.py
174 lines (125 loc) · 5 KB
/
Ch11Notes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Chapter 11 Notes: Regular Expressions
## need to import re before using reg. expressions
##### 11.1 #####
# Search for lines that contain 'From:' anywhere in the line
import re  ## don't forget this

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        if re.search('From:', line):  ## like a find function: matches anywhere in the line
            print(line)
# Search for lines that start with 'From:'
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        if re.search('^From:', line):  ## the ^ anchor turns this into a 'starts with'
            print(line)
# Search for lines that start with 'F', followed by
# 2 characters, followed by 'm:'
# using the period or '.' (each '.' matches any single character)
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        if re.search('^F..m:', line):
            print(line)
# Search for lines that start with 'From:' and have an at sign
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # starts with From:, has one or more characters after it (the '.+')
        # and then contains @; re.search needs the string to scan as its
        # second argument
        if re.search('^From:.+@', line):
            print(line)
## the + is one or more, * is 0 or more
## the + and * are 'pushy' (greedy), meaning that they will 'push' to the last possible character, i.e. if there are multiple @
## in a line, they will not stop until they get to the last one (possible to turn off, but complicated :-))
##### 11.2 #####
import re

t = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'
# Raw string (r'...') so Python passes the backslash in \S to the regex
# engine unchanged instead of treating it as a string escape.
lst = re.findall(r'\S+@\S+', t)  ## \S+ = one or more non-whitespace characters
print(lst)
## \S+ matches one or more non-whitespace characters
## note that since there is no non-whitespace character before @2PM, it does not get picked up
# Print every email-like substring found on each line of the file
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # raw string keeps the \S escapes intact for the regex engine
        x = re.findall(r'\S+@\S+', line)
        if x:  ## note that you need this so you don't just end up printing an empty list for every line
            print(x)
# Same email search, but the match must begin with a letter or digit and
# end with a letter, so punctuation around an address is left out
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # raw string keeps the \S escapes intact for the regex engine
        x = re.findall(r'[a-zA-Z0-9]\S*@\S*[a-zA-Z]', line)
        if x:  # skip lines with no match
            print(x)
##### 11.3 #####
# Find lines that start with 'X' and end up at ': ' followed by a number
# made of digits and periods; the whole matched text is returned
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # raw string keeps the \S escape intact for the regex engine
        # NOTE(review): '\S.*' allows spaces before the colon; the next example
        # in these notes uses '\S*' -- confirm which one was intended
        x = re.findall(r'^X\S.*: [0-9.]+', line)  ## note that inside the [] the period is just a period
        if x:  # skip lines with no match
            print(x)
# Same kind of 'X...: number' search, but only the number is extracted
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # raw string keeps the \S escape intact for the regex engine
        x = re.findall(r'^X\S*: ([0-9.]+)', line)  ## using the () we are telling python that we just want that part
        if x:  # skip lines with no match
            print(x)
# On lines that start with 'From ', extract the two digits that come right
# after a space and right before a colon (e.g. the '09' in
# 'From someone@example.com Sat Jan 5 09:14:16 2008')
import re

# 'with' closes the file when the loop ends, even if an error is raised
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # strip trailing whitespace, including the newline
        # the () keep only the two digits out of the whole match
        x = re.findall('^From .* ([0-9][0-9]):', line)
        if x:  # skip lines with no match
            print(x)
##### 11.4 Escape Character #####
import re

x = 'We just received $10.00 for cookies.'
# '$' normally means end-of-line, so it is escaped as \$ to match a literal
# dollar sign; the raw string passes that backslash to the regex engine unchanged.
y = re.findall(r'\$[0-9.]+', x)
# Consolidated grouping of all regular expression characters:
# ^ Matches the beginning of the line.
# $ Matches the end of the line.
# . Matches any character (a wildcard).
# \s Matches a whitespace character.
# \S Matches a non-whitespace character (opposite of \s).
# * Applies to the immediately preceding character(s) and indicates to match zero or more times.
# *? Applies to the immediately preceding character(s) and indicates to match zero or more times in “non-greedy mode”.
# + Applies to the immediately preceding character(s) and indicates to match one or more times.
# +? Applies to the immediately preceding character(s) and indicates to match one
# or more times in “non-greedy mode”.
# \b Matches the empty string, but only at the start or end of a word.
# \B Matches the empty string, but not at the start or end of a word.
# \d Matches any decimal digit; equivalent to the set [0-9].
# \D Matches any non-digit character; equivalent to the set [^0-9].
# ? Applies to the immediately preceding character(s) and indicates to match zero or one time.
# ?? Applies to the immediately preceding character(s) and indicates to match zero or one time in “non-greedy mode”.
# [aeiou] Matches a single character as long as that character is in the specified set. In this example, it would match “a”, “e”, “i”, “o”, or “u”, but no other characters.
# [a-z0-9] You can specify ranges of characters using the minus sign. This example is a single character that must be a lowercase letter or a digit.
# [^A-Za-z] When the first character in the set notation is a caret, it inverts the logic. This example matches a single character that is anything other than an uppercase or lowercase letter.
### Square Brackets ####
import re

x = 'My 2 favorite numbers are 19 and 42'

# A bracketed set matches one character from the set; '+' stretches it into a run,
# so this collects each run of digits.
y = re.compile('[0-9]+').findall(x)
print(y)

# Same idea with the set of uppercase vowels; the result is stored but not printed.
y = re.compile('[AEIOU]+').findall(x)
# Greedy vs. non-greedy: adding '?' after '+' makes the match stop at the
# first ':' it can, instead of pushing on to the last one.
import re

x = 'From: Using the : character'
# The pattern has no groups, so each match's full text is what findall would return.
y = [match.group() for match in re.finditer('^F.+?:', x)]
print(y)
# Extract the domain of the sender address using only string splitting.
line = 'From Stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'
words = line.split()  # whitespace-separated fields of the line
# the second field is the address; splitting it on '@' separates user and domain
email, pieces = words[1], words[1].split('@')
print(pieces[1])
# Extract the same domain with one regular expression: after an '@',
# capture every character up to the next space.
import re

line = 'From Stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'
# With a single group in the pattern, the group's text is what findall would return.
y = [match.group(1) for match in re.finditer('@([^ ]*)', line)]
print(y)
print(type(y))
#