In [1]:
import re

import pandas as pd

```
## Metacharacters are characters with a special meaning:
[]	A set of characters	      "[a-m]"

\	Signals a special sequence (can also be used to escape special characters)	"\d"

.	Any character (except newline character)	"he..o"

^	Starts with	"^hello"


$	Ends with	"world$"

*	Zero or more occurrences	"aix*"

+	One or more occurrences	"aix+"

{}	Exactly the specified number of occurrences	"al{2}"


|	Either or	"falls|stays"

()	Capture and group

```

```
[^arn]	Returns a match for any character EXCEPT a, r, and n


[0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present


[0-9]	Returns a match for any digit between 0 and 9

[0-5][0-9]	Returns a match for any two-digit number from 00 to 59

[a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case


[+]	In sets, +, *, ., |, (), $ and {} have no special meaning, so [+] means: return a match for any + character in the string

```

In [2]:
# findall() returns every non-overlapping match, in order, as a list of strings.
txt = "The rain in Spain"
x = re.compile("ai").findall(txt)
print(x)

['ai', 'ai']


In [3]:
# When the pattern never occurs, the list of matches is simply empty.
txt = "The rain in Spain"
x = [hit.group() for hit in re.finditer("ais", txt)]
print(x)

[]


In [4]:
# Cut the text at every 'T', '!' or ':'; the delimiters themselves are dropped.
Words = 'the quick brownTfox runs super! fast: '
delimiter = re.compile('T|!|:')
split = delimiter.split(Words)
print(split)

['the quick brown', 'fox runs super', ' fast', ' ']


In [5]:
# A range inside a character class: any single lowercase letter from 'a' to 'e'.
p = re.compile('[a-e]')
sentence = "Aye, said Mr. Gibenson Stark"
p.findall(sentence)

['e', 'a', 'd', 'b', 'e', 'a']

In [6]:

# \d matches one decimal digit. For str patterns it also accepts non-ASCII
# Unicode digits, so it equals [0-9] exactly only for ASCII text (or with
# re.ASCII). The pattern is a raw string so Python does not try to read "\d"
# as a string escape (the non-raw form is an invalid escape and warns).
p = re.compile(r'\d')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

['1', '1', '4', '1', '8', '8', '6']


In [11]:
# \d+ matches a run of one or more consecutive digits, so each number is
# returned whole. The explicit class [0-9]+ gives the same result on this
# ASCII text. Raw string avoids the invalid "\d" string escape.
p = re.compile(r'\d+')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))
p = re.compile('[0-9]+')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

['11', '4', '1886']
['11', '4', '1886']


In [9]:
# \w matches one "word" character: a letter, a digit or the underscore.
# For str patterns this includes non-ASCII letters and digits, so it is the
# same as [a-zA-Z0-9_] only for ASCII text (or when re.ASCII is set).
# Raw string avoids the invalid "\w" string escape.
p = re.compile(r'\w')
print(p.findall("He said * in some_lang."))
p = re.compile('[a-zA-Z0-9_]')
print(p.findall("He said * in some_lang."))

['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']
['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']


In [13]:
# \w+ matches a run of one or more word characters (letters, digits,
# underscore), i.e. whole tokens; punctuation and spaces act as separators.
# Raw string avoids the invalid "\w" string escape.
p = re.compile(r'\w+')
print(p.findall("I went to him at 11 A.M., he said *** in some_language."))

# Same tokens using the explicit ASCII character class.
p = re.compile('[a-zA-Z0-9_]+')
print(p.findall("I went to him at 11 A.M., he said *** in some_language."))

['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'he', 'said', 'in', 'some_language']
['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'he', 'said', 'in', 'some_language']


In [14]:
# \W is the complement of \w: it matches any single character that is not a
# letter, digit or underscore (note that '_' is therefore NOT matched).
# Raw string avoids the invalid "\W" string escape.
p = re.compile(r'\W')
print(p.findall("he said *** in some_language."))

[' ', ' ', '*', '*', '*', ' ', ' ', '.']


In [16]:
# '*' means "zero or more of the preceding item": 'ab*' matches an 'a'
# followed by any number of 'b' characters (including none).
p = re.compile('ab*')
sample = "ababbaabbb"
print(p.findall(sample))

['ab', 'abb', 'a', 'abbb']


In [17]:
# Split on every run of non-word characters (spaces, commas, colons, ...).
# Uses re.split directly instead of a mid-notebook `from re import split`,
# which would rebind the name `split` that an earlier cell uses for a list.
# Raw string avoids the invalid "\W" string escape.
tokens = re.split(r'\W+', 'On 12th Jan 2016, at 11:02 AM')
print(tokens)

['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']


In [18]:
# With re.IGNORECASE the pattern 'ub' matches both the 'ub' in "Subject" and
# the 'Ub' in "Uber"; every match is replaced by '~*'.
masked = re.sub('ub', '~*', 'Subject has Uber booked already', flags=re.IGNORECASE)
print(masked)

S~*ject has ~*er booked already


In [37]:
# The class [$_] matches a literal '$' or '_' ('$' is not an anchor inside a
# set); every such character is deleted.
pattern=r'[$_]'
result = re.compile(pattern).sub('', '$4521$_')
result

'4521'

In [43]:
# Alternative to deleting symbols: extract the digit runs and take the first.
pattern=re.compile('[0-9]+')
result=[hit.group() for hit in pattern.finditer('$4521$_')]
result[0]

'4521'

In [25]:
# Phone number shaped as 3 digits, hyphen, 3 digits, hyphen, 4 digits.
# Raw string avoids invalid string escapes; '-' needs no backslash outside a
# character class, so the redundant '\-' escapes are dropped.
phone = re.compile(r'\d{3}-\d{3}-\d{4}')
# Collect every phone-number-shaped substring in the sentence.
result = phone.findall("my phone number is 123-456-7890 pleas call me.")
result

['123-456-7890']

In [26]:
# Find the numeric values: every run of one or more digits in the sentence.
# Raw string avoids the invalid "\d" string escape.
matches = re.findall(r'\d+', 'Smoothie ingredients: 3 bananas and 2 strawberries')

# Print the matches
print(matches)

['3', '2']


In [27]:
# re.escape() backslash-escapes characters that could be special in a
# pattern (here: the spaces), so the text can be matched literally.
escaped = re.escape("This is Awseome even 1 AM")
print(escaped)

This\ is\ Awseome\ even\ 1\ AM


In [30]:
org_string = "This is a sample string"
pattern = r's'
# Delete every lowercase 's' by substituting it with the empty string.
mod_string = re.compile(pattern).sub('', org_string)
print(mod_string)

Thi i a ample tring


In [31]:
org_string = "This is a sample string"
pattern = r'[sai]'
# Drop every 's', 'a' and 'i': split on those characters and glue the
# remaining pieces back together.
mod_string = ''.join(re.split(pattern, org_string))
print(mod_string)

Th   mple trng


In [38]:
# Remove every ASCII letter; digits, punctuation and spaces are left as-is.
pattern=r'[a-zA-Z]'
letters = re.compile(pattern)
result = letters.sub('', 'this is tupple (45,41) pleas write.')
result

'   (45,41)  .'

In [49]:
# Sample price strings polluted with currency symbols, thousands separators
# and free text. pandas is imported once in the notebook's top import cell.
df = pd.DataFrame(
    {'Price': ['$40,000*', '$40000 conditions attached', '$7841', '$89562', '$21315$']}
)
print(df)

                        Price
0                    $40,000*
1  $40000 conditions attached
2                       $7841
3                      $89562
4                     $21315$


In [51]:
 # Strip every non-digit character, then cast to integers. regex=True is
 # required: since pandas 2.0, Series.str.replace treats the pattern as a
 # literal string by default, which would leave the prices untouched and make
 # the integer cast fail.
 df['Price'].str.replace(r'\D+', '', regex=True).astype('int')

0    40000
1    40000
2     7841
3    89562
4    21315
Name: Price, dtype: int64

In [56]:
def clean_text(text):
    """Return ``text`` with every run of non-digit characters removed."""
    non_digits = re.compile(r"\D+")
    return non_digits.sub('', text)
# Same clean-up as the vectorised .str.replace, done element-wise with a
# plain Python function, then cast to 64-bit integers.
price_digits = df['Price'].apply(clean_text)
price_digits.astype('int64')

0    40000
1    40000
2     7841
3    89562
4    21315
Name: Price, dtype: int64

In [58]:
list_=['New England Hemophilia Association (NEHA)',
 'Los Gatos Education Foundation (LGEF)',
'#Fly',
'#GivingTuesday',
'See Ya Later',
 'Foundation Inc (SYL)',
'1 Million 4 Anna Foundation']
import pandas as pd
df=pd.DataFrame()
df['str']=list_
df

Unnamed: 0,str
0,New England Hemophilia Association (NEHA)
1,Los Gatos Education Foundation (LGEF)
2,#Fly
3,#GivingTuesday
4,See Ya Later
5,Foundation Inc (SYL)
6,1 Million 4 Anna Foundation


In [68]:
def clean_text(text):
    """Lower-case ``text``, expand a few contractions and strip punctuation.

    Steps, in order:
      1. Convert to lower case.
      2. Expand "i'm" -> "i am", "she's" -> "she is", "can't" -> "can not".
      3. Delete the SUB control character (0x1A), the Unicode replacement
         character (U+FFFD) and the punctuation characters
         - ( ) " # / @ ; : < > { } = ~ | . ? ,

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string. Letters, digits, whitespace and any character not
        listed above (for example ``!`` or ``'``) are kept.
    """
    text = text.lower()
    # Contractions must be expanded before punctuation is stripped.
    for contraction, expansion in (
        (r"i'm", "i am"),
        (r"she's", "she is"),
        (r"can't", "can not"),
    ):
        text = re.sub(contraction, expansion, text)
    # NOTE(review): the earlier version also ran a substitution whose pattern
    # was empty (a no-op as written) and computed an unused `result` that
    # stripped all lowercase letters; both are dropped. If the empty pattern
    # was meant to hold a control character lost in copy/paste, add it to the
    # class below.
    # One pass removes every unwanted character; deleting single characters
    # is order-independent, so this equals the previous chain of re.sub calls.
    return re.sub(r"[-()\"#/@;:<>{}=~|.?,\x1a\ufffd]", "", text)

In [69]:
# Normalise every name in the 'str' column with clean_text and show the frame.
cleaned_names = df['str'].apply(clean_text)
df['str'] = cleaned_names
df

Unnamed: 0,str
0,new england hemophilia association neha
1,los gatos education foundation lgef
2,fly
3,givingtuesday
4,see ya later
5,foundation inc syl
6,1 million 4 anna foundation


In [63]:
# Split the sentence at each single whitespace character.
# Raw string avoids the invalid "\s" string escape; `re` is already imported
# in the notebook's first cell.
txt = "The rain in Spain"
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [65]:

# Replace each whitespace character with an underscore.
# Raw string avoids the invalid "\s" string escape.
txt = "The rain in Spain"
x = re.sub(r"\s", "_", txt)
print(x)

The_rain_in_Spain


In [8]:
text = "The film Titanic was released in 1998"
# '.*' may match the empty string, so after consuming the whole line it
# matches once more (emptily) at the end -> a trailing '' in the result.
result = re.compile(r".*").findall(text)
result

['The film Titanic was released in 1998', '']

In [10]:
# '.+' needs at least one character, so there is no trailing empty match.
result = re.compile(r".+").findall(text)
result

['The film Titanic was released in 1998']

In [11]:
text = "1998 was the year when the film titanic was released"
# '^' anchors the pattern to the start of the string.
starts_with_year = re.search(r"^1998", text) is not None
print("Match found" if starts_with_year else "Match not found")

Match found


In [12]:
text = "1998 was the year when the film titanic was released"
# '$' anchors the pattern to the end of the string; the year is at the
# start here, so no match is expected.
ends_with_year = re.search(r"1998$", text) is not None
print("Match found" if ends_with_year else "Match not found")

Match not found


In [13]:
text = "The film Pulp Fiction was released in year 1994"
# Only lowercase ASCII letters are masked; capitals, digits and spaces stay.
lowercase = re.compile(r"[a-z]")
result = lowercase.sub("X", text)
print(result)

TXX XXXX PXXX FXXXXXX XXX XXXXXXXX XX XXXX 1994


In [14]:
# A character class (square brackets) lists several characters to match at
# once; here every listed punctuation mark is deleted. Inside a class
# "'", "." and "$" are literal, so no backslashes are needed, and a
# case-insensitive flag would have no effect on punctuation.
text = "The film, '@Pulp Fiction' was ? released _ in % $ year 1994."
result = re.sub(r"[,@'?.$%_]", "", text)
print(result)

The film Pulp Fiction was  released  in   year 1994


In [15]:
# Removing multiple spaces: collapse every run of whitespace into one space.
# (No flags needed: case-insensitivity is meaningless for \s.)
text = "The film      Pulp Fiction      was released in   year 1994."
result = re.sub(r"\s+", " ", text)
print(result)

The film Pulp Fiction was released in year 1994.


In [None]:
# Strip leading whitespace only: '^' pins the whitespace run to the start.
text = "         The film Pulp Fiction was released in year 1994"
leading_ws = re.compile(r"^\s+")
result = leading_ws.sub("", text)
print(result)

In [None]:
# Strip trailing whitespace only: '$' pins the whitespace run to the end.
text = "The film Pulp Fiction was released in year 1994      "
trailing_ws = re.compile(r"\s+$")
result = trailing_ws.sub("", text)
print(result)

In [16]:
# Remove stray single letters: a lone letter surrounded by whitespace is
# replaced, together with that whitespace, by a single space.
text = "The film Pulp Fiction     s was b released in year 1994"
lone_letter = re.compile(r"\s+[a-zA-Z]\s+")
result = lone_letter.sub(" ", text)
print(result)

The film Pulp Fiction was released in year 1994


In [17]:
# Split on commas; the space after each comma stays with the next piece.
text = "The film, Pulp Fiction, was released in year 1994"
comma = re.compile(r"\,")
result = comma.split(text)
print(result)

['The film', ' Pulp Fiction', ' was released in year 1994']


In [2]:
# Delete a parenthesised group: '\(' and '\)' are literal parentheses and
# '.*?' lazily matches as little as possible between them. The space before
# the parenthesis is not part of the match and therefore remains.
# (`re` is already imported in the notebook's first cell.)
text = "This is comany name (vishal)"
result = re.sub(r"\(.*?\)", "", text)
print(result)

This is comany name 


In [4]:
# Extract (rather than delete) each parenthesised group, brackets included.
text = "This is comany name (vishal)"
parenthesised = re.compile(r"\(.*?\)")
result = parenthesised.findall(text)
print(result)

['(vishal)']


In [7]:
# Remove text wrapped in double parentheses, delimiters included.
text = "India((from natural collection))"
double_paren = re.compile(r"\(\(.*?\)\)")
result = double_paren.sub("", text)
print(result)

India


In [8]:
# Remove text wrapped in '[(' ... ')]', delimiters included.
text = "India[(from natural collection)]"
bracket_paren = re.compile(r"\[\(.*?\)\]")
result = bracket_paren.sub("", text)
print(result)

India


In [29]:
# Collect every run of digits (the two price bounds) as strings.
text = "I want to buy a mobile between 200 and 400 euros"
result = [hit.group() for hit in re.finditer(r"\d+", text)]
print(result)

['200', '400']


In [62]:
# Capture only the text between '[[' and ']]'. re.S lets '.' also match a
# newline, so the inner text may span several lines.
s = 'foo [[bar]]); baz [[quz]]); not [[foobar]]'
matches = [hit.group(1) for hit in re.finditer(r'\[\[(.*?)]]', s, re.S)]
matches

['bar', 'quz', 'foobar']

In [96]:
# Without a capture group findall() returns the whole match, so the
# surrounding '[[' and ']]' are included in each item.
s = 'foo [[bar]]); baz [[quz]]); not [[foobar]]'
double_bracket = re.compile(r'\[\[.*?\]\]')
matches = double_bracket.findall(s)
matches

['[[bar]]', '[[quz]]', '[[foobar]]']

In [70]:
# With a single capture group findall() returns just the captured text,
# i.e. the content between '[[' and ']]'.
s = 'foo [[bar]]); baz [[quz]]); not [[foobar]]'
inner_text = re.compile(r'\[\[(.*?)\]\]')
matches = inner_text.findall(s)
matches

['bar', 'quz', 'foobar']