In [2]:
import re

# How to split a string separated by a regex?

In [8]:
# Sample course table: number, code and name separated by runs of whitespace.
text = """101 COM   Computers
205 MAT   Mathematics
189 ENG   English"""

# Split on every run of whitespace (spaces and newlines alike), which yields
# one flat list of tokens. The pattern is a raw string so that `\s` reaches the
# regex engine as written; in a plain literal `\s` is an invalid escape
# sequence that newer Python versions warn about.
# NOTE: `text` is rebound from the original string to the list of tokens.
text = re.split(r'\s+', text)
print(text)

['101', 'COM', 'Computers', '205', 'MAT', 'Mathematics', '189', 'ENG', 'English']


In [None]:
# NOTE(review): the output recorded for this cell is not the nine-token list
# that the split cell above stores in `text`; presumably it is left over from
# an earlier kernel state — re-run the notebook top to bottom to confirm.
print(text)

['101', '205', '189', '']


# Finding pattern matches using findall, search and match

**re.findall()**

In [9]:
# Course table; the goal is to collect every number that appears in it.
text = """101 COM    Computers
205 MAT   Mathematics
189 ENG   English"""

# Compile the pattern once so the resulting object can be reused.
# Raw string: `\d` must reach the regex engine untouched; in a plain literal it
# is an invalid escape sequence that newer Python versions warn about.
regex_num = re.compile(r'\d+')

# findall returns every non-overlapping match as a list of strings; leaving it
# as the last expression makes the notebook display the list.
regex_num.findall(text)

['101', '205', '189']

**re.search()**

In [None]:
# define the text
text2 = """205 COM    Computers MAT   Mathematics 189"""

# compile the regex and search the pattern
# Raw string: `\d` must reach the regex engine untouched; in a plain literal it
# is an invalid escape sequence that newer Python versions warn about.
regex_num = re.compile(r'\d+')

# search() stops at the first match only, so the trailing "189" is not
# reported. It would return None if no digits were present; this sample text
# always contains a match, so `s` is used directly.
s = regex_num.search(text2)

# start() is the index of the first matched character, end() is one past the
# last matched character.
print('Starting Position: ', s.start())
print('Ending Position: ', s.end())

Starting Position:  0
Ending Position:  3


# How to substitute one text with another using regex?

In [10]:
# Course table in which a tab character sits between the code and the name.
text = """101   COM \t  Computers
205   MAT \t  Mathematics
189   ENG  \t  English"""

# Swap each tab for an underscore through a compiled pattern, then show the
# rewritten table. Surrounding spaces are left exactly as they were.
text = re.compile('\t').sub('_', text)
print(text)

101   COM _  Computers
205   MAT _  Mathematics
189   ENG  _  English


In [None]:
text = "moch ari n"

# Replace each run of whitespace with a double underscore and print the result;
# `text` itself is left unchanged.
# Raw string: `\s` must reach the regex engine untouched; in a plain literal it
# is an invalid escape sequence that newer Python versions warn about.
print(re.sub(r'\s+', '__', text))

moch__ari__n


# Regex groups

In [None]:
# Sample course table: each line holds a course number, an upper-case
# three-letter code and a course name, separated by runs of spaces.
# The extraction cells that follow each define this same string again.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""  

**Extract all course numbers**

In [12]:
# Course table: number, code and name on every line.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# Each run of decimal digits in the table is one course number.
print(
    re.compile('[0-9]+').findall(text)
)

['101', '205', '189']


**Extract all course codes**

In [13]:
# Course table: number, code and name on every line.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# A course code is three consecutive upper-case ASCII letters. The course names
# only start with a single capital, so they never satisfy the pattern.
print(
    re.compile('[A-Z]{3}').findall(text)
)

['COM', 'MAT', 'ENG']


**Extract all course names**

In [14]:
# Course table: number, code and name on every line.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# A course name is a run of at least four ASCII letters; the three-letter
# codes are too short to match, and digits are excluded by the character class.
print(
    re.compile('[A-Za-z]{4,}').findall(text)
)

['Computers', 'Mathematics', 'English']


# Regex for text cleansing

In [None]:
# Sample free-form text for the cleansing examples: a few sentences with
# punctuation and quoted words, blank lines between paragraphs, and a URL on
# the last line.
text = """
To be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""

# Bare expression: the notebook shows the string's repr, so newlines appear as \n.
text

'\nTo be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.\n\nYou see, problems like this can easily be solved by proper tech. That is NOT the problem.\n\nThe problem is everything but tech?.\n\nhttps://lalalaa.com\n'

**Remove URL**

In [3]:
# Sample free-form text whose last line is a URL.
text = """
To be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""

# Delete every URL-like token: "http" plus all following non-whitespace
# characters. The newline after the URL is whitespace, so it is kept.
# Raw string: `\S` must reach the regex engine untouched; in a plain literal it
# is an invalid escape sequence that newer Python versions warn about.
text = re.sub(r'http\S+', '', text)

# Bare expression so the notebook displays the cleaned string's repr.
text

'\nTo be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.\n\nYou see, problems like this can easily be solved by proper tech. That is NOT the problem.\n\nThe problem is everything but tech?.\n\n\n'

**Remove emoticons and punctuation** (every non-alphanumeric character is replaced with a space)

In [4]:
# Sample free-form text, defined afresh here, so the URL on the last line is
# still present when the substitution runs.
text = """
To be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""

# Every character outside ASCII letters and digits (punctuation, quotes,
# newlines, symbols) is turned into one space. Characters are replaced
# one-for-one, so neighbouring punctuation leaves runs of several spaces.
text = re.compile(r'[^a-zA-Z0-9]').sub(' ', text)

# Bare expression so the notebook displays the resulting string's repr.
text

' To be fair  the app works to an extent  I can register and get my payment code etc  works  well  until the status takes days to update   You see  problems like this can easily be solved by proper tech  That is NOT the problem   The problem is everything but tech    https   lalalaa com '

In [None]:
# Trim leading and trailing whitespace from `text`, then display the result.
# NOTE(review): the output recorded for this cell contains no URL fragments,
# whereas the recorded output of the preceding substitution cell still ends
# with them; presumably this cell was last run against a different `text` —
# re-run the notebook top to bottom to confirm.
text = text.strip()
text

'To be fair  the app works to an extent  I can register and get my payment code etc  works  well  until the status takes days to update   You see  problems like this can easily be solved by proper tech  That is NOT the problem   The problem is everything but tech'