## Preprocessing

In [3]:
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spacy.lang.id import Indonesian
# spaCy language object for Indonesian; in this notebook it is only called in
# the tokenization step to split the cleaned text into tokens.
nlp_spacy_id = Indonesian()

# Stop-word collection shipped with Sastrawi, consulted by the stopword-removal step.
factory = StopWordRemoverFactory()
Sastrawi_StopWords_id = factory.get_stop_words()
# Sastrawi stemmer, applied to the text in the final (stemming) step.
stemmer = StemmerFactory().create_stemmer()

In [4]:
# Sample comment (mixed case, contains an @mention and punctuation) that every
# preprocessing step below is demonstrated on.
komentar = 'Semangat terus kak Ayu @aytutingting92, sukses utk karir dan keluarga'

## Case Folding

In [5]:
# Lowercase first: the later word patterns only accept the letters a-z.
# `komentar` itself is left untouched; `text` carries the evolving result.
text = komentar.lower()
print(text)

semangat terus kak ayu @aytutingting92, sukses utk karir dan keluarga


## Remove Username

In [6]:
# Strip @mentions. A mention must start the string or follow whitespace, so the
# "@" in the middle of an e-mail address is not touched here.
#
# The pattern is a raw string so that \s reaches the regex engine as written
# (in a plain string literal it is an invalid escape sequence). The marker class
# holds only the ASCII and full-width at-signs: the space it previously also
# contained caused any word preceded by two whitespace characters (or by a
# single leading space at the start of the text) to be deleted as if it were a
# mention.
username = re.compile(r"(?:^|\s)[＠@]([^\s#<>\[\]|{}]+)", re.UNICODE)
# The whitespace in front of the mention is part of the match, so it is removed
# together with the mention and no double space is left behind.
text = username.sub('', text)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Remove Hashtag

In [7]:
# Strip #hashtags written with either the ASCII "#" or the full-width "＃".
# A hashtag must start the string or follow whitespace; that leading whitespace
# is removed together with the tag.
#
# The pattern is a raw string so that \s and \w are passed to the regex engine
# as written instead of being treated as (invalid) string escape sequences.
hashtag = re.compile(r"(?:^|\s)[＃#](\w+)", re.UNICODE)
text = hashtag.sub('', text)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Remove Email

In [8]:
# Delete anything shaped like <local>@<domain>.<tld>, where the local and
# domain parts may contain word characters, dots and hyphens.
email = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
# Every match is replaced by the empty string; surrounding spaces are kept.
text = email.sub('', text)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Remove URL

In [9]:
# Delete http:// and https:// links: the scheme followed by one or more
# letters, digits, listed punctuation characters or %XX escapes.
# NOTE(review): inside `[$-_@.&+]` the sequence `$-_` is a character range, not
# three literal characters - confirm that this is intended.
url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
text = url.sub('', text)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Remove Non Alphabet

In [10]:
# Keep only whole words made of the letters a-z (two letters minimum, with at
# most one hyphen between letters) and re-join them with single spaces.
# Anything the pattern does not match is dropped from the text.
text = ' '.join(
    match.group(0)
    for match in re.finditer(r'\b[a-z]+-?[a-z]+\b', text)
)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Tokenization

In [11]:
# Run the text through the spaCy Indonesian pipeline and collect the surface
# form of every token it yields.
tokens = [token.text for token in nlp_spacy_id(text)]
# Re-join on single spaces so the following steps keep working on a plain string.
text = ' '.join(tokens)
print(text)

semangat terus kak ayu sukses utk karir dan keluarga


## Stopword Removal

In [12]:
# Build a set once so each membership test below is a constant-time lookup
# instead of a linear scan over the whole Sastrawi stop-word collection.
stopword_lookup = set(Sastrawi_StopWords_id)
# Re-extract the a-z words (same pattern as the non-alphabet step) and keep
# only those that are not stop words; `temp` holds the surviving words.
temp = [t for t in re.findall(r'\b[a-z]+-?[a-z]+\b', text) if t not in stopword_lookup]
text = ' '.join(temp)
print(text)

semangat terus kak ayu sukses utk karir keluarga


## Stemming

In [13]:
# Final step: pass the whole cleaned string through the Sastrawi stemmer
# created in the setup cell and show the result.
text = stemmer.stem(text)
print(text)

semangat terus kak ayu sukses utk karir keluarga
