# Text Extraction

In [2]:
html_text = "<html><body><h1>Hello< World!\nHello you!</h1></body></html>"

In [3]:
from bs4 import BeautifulSoup
def extract_from_html_using_parser(html_text):

    soup = BeautifulSoup(html_text, 'html.parser')
    extracted_text = soup.get_text()
    return extracted_text

In [4]:
html_parser_result = extract_from_html_using_parser(html_text)
display(html_parser_result)

'Hello< World!\nHello you!'

In [7]:
import re

def get_word_only(text):

    cleaned_text = re.sub(r'[^\w\s]', '', text)
    words = cleaned_text.split()
    return words

In [8]:
only_extraxt_text = get_word_only(html_text)
display(only_extraxt_text)

['htmlbodyh1Hello', 'World', 'Hello', 'youh1bodyhtml']

In [9]:
def clean_html(row_html):
    CLEANER = re.compile('<.*?>')

    cleantext = re.sub(CLEANER, '', row_html)
    return cleantext

In [10]:
cleaned_html_tags = clean_html(html_text)
display(cleaned_html_tags)

'Hello< World!\nHello you!'

# Tweet

In [11]:
tweet = "INi adalah contoh tweet dengan #hashtag dan 🗿 emoji"

In [12]:
import re 

def clean_tweet(tweet):
    
    tweet = re.sub(r'#\w+', '', tweet) # remove hasgtag
    tweet = re.sub(r'[^\w\s]+', '', tweet) # remove no-alphanumeric
    tweet = re.sub(r'\s+', " ", tweet) # remove extra spaces

    return tweet

In [13]:
cleaned_tweet = clean_tweet(tweet)
display(tweet)
display(cleaned_tweet)

'INi adalah contoh tweet dengan #hashtag dan 🗿 emoji'

'INi adalah contoh tweet dengan dan emoji'

In [16]:
from unidecode import unidecode

def clean_tweet_with_unidecode(tweet):

    tweet = re.sub(r"#\w+", "", tweet)
    tweet = unidecode(tweet)
    tweet = re.sub(r"\s+", " ", tweet)
    return tweet

In [17]:
clean_tweet = clean_tweet_with_unidecode(tweet)

display(tweet)
display(clean_tweet)

'INi adalah contoh tweet dengan #hashtag dan 🗿 emoji'

'INi adalah contoh tweet dengan dan emoji'

# Text Cleaning

In [18]:
english_text = "I am reading a book in the park."
indonesian_text = "Saya sedang membaca buku di taman."

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

def remove_stopword_english(text):

    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_tokens)
    return cleaned_text

In [20]:
eng_stopwords_removal = remove_stopword_english(english_text)

display(english_text)
display(eng_stopwords_removal)

'I am reading a book in the park.'

'reading book park .'

In [21]:
def remove_stopword_indonesian(text):

    stop_words = set(stopwords.words('indonesian'))
    tokens = word_tokenize(text)
    filtered_text = [token for token in tokens if token.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_text)
    return cleaned_text

In [22]:
id_stopwords_removal = remove_stopword_indonesian(indonesian_text)

display(indonesian_text)
display(id_stopwords_removal)

'Saya sedang membaca buku di taman.'

'membaca buku taman .'

# Remove Punctuation

In [25]:
def remove_punctuation(text):

    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text

In [26]:
punctuation_removal = remove_punctuation(indonesian_text)

display(indonesian_text)
display(punctuation_removal)

'Saya sedang membaca buku di taman.'

'Saya sedang membaca buku di taman'

# Remove Special Character

In [27]:
text = "NLP seru banget! tapi susah juga ya~"

In [28]:
def remove_special_character(text):

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

In [29]:
special_character_removal = remove_special_character(text)

display(text)
display(special_character_removal)

'NLP seru banget! tapi susah juga ya~'

'NLP seru banget tapi susah juga ya'

# Remove emoji dan emoticon

In [32]:
tweet = "😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌"

In [33]:
def remove_emoji_emoticon(text):
    # Kode untuk menghapus emoji dan emoticon
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emoticons
                               u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                               "]+", flags=re.UNICODE)
    cleaned_text = emoji_pattern.sub(r'', text)
    return cleaned_text

In [34]:
emoji_emoticon_removal = remove_special_character(tweet)

display(tweet)
display(emoji_emoticon_removal)

'😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌'

'Ini adalah contoh tweet dengan    emoji'

# Conver emoji dan emoticon

In [35]:
def convert_emoji_emoticon(text, emoji_dict):

    for emoticon, replacment in emoji_dict.items():
        text = text.replace(emoticon, replacment)
    return text

In [36]:
emoji_dict = {
    "😀": "grinning_face",
    "😃": "smiling_face_with_open_mouth",
    "🥱": "yawning_face",
    "👌": "ok_hand",
    "👍": "thumbs_up",
    "👎": "thumbs_down",
    "🙏": "folded_hands"
}

emoji_emoticon_convertion = convert_emoji_emoticon(tweet, emoji_dict)

display(tweet)
display(emoji_emoticon_convertion)

'😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌'

'smiling_face_with_open_mouthIni adalah contoh tweet dengan smiling_face_with_open_mouth smiling_face_with_open_mouth smiling_face_with_open_mouth emoji!smiling_face_with_open_mouthok_hand'

# Case Folding"

In [37]:
text = "Ini adalah sebuah text untuk contoh CASE FOLDING"

In [38]:
display(text)
display(text.lower())
display(text.upper())
display(text.title())

'Ini adalah sebuah text untuk contoh CASE FOLDING'

'ini adalah sebuah text untuk contoh case folding'

'INI ADALAH SEBUAH TEXT UNTUK CONTOH CASE FOLDING'

'Ini Adalah Sebuah Text Untuk Contoh Case Folding'

# text normaliazation

## stemming

In [39]:
from nltk.stem import PorterStemmer

def stemming_english(text):
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

In [40]:
stemming_eng = stemming_english(english_text)

display(english_text)
display(stemming_eng)

'I am reading a book in the park.'

'i am read a book in the park.'

# Lemmatization

In [41]:
text = "I am running in the park and playing with runners."

In [44]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatozation_english(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)])
    return lemmatized_text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hammam\AppData\Roaming\nltk_data...


In [45]:
lemmatized_eng = lemmatozation_english(text)

display(text)
display(lemmatized_eng)

'I am running in the park and playing with runners.'

'I am running in the park and playing with runner .'

# Text Tokenization

## word tokenization

In [46]:
from nltk.tokenize import word_tokenize

def word_tokenization(text):

    tokens = word_tokenize(text)
    return tokens

In [47]:
word_tokenized = word_tokenization(text)

display(text)
display(word_tokenized)

'I am running in the park and playing with runners.'

['I',
 'am',
 'running',
 'in',
 'the',
 'park',
 'and',
 'playing',
 'with',
 'runners',
 '.']

# Sentence tokenization

In [49]:
from nltk.tokenize import sent_tokenize

text = "Cats are cute and playful. They like to run and jump."

def sentence_tokenization(text):

    sentences = sent_tokenize(text)
    return sentences

In [50]:
sent_toknized = sentence_tokenization(text)

display(text)
display(sent_toknized)

'Cats are cute and playful. They like to run and jump.'

['Cats are cute and playful.', 'They like to run and jump.']

# Regex Tokenization

In [51]:
from nltk.tokenize import RegexpTokenizer

text = "The price of the item is $20.50. It's a great deal! Buy now for $15 only. Limited time offer!"

def regex_tokenization(text):
    tokenizer = RegexpTokenizer(r'\$\d+\.?\d*')
    dollar_ammounts = tokenizer.tokenize(text)
    return dollar_ammounts


In [52]:
regex_tokenized = regex_tokenization(text)

display(text)
display(regex_tokenized)

"The price of the item is $20.50. It's a great deal! Buy now for $15 only. Limited time offer!"

['$20.50', '$15']

# Tweet Tokenization

In [53]:
tweet = "Ini adalah contoh tweet dentgan #hashtag dan 🗿 emoji"

In [55]:
def tweet_tokenization(tweet):
    tokens = nltk.tokenize.TweetTokenizer().tokenize(tweet)
    return tokens

In [58]:
tweet_tokenized = tweet_tokenization(tweet)

display(tweet)
print(word_tokenize(tweet))
display(tweet_tokenized)

'Ini adalah contoh tweet dentgan #hashtag dan 🗿 emoji'

['Ini', 'adalah', 'contoh', 'tweet', 'dentgan', '#', 'hashtag', 'dan', '🗿', 'emoji']


['Ini',
 'adalah',
 'contoh',
 'tweet',
 'dentgan',
 '#hashtag',
 'dan',
 '🗿',
 'emoji']

# N-grams Tokenization 

In [59]:
text = "cats are cute and playful. They like to run and jump."

In [60]:
def ngram_tokenization(text, n):

    tokens = nltk.ngrams(text.split(), n)
    return list(tokens)

In [61]:
ngram_tokenized = ngram_tokenization(text, 3)

display(text)
print(word_tokenize(text))
display(ngram_tokenized)

'cats are cute and playful. They like to run and jump.'

['cats', 'are', 'cute', 'and', 'playful', '.', 'They', 'like', 'to', 'run', 'and', 'jump', '.']


[('cats', 'are', 'cute'),
 ('are', 'cute', 'and'),
 ('cute', 'and', 'playful.'),
 ('and', 'playful.', 'They'),
 ('playful.', 'They', 'like'),
 ('They', 'like', 'to'),
 ('like', 'to', 'run'),
 ('to', 'run', 'and'),
 ('run', 'and', 'jump.')]