# Text Extraction

## HTML

In [None]:
# Sample HTML used by the extraction examples below; the <h1> text contains a newline.
html_text = "<html><body><h1>Hello, World!\nHello you!</h1></body></html>"

In [None]:
from bs4 import BeautifulSoup
def extract_from_html_using_parser(html_text):
    """Return the text content of an HTML string, with the markup removed.

    The document is parsed by BeautifulSoup using the 'html.parser' backend
    and the text of the parsed tree is returned via ``get_text()``.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text()

In [None]:
# Extract the text with the BeautifulSoup-based helper and show the result.
html_parser_result = extract_from_html_using_parser(html_text)
display(html_parser_result)

'Hello, World!\nHello you!'

In [None]:
import re

def get_words_only(text):
    """Split ``text`` into a list of whitespace-separated words without punctuation.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted first; digits are therefore kept.
    The remaining text is split on runs of whitespace.
    """
    non_word_chars = re.compile(r'[^\w\s]')
    return non_word_chars.sub('', text).split()


In [None]:
# Run the word splitter on the raw HTML to show why tags must be stripped first:
# only the punctuation of the markup is removed, so tag names fuse with the text.
only_extract_text = get_words_only(html_text)
display(only_extract_text)

['htmlbodyh1Hello', 'World', 'Hello', 'youh1bodyhtml']

In [None]:
def clean_html(raw_html):
    """Remove HTML tags from ``raw_html`` with a regular expression.

    Anything from a ``<`` up to the next ``>`` is deleted; text between tags,
    including its newlines, is returned unchanged. This is a lightweight
    stripper, not a parser: it does not decode entities and does not treat
    comments or ``<script>`` contents specially.

    Args:
        raw_html: The HTML string to strip.

    Returns:
        ``raw_html`` with every ``<...>`` span removed.
    """
    # re.DOTALL lets "." match newlines, so a tag whose attributes are wrapped
    # onto several lines is removed as well instead of being left in the text.
    tag_pattern = re.compile(r'<.*?>', re.DOTALL)
    return tag_pattern.sub('', raw_html)

In [None]:
# Strip the tags with the regex-based helper and show the remaining text.
cleaned_html_tags = clean_html(html_text)
display(cleaned_html_tags)

'Hello, World!\nHello you!'

## Tweet

In [None]:
# Sample tweet containing a hashtag, an emoji and punctuation.
tweet = "Ini adalah contoh tweet dengan #hashtag dan 😃 emoji!"

In [None]:
import re

def clean_tweet(tweet):
    """Return ``tweet`` without hashtags, symbols or redundant whitespace.

    The substitutions run in a fixed order: hashtags (``#`` followed by word
    characters) are dropped, then every run of characters that are neither
    word characters nor whitespace, and finally each run of whitespace is
    collapsed to one space. Leading and trailing whitespace is stripped.
    """
    substitutions = (
        (r"#\w+", ""),      # hashtags
        (r"[^\w\s]+", ""),  # punctuation, emoji and other symbols
        (r"\s+", " "),      # whitespace runs left behind by the removals
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet.strip()

In [None]:
# Clean the tweet with the regex-only helper, then show the original and the
# cleaned version for comparison.
cleaned_tweet = clean_tweet(tweet)

display(tweet)
display(cleaned_tweet)

'Ini adalah contoh tweet dengan #hashtag dan 😃 emoji!'

'Ini adalah contoh tweet dengan dan emoji'

In [None]:
# !pip install unidecode
from unidecode import unidecode

def clean_tweet_with_unidecode(tweet):
    """Clean a tweet by dropping hashtags and converting the rest to ASCII.

    Unlike ``clean_tweet``, punctuation is kept; non-ASCII characters are
    handed to ``unidecode`` (in the notebook's recorded example the emoji
    disappears from the result).

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet, with whitespace runs collapsed to single spaces and
        no leading or trailing whitespace.
    """
    # Remove hashtags
    tweet = re.sub(r"#\w+", "", tweet)

    # Convert to an ASCII representation
    tweet = unidecode(tweet)

    # Collapse whitespace runs left behind by the removals
    tweet = re.sub(r"\s+", " ", tweet)

    # Strip the ends as clean_tweet() does; otherwise a hashtag or emoji at
    # the start or end of the tweet leaves a stray space in the result.
    return tweet.strip()


In [None]:
# Bind the result to its own name: assigning it to `clean_tweet` would overwrite
# the clean_tweet() function defined earlier and break any later call to it.
cleaned_tweet_unidecode = clean_tweet_with_unidecode(tweet)

display(tweet)
display(cleaned_tweet_unidecode)

'Ini adalah contoh tweet dengan #hashtag dan 😃 emoji!'

'Ini adalah contoh tweet dengan dan emoji!'

**Notes:** Ada banyak jalan menuju Roma~

# Text Cleaning

## Remove stopwords

In [None]:
# Sample sentences (English and Indonesian) reused by the cleaning and stemming examples.
english_text = "I am reading a book in the park."
indonesian_text = "Saya sedang membaca buku di taman."

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')

def remove_stopwords_english(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with NLTK's ``word_tokenize``; a token is dropped
    when its lowercase form is in NLTK's English stopword list. The surviving
    tokens are joined with single spaces, so punctuation tokens end up
    separated from the preceding word.
    """
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(
        token
        for token in word_tokenize(text)
        if token.lower() not in english_stopwords
    )

In [None]:
# Remove English stopwords from the sample sentence and show before/after.
eng_stopwords_removal = remove_stopwords_english(english_text)

display(english_text)
display(eng_stopwords_removal)

'I am reading a book in the park.'

'reading book park .'

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')

def remove_stopwords_indonesian(text):
    """Return ``text`` with Indonesian stopwords removed.

    The text is tokenized with NLTK's ``word_tokenize``; a token is dropped
    when its lowercase form is in NLTK's Indonesian stopword list. The
    surviving tokens are joined with single spaces, so punctuation tokens end
    up separated from the preceding word.
    """
    indonesian_stopwords = set(stopwords.words('indonesian'))
    return ' '.join(
        token
        for token in word_tokenize(text)
        if token.lower() not in indonesian_stopwords
    )

In [None]:
# Remove Indonesian stopwords from the sample sentence and show before/after.
id_stopwords_removal = remove_stopwords_indonesian(indonesian_text)

display(indonesian_text)
display(id_stopwords_removal)

'Saya sedang membaca buku di taman.'

'membaca buku taman .'

## Remove punctuation

In [None]:
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores count as word characters and are kept;
    whitespace is left exactly as it was.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

In [None]:
# Strip punctuation from the Indonesian sample sentence and show before/after.
punctuation_removal = remove_punctuation(indonesian_text)

display(indonesian_text)
display(punctuation_removal)

'Saya sedang membaca buku di taman.'

'Saya sedang membaca buku di taman'

## Remove special character

In [None]:
# Sample text containing the special characters "!" and "~".
text = "NLP seru banget! tapi susah juga ya~"

In [None]:
def remove_special_character(text):
    """Keep only ASCII letters, ASCII digits and whitespace.

    Everything else is deleted: punctuation, symbols, underscores, emoji and
    also non-ASCII letters such as accented characters.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [None]:
# Remove the special characters from the sample text and show before/after.
special_character_removal = remove_special_character(text)

display(text)
display(special_character_removal)

'NLP seru banget! tapi susah juga ya~'

'NLP seru banget tapi susah juga ya'

## Remove emoji & emoticon

In [None]:
# Sample tweet with emoji at the start, in the middle and at the end.
tweet = "😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌"

In [None]:
def remove_emoji_emoticon(text):
    """Remove emoji characters from ``text``.

    Characters are matched by Unicode block, so only pictographic emoji are
    removed; textual emoticons such as ``:)`` are left untouched, as is any
    whitespace that surrounded a removed emoji.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character in the listed ranges deleted.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Regional indicators (flags)
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs (e.g. yawning face)
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\u2600-\u26FF"          # Miscellaneous Symbols
        "\u2700-\u27BF"          # Dingbats
        "\uFE0F"                 # Variation Selector-16, left behind by e.g. a red heart
        "]+"
    )
    return emoji_pattern.sub("", text)

In [None]:
# Use the emoji-specific cleaner: remove_special_character() would also strip
# punctuation such as "!", so it does not demonstrate emoji removal on its own.
emoji_emoticon_removal = remove_emoji_emoticon(tweet)

display(tweet)
display(emoji_emoticon_removal)

'😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌'

'Ini adalah contoh tweet dengan    emoji!'

## Convert emoji & emoticon

In [None]:
def convert_emoji_emoticon(text, emoji_dict):
    """Replace every key of ``emoji_dict`` found in ``text`` with its mapped value.

    Substitutions are applied one key at a time, in the dictionary's iteration
    order, each working on the result of the previous one. The replacement is
    inserted exactly as given; no separating whitespace is added around it.
    """
    converted = text
    for symbol in emoji_dict:
        converted = converted.replace(symbol, emoji_dict[symbol])
    return converted

In [None]:
# Mapping from an emoji character to the textual label that replaces it.
emoji_dict = {
    "😀": "grinning_face",
    "😃": "smiling_face_with_open_mouth",
    "🥱": "yawning_face",
    "👌": "ok_hand",
    "👍": "thumbs_up",
    "👎": "thumbs_down",
    "🙏": "folded_hands"
}

# Convert the emoji in the sample tweet and show before/after. The labels are
# inserted without surrounding spaces, so they join directly onto adjacent text.
emoji_emoticon_convertion = convert_emoji_emoticon(tweet, emoji_dict)

display(tweet)
display(emoji_emoticon_convertion)

'😃Ini adalah contoh tweet dengan 😃 😃 😃 emoji!😃👌'

'smiling_face_with_open_mouthIni adalah contoh tweet dengan smiling_face_with_open_mouth smiling_face_with_open_mouth smiling_face_with_open_mouth emoji!smiling_face_with_open_mouthok_hand'

## Case folding

In [None]:
# Sample sentence mixing capitalised, lowercase and all-uppercase words.
text = "Ini adalah sebuah text untuk contoh CASE FOLDING."

In [None]:
# Show the original sentence followed by three case conversions of it.
display(text)
display(text.lower()) # every letter converted to lowercase
display(text.upper()) # every letter converted to uppercase
display(text.title()) # first letter of each word uppercase, the rest lowercase

'Ini adalah sebuah text untuk contoh CASE FOLDING.'

'ini adalah sebuah text untuk contoh case folding.'

'INI ADALAH SEBUAH TEXT UNTUK CONTOH CASE FOLDING.'

'Ini Adalah Sebuah Text Untuk Contoh Case Folding.'

# Text Normalization

## Stemming

In [None]:
from nltk.stem import PorterStemmer

def stemming_english(text):
    """Stem each whitespace-separated word of ``text`` with NLTK's PorterStemmer.

    The text is split on whitespace only, so punctuation stays attached to
    the word it follows. The stemmed words are joined back with single spaces.
    """
    stem = PorterStemmer().stem
    return ' '.join(map(stem, text.split()))

In [None]:
# Stem the English sample sentence and show before/after.
stemming_eng = stemming_english(english_text)

display(english_text)
display(stemming_eng)

'I am reading a book in the park.'

'i am read a book in the park.'

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#!pip install Sastrawi
def stemming_indonesian_sastrawi(text):
    """Stem Indonesian ``text`` with the Sastrawi stemmer.

    A new stemmer is built from ``StemmerFactory`` on every call and applied
    to the whole string at once. In the notebook's recorded example the
    result is also lowercased and loses its final period.
    """
    return StemmerFactory().create_stemmer().stem(text)

In [None]:
# Stem the Indonesian sample sentence with Sastrawi and show before/after.
stemmed_text = stemming_indonesian_sastrawi(indonesian_text)

display(indonesian_text)
display(stemmed_text)

'Saya sedang membaca buku di taman.'

'saya sedang baca buku di taman'

## Lemmatization

In [None]:
# Sample sentence with inflected forms; reused by the word-tokenization example.
text = "I am running in the park and playing with runners."

In [None]:
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
def lemmatization_english(text):
    """Lemmatize each token of English ``text`` with NLTK's WordNetLemmatizer.

    Tokens come from ``word_tokenize`` and are lemmatized with the
    lemmatizer's default settings (no part-of-speech is passed); in the
    notebook's recorded example "runners" becomes "runner" while "running"
    is unchanged. The results are joined with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in word_tokenize(text))

In [None]:
# Lemmatize the sample sentence and show before/after.
lemmatized_eng = lemmatization_english(text)

display(text)
display(lemmatized_eng)

'I am running in the park and playing with runners.'

'I am running in the park and playing with runner .'

# Text Tokenization

## Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize
def word_tokenization(text):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

In [None]:
# Tokenize the sample sentence into words and show the text and its tokens.
word_tokenized = word_tokenization(text)

display(text)
display(word_tokenized)

'I am running in the park and playing with runners.'

['I',
 'am',
 'running',
 'in',
 'the',
 'park',
 'and',
 'playing',
 'with',
 'runners',
 '.']

## Sentence Tokenization

In [None]:
# Sample text made of two sentences.
text = "Cats are cute and playful. They like to run and jump."

In [None]:
from nltk.tokenize import sent_tokenize
def sentence_tokenization(text):
    """Return the sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    return sent_tokenize(text)

In [None]:
# Split the sample text into sentences and show the text and the result.
sent_tokenized = sentence_tokenization(text)

display(text)
display(sent_tokenized)

'Cats are cute and playful. They like to run and jump.'

['Cats are cute and playful.', 'They like to run and jump.']

## Regex Tokenization

In [None]:
# Sample text containing two dollar amounts, one with cents and one without.
text = "The price of the item is $20.50. It's a great deal! Buy now for $15 only. Limited time offer!"

In [None]:
from nltk.tokenize import RegexpTokenizer
def regex_tokenization(text):
    """Extract dollar amounts such as ``$20.50`` or ``$15`` from ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of the matched amounts, in order of appearance. Each match is
        a ``$`` followed by digits and, optionally, a decimal point with at
        least one digit after it.
    """
    # The fractional part requires digits after the dot, so the period ending
    # a sentence ("... for $15.") is not captured as part of the amount.
    # The group is non-capturing so that whole matches are returned as tokens.
    tokenizer = RegexpTokenizer(r'\$\d+(?:\.\d+)?')
    return tokenizer.tokenize(text)

In [None]:
# Extract the dollar amounts from the sample text and show the text and the matches.
regex_tokenized = regex_tokenization(text)

display(text)
print(regex_tokenized)

"The price of the item is $20.50. It's a great deal! Buy now for $15 only. Limited time offer!"

['$20.50', '$15']


## Tweet Tokenization

In [None]:
# Sample tweet containing a hashtag, an emoji and punctuation.
tweet = "Ini adalah contoh tweet dengan #hashtag dan 😃 emoji!"

In [None]:
def tweet_tokenization(text):
    """Tokenize ``text`` with a default-configured NLTK ``TweetTokenizer``.

    In the notebook's recorded example a hashtag is kept as a single token
    (``#hashtag``) rather than being split into ``#`` and the word.
    """
    tokenizer = nltk.tokenize.TweetTokenizer()
    return tokenizer.tokenize(text)

In [None]:
# Tokenize the tweet with the tweet-aware tokenizer, then print the generic
# word_tokenize() result next to it to compare how the hashtag is handled.
tweet_tokenized = tweet_tokenization(tweet)

display(tweet)
print(word_tokenize(tweet))
print(tweet_tokenized)

'Ini adalah contoh tweet dengan #hashtag dan 😃 emoji!'

['Ini', 'adalah', 'contoh', 'tweet', 'dengan', '#', 'hashtag', 'dan', '😃', 'emoji', '!']
['Ini', 'adalah', 'contoh', 'tweet', 'dengan', '#hashtag', 'dan', '😃', 'emoji', '!']


## N-grams Tokenization

In [None]:
# Sample two-sentence text for the n-gram example.
text = "Cats are cute and playful. They like to run and jump."

In [None]:
def ngram_tokenization(text, n):
    """Return the list of n-grams built from the whitespace-separated words of ``text``.

    Words are obtained with ``str.split()``, so punctuation stays attached to
    its word; the n-grams themselves come from ``nltk.ngrams``.
    """
    words = text.split()
    return list(nltk.ngrams(words, n))

In [None]:
# Build trigrams from the sample text. The word_tokenize() output is printed for
# comparison: the n-grams use a plain whitespace split, so "playful." and
# "jump." keep their trailing period.
ngram_tokenized = ngram_tokenization(text, 3)

display(text)
print(word_tokenize(text))
print(ngram_tokenized)

'Cats are cute and playful. They like to run and jump.'

['Cats', 'are', 'cute', 'and', 'playful', '.', 'They', 'like', 'to', 'run', 'and', 'jump', '.']
[('Cats', 'are', 'cute'), ('are', 'cute', 'and'), ('cute', 'and', 'playful.'), ('and', 'playful.', 'They'), ('playful.', 'They', 'like'), ('They', 'like', 'to'), ('like', 'to', 'run'), ('to', 'run', 'and'), ('run', 'and', 'jump.')]
