# re - Demo

In [1]:
import re

In [7]:
def clean_test(sentence):
    return re.sub(r'([^\s\w]|_)+', ' ', sentence).split()

In [11]:
sentence = '''Regular expressions are powerful for pattern matching, text parsing, 
and tokenization, making them a valuable tool when dealing with text data.'''
sentence

'Regular expressions are powerful for pattern matching, text parsing, \nand tokenization, making them a valuable tool when dealing with text data.'

In [15]:
clean_test(sentence)

['Regular',
 'expressions',
 'are',
 'powerful',
 'for',
 'pattern',
 'matching',
 'text',
 'parsing',
 'and',
 'tokenization',
 'making',
 'them',
 'a',
 'valuable',
 'tool',
 'when',
 'dealing',
 'with',
 'text',
 'data']

In [19]:
sentence2 = clean_test(sentence)

In [21]:
sentence2

['Regular',
 'expressions',
 'are',
 'powerful',
 'for',
 'pattern',
 'matching',
 'text',
 'parsing',
 'and',
 'tokenization',
 'making',
 'them',
 'a',
 'valuable',
 'tool',
 'when',
 'dealing',
 'with',
 'text',
 'data']

# Data Collection

In [46]:
def n_gram_extractor(sentence, n):
    tokens = re.sub(r'([^\s\w]|_)+', ' ', sentence).split()
    for i in range(len(tokens)-n+1):
        print(tokens[i:i+n])

In [42]:
n_gram_extractor(sentence,5)

['Regular', 'expressions', 'are', 'powerful', 'for']
['expressions', 'are', 'powerful', 'for', 'pattern']
['are', 'powerful', 'for', 'pattern', 'matching']
['powerful', 'for', 'pattern', 'matching', 'text']
['for', 'pattern', 'matching', 'text', 'parsing']
['pattern', 'matching', 'text', 'parsing', 'and']
['matching', 'text', 'parsing', 'and', 'tokenization']
['text', 'parsing', 'and', 'tokenization', 'making']
['parsing', 'and', 'tokenization', 'making', 'them']
['and', 'tokenization', 'making', 'them', 'a']
['tokenization', 'making', 'them', 'a', 'valuable']
['making', 'them', 'a', 'valuable', 'tool']
['them', 'a', 'valuable', 'tool', 'when']
['a', 'valuable', 'tool', 'when', 'dealing']
['valuable', 'tool', 'when', 'dealing', 'with']
['tool', 'when', 'dealing', 'with', 'text']
['when', 'dealing', 'with', 'text', 'data']


In [44]:
n_gram_extractor(sentence, 3)

['Regular', 'expressions', 'are']
['expressions', 'are', 'powerful']
['are', 'powerful', 'for']
['powerful', 'for', 'pattern']
['for', 'pattern', 'matching']
['pattern', 'matching', 'text']
['matching', 'text', 'parsing']
['text', 'parsing', 'and']
['parsing', 'and', 'tokenization']
['and', 'tokenization', 'making']
['tokenization', 'making', 'them']
['making', 'them', 'a']
['them', 'a', 'valuable']
['a', 'valuable', 'tool']
['valuable', 'tool', 'when']
['tool', 'when', 'dealing']
['when', 'dealing', 'with']
['dealing', 'with', 'text']
['with', 'text', 'data']


In [56]:
import nltk
from nltk import ngrams

In [60]:
list(ngrams(sentence.split(),2))

[('Regular', 'expressions'),
 ('expressions', 'are'),
 ('are', 'powerful'),
 ('powerful', 'for'),
 ('for', 'pattern'),
 ('pattern', 'matching,'),
 ('matching,', 'text'),
 ('text', 'parsing,'),
 ('parsing,', 'and'),
 ('and', 'tokenization,'),
 ('tokenization,', 'making'),
 ('making', 'them'),
 ('them', 'a'),
 ('a', 'valuable'),
 ('valuable', 'tool'),
 ('tool', 'when'),
 ('when', 'dealing'),
 ('dealing', 'with'),
 ('with', 'text'),
 ('text', 'data.')]

In [64]:
list(ngrams(sentence.split(),5))

[('Regular', 'expressions', 'are', 'powerful', 'for'),
 ('expressions', 'are', 'powerful', 'for', 'pattern'),
 ('are', 'powerful', 'for', 'pattern', 'matching,'),
 ('powerful', 'for', 'pattern', 'matching,', 'text'),
 ('for', 'pattern', 'matching,', 'text', 'parsing,'),
 ('pattern', 'matching,', 'text', 'parsing,', 'and'),
 ('matching,', 'text', 'parsing,', 'and', 'tokenization,'),
 ('text', 'parsing,', 'and', 'tokenization,', 'making'),
 ('parsing,', 'and', 'tokenization,', 'making', 'them'),
 ('and', 'tokenization,', 'making', 'them', 'a'),
 ('tokenization,', 'making', 'them', 'a', 'valuable'),
 ('making', 'them', 'a', 'valuable', 'tool'),
 ('them', 'a', 'valuable', 'tool', 'when'),
 ('a', 'valuable', 'tool', 'when', 'dealing'),
 ('valuable', 'tool', 'when', 'dealing', 'with'),
 ('tool', 'when', 'dealing', 'with', 'text'),
 ('when', 'dealing', 'with', 'text', 'data.')]

In [66]:
list(ngrams(sentence.split(),2))

[('Regular', 'expressions'),
 ('expressions', 'are'),
 ('are', 'powerful'),
 ('powerful', 'for'),
 ('for', 'pattern'),
 ('pattern', 'matching,'),
 ('matching,', 'text'),
 ('text', 'parsing,'),
 ('parsing,', 'and'),
 ('and', 'tokenization,'),
 ('tokenization,', 'making'),
 ('making', 'them'),
 ('them', 'a'),
 ('a', 'valuable'),
 ('valuable', 'tool'),
 ('tool', 'when'),
 ('when', 'dealing'),
 ('dealing', 'with'),
 ('with', 'text'),
 ('text', 'data.')]

# Feature Extraction methods

- Bag of words
- Distributed Representation
- Word Embeddings (Word2Vec, GloVe, FastText)
- TF-IDF (Term Frequency - Inverse Document Frequency)
- N-grams
- Part-of-Speech (POS) Tagging
- Named Entity Recognition (NER)
- Topic Modeling (LDA)
- Dependency Parsing
- Frequency vector
- one-hot encoding

# Tokenizers - Demo

In [148]:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
sentence = '''Love to learn NLP(Natural Programming Language) in 2024 for career advancement in AI!!!'''

In [103]:
def tokenize_with_tweektokenizer(text):
    tweet_tokenizer = TweetTokenizer()
    return tweet_tokenizer.tokenize(text)

In [105]:
tokenize_with_tweektokenizer(sentence)

['Love',
 'to',
 'learn',
 'NLP',
 'in',
 '2024',
 'for',
 'career',
 'advancement',
 'in',
 'AI',
 '!',
 '!',
 '!']

In [107]:
def tokenize_With_mwe(text):
    mwe_tokenize = MWETokenizer([('Republic', 'Day')])
    mwe_tokenize.add_mwe(('Indian', 'Army'))
    return mwe_tokenize.tokenize(text.split())

In [109]:
tokenize_With_mwe(sentence)

['Love',
 'to',
 'learn',
 'NLP',
 'in',
 '2024',
 'for',
 'career',
 'advancement',
 'in',
 'AI!!!']

In [113]:
def tokenize_with_regexp(text):
    reg_tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\s+')
    return reg_tokenizer.tokenize(text)

In [115]:
tokenize_with_regexp(sentence)

[' ',
 'Love',
 ' ',
 'to',
 ' ',
 'learn',
 ' ',
 'NLP',
 ' ',
 'in',
 ' ',
 '2024',
 ' ',
 'for',
 ' ',
 'career',
 ' ',
 'advancement',
 ' ',
 'in',
 ' ',
 'AI']

In [117]:
def tokenize_with_wst(text):
    wst_tokenizer = WhitespaceTokenizer()
    return wst_tokenizer.tokenize(text)

In [119]:
tokenize_with_wst(sentence)

['Love',
 'to',
 'learn',
 'NLP',
 'in',
 '2024',
 'for',
 'career',
 'advancement',
 'in',
 'AI!!!']

In [121]:
def tokenize_with_wordpunct(text):
    wordpunct_tokenizer = WordPunctTokenizer()
    return wordpunct_tokenizer.tokenize(text)

In [125]:
tokenize_with_wordpunct(sentence)

['Love',
 'to',
 'learn',
 'NLP',
 'in',
 '2024',
 'for',
 'career',
 'advancement',
 'in',
 'AI',
 '!!!']

In [131]:
len(tokenize_with_tweektokenizer(sentence))

14

# stemming

In [150]:
from nltk import RegexpStemmer

In [152]:
sentence

'Love to learn NLP(Natural Programming Language) in 2024 for career advancement in AI!!!'

In [154]:
def get_stemms(text):
    reg_stem = RegexpStemmer('ing$', min=4)
    #any string ending with regex 'ing' will be removed. below code will return by removing the ing in the sentence.
    return ' '.join(reg_stem.stem(wd) for wd in text.split())

In [158]:
get_stemms(sentence)

'Love to learn NLP(Natural Programm Language) in 2024 for career advancement in AI!!!'

In [162]:
from nltk.stem.porter import *

In [164]:
def get_stemms(text):
    port_stemm = PorterStemmer()
    return ' '.join(port_stemm.stem(wd) for wd in text.split())

In [166]:
get_stemms(sentence)

'love to learn nlp(natur program language) in 2024 for career advanc in ai!!!'

In [172]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\udaya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [200]:
sentence

'Love to learn NLP(Natural Programming Language) in 2024 for career advancement in AI!!!'

In [202]:
lemmitizer = WordNetLemmatizer()
def get_lemmas(text):
    return ' '.join(lemmitizer.lemmatize(word) for word in word_tokenize(text))

In [204]:
get_lemmas(sentence)

'Love to learn NLP ( Natural Programming Language ) in 2024 for career advancement in AI ! ! !'

In [206]:
from nltk import download
download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\udaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [208]:
stop_words = stopwords.words('english')
len(sentence)

87

In [216]:
def remove_stop_words(text, stop_words):
    return ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words])

In [218]:
len(remove_stop_words(sentence, stop_words))

80

In [220]:
remove_stop_words(sentence, stop_words)

'Love learn NLP ( Natural Programming Language ) 2024 career advancement AI ! ! !'

# Vectorization - Demo

<h3> Feature extraction </h3>

In [240]:
import pandas as pd
import nltk
nltk.download('tagsets')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter
from nltk.data import load

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\udaya\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [252]:
def get_tagsets():
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return list(tagdict.keys())

tag_list = get_tagsets()
print(tag_list)

['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']


In [282]:
def get_pos_occu_freq(data, tag_list):
    text_list = data.text
    feature_df = pd.DataFrame(columns=tag_list)
    for text_line in text_list:
        pos_tags = [j for i,j in pos_tag(word_tokenize(text_line))]
    
        row = dict(Counter(pos_tags))
        feature_df = feature_df.append(row, ignore_index = True)

    feature_df.fillna(0, inplace=True)
    return feature_df

In [268]:
pd.options.display.max_columns=70

In [276]:
tag_list = get_tagsets()
data = pd.read_csv('data.csv', header=0)
data.head()

Unnamed: 0,text
0,Word-based tokenization is sufficient for many...
1,"Subword-based tokenization (e.g., BPE, WordPie..."
2,Sentence tokenization is useful when sentence-...
3,Character tokenization is useful in tasks wher...


In [284]:
feature_df = get_pos_occu_freq(data, tag_list)
feature_df.head()

AttributeError: 'DataFrame' object has no attribute 'append'

<h3> bag of words</h3>

In [288]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [290]:
def vectorize_text(corpus):
    bag_of_words_model = CountVectorizer()

    dense_vec_matrix = bag_of_words_model.fit_transform(corpus).todense()
    bag_of_words_df = pd.DataFrame(dense_vec_matrix)
    bag_of_words_df.columns = sorted(bag_of_words_model.vocabulary_)
    return bag_of_words_df

In [294]:
corpus = [
'''Word-based tokenization is sufficient for many applications like sentiment analysis, topic modeling, etc.''',
'''Subword-based tokenization (e.g., BPE, WordPiece) is essential for transformer models (BERT, GPT) because it handles out-of-vocabulary words effectively.''',
'''Sentence tokenization is useful when sentence-level understanding is needed, such as in text summarization.''',
'''Character tokenization is useful in tasks where very fine granularity is needed, like handling noisy text or typos.'''
]
len(corpus)

4

In [300]:
df = vectorize_text(corpus)
df.head()

Unnamed: 0,analysis,applications,as,based,because,bert,bpe,character,effectively,essential,etc,fine,for,gpt,granularity,handles,handling,in,is,it,level,like,many,modeling,models,needed,noisy,of,or,out,sentence,sentiment,subword,such,sufficient,summarization,tasks,text,tokenization,topic,transformer,typos,understanding,useful,very,vocabulary,when,where,word,wordpiece,words
0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,1,1,1,0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,1,0,0,0,0,1,0,0,0,0,2,0,0,1,0,1,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,1,2,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,0,0


In [308]:
def bow_top_n(corpus, n):
    bag_of_words_model_small = CountVectorizer(max_features=n)

    dense_vec_matrix_small = bag_of_words_model_small.fit_transform(corpus).todense()
    bag_of_words_df_small = pd.DataFrame(dense_vec_matrix_small)
    bag_of_words_df_small.columns = sorted(bag_of_words_model_small.vocabulary_)
    return bag_of_words_df_small

df_2 = bow_top_n(corpus, 10)
df_2.head()

Unnamed: 0,based,for,in,is,like,needed,sentence,text,tokenization,useful
0,1,1,0,1,1,0,0,0,1,0
1,1,1,0,1,0,0,0,0,1,0
2,0,0,1,2,0,1,2,1,1,1
3,0,0,1,2,1,1,0,1,1,1


<h3> TF - IDF(Term Frequency - Inverse Document Frequency) </h3>

In [313]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [315]:
def tf_idf_vectorizer(corpus):
    tfidf_model = TfidfVectorizer()
    vector_list = tfidf_model.fit_transform(corpus).todense()
    return vector_list

In [319]:
vector_list = tf_idf_vectorizer(corpus)
print(vector_list)

[[0.29605221 0.29605221 0.         0.23341096 0.         0.
  0.         0.         0.         0.         0.29605221 0.
  0.23341096 0.         0.         0.         0.         0.
  0.15449233 0.         0.         0.23341096 0.29605221 0.29605221
  0.         0.         0.         0.         0.         0.
  0.         0.29605221 0.         0.         0.29605221 0.
  0.         0.         0.15449233 0.29605221 0.         0.
  0.         0.         0.         0.         0.         0.
  0.29605221 0.         0.        ]
 [0.         0.         0.         0.1869354  0.23710385 0.23710385
  0.23710385 0.         0.23710385 0.23710385 0.         0.
  0.1869354  0.23710385 0.         0.23710385 0.         0.
  0.12373063 0.23710385 0.         0.         0.         0.
  0.23710385 0.         0.         0.23710385 0.         0.23710385
  0.         0.         0.23710385 0.         0.         0.
  0.         0.         0.12373063 0.         0.23710385 0.
  0.         0.         0.         0.237