### Regular Expressions

### Regular expressions helps us to find the pattern in the given string which helps to tackle the classification problems


In [99]:
import re 

In [100]:
mystring = 'sujal'
print(re.match(mystring, 'sujal adhikari is the man')) 
## Here the match is sujal or sujal is the common between the two strings 


<re.Match object; span=(0, 5), match='sujal'>


In [101]:
used_regex = '\w+'
print(re.match(used_regex, "Peter Fernandez"))
## Here only peter is printed becuase the space is not the string or the character

<re.Match object; span=(0, 5), match='Peter'>


---
### Common regex patterns 


In [102]:
## There are tons of patterns in the regex but the most common are 
#print(re.match('\w+', 'Sujal Adhikari ')) ## Gives all the strings before the character that is not space
print(re.match('\d+', 'DOB 2005'))

None


## Tokenization 

### Tokenization is the process of breaking down the strings into chunks in order to make it easy for the strings 

In [103]:
## Using the normal regex library 
my_string = "Hey there fellas ! Sujal Adhikari here!"
print(re.findall(r'\w+', my_string)) ## So basically, it turns the whole sentence into the tokens 


['Hey', 'there', 'fellas', 'Sujal', 'Adhikari', 'here']


### Regex or regular expressions only work on strings 

### NLTK -> Natural Language Toolkit 
### It is one of the handy library which is used to do tokenization, stemming, lemmatization, parsing and many more 

In [104]:
number = "My phone number is 8627038353"
expression = r'\d{10}'
print(re.search(expression,number)) ## The output is the number 

<re.Match object; span=(19, 29), match='8627038353'>


In [105]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize,regexp_tokenize
scene_one = 'Peter is the man who everyone admires. He is the man'
nltk.download('punkt_tab') 
# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)

# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
tokenized_sent = word_tokenize(sentences[1])

# Make a set of unique tokens in the entire scene: unique_tokens
unique_tokens = set(word_tokenize(scene_one))

# Print the unique tokens result
print(unique_tokens)

{'admires', 'He', 'the', 'who', 'Peter', '.', 'everyone', 'is', 'man'}


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sujaladhikari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Regex groupping using the letter '|'


In [106]:
match_digits_and_words = ('(\d+|\w+)')
print(re.findall(match_digits_and_words, 'He has 11 cats'))

['He', 'has', '11', 'cats']


### Some useful tokenization 

In [107]:
string_to_be_tokenized = 'NAME: SUJAL ADHIKARI, Age: 20, Twenty.'
print(re.findall('[A-Za-z]+',string_to_be_tokenized)) ## These are the ones with the upper and lower case in it

['NAME', 'SUJAL', 'ADHIKARI', 'Age', 'Twenty']


In [108]:
## Lets go more complex 
print(re.findall('[A-Za-z\.\:\d]+', string_to_be_tokenized)) ## This gives every thing 

['NAME:', 'SUJAL', 'ADHIKARI', 'Age:', '20', 'Twenty.']


In [109]:
## Task 1: Retain sentence punctuation as separate tokens, but have '#1' remain a single token.
my_string = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"
pattern = r'(\w+|#\d|\?|!)'
print(re.findall(pattern, my_string))


['SOLDIER', '#1', 'Found', 'them', '?', 'In', 'Mercea', '?', 'The', 'coconut', 's', 'tropical', '!']


In [110]:
german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'
capital_words = r"['A-ZÜä]\w+"
print(regexp_tokenize(german_text, capital_words))

['Wann', 'Pizza', 'Und', 'ährst', 'Über']


### Few Regex Questions 


In [111]:
text = "Hello World! This is NLP 101. Welcome to Python"
## Task : using re.findll() to extract only text
method = r'[A-Za-z]+'
print(re.findall(method,text))

['Hello', 'World', 'This', 'is', 'NLP', 'Welcome', 'to', 'Python']


In [112]:
text = "Über das Übungsheft: Heute lernen wir NLP und Python."
## Capitalize words with the unicode
method = r'\b[ÜA-Z][a-zA-Züäß]*\b'
print(re.findall(method, text))


['Über', 'Übungsheft', 'Heute', 'NLP', 'Python']


In [113]:
script = """Peter: Hello! How are you?
Anna: I'm good, thanks! And you?
Peter: Doing great. Want to grab some pizza? 🍕"""

## Splitting the whole script into the lines

lines = script.split('\n')
tokens = [nltk.word_tokenize(l) for l in lines]
print(tokens)

[['Peter', ':', 'Hello', '!', 'How', 'are', 'you', '?'], ['Anna', ':', 'I', "'m", 'good', ',', 'thanks', '!', 'And', 'you', '?'], ['Peter', ':', 'Doing', 'great', '.', 'Want', 'to', 'grab', 'some', 'pizza', '?', '🍕']]


In [114]:
### Extracting the unique letters only 
method = r'[A-Za-zÜ]+|[\U0001F300-\U0001F5FF]'
print(re.findall(method,script))

['Peter', 'Hello', 'How', 'are', 'you', 'Anna', 'I', 'm', 'good', 'thanks', 'And', 'you', 'Peter', 'Doing', 'great', 'Want', 'to', 'grab', 'some', 'pizza', '🍕']


### How to calculate the number of words or the frequency of the words !

In [115]:
### Library to be used 
from collections import Counter
text =""" In a small town surrounded by rolling hills, the mornings always began with the same rhythm: the smell of fresh bread from the bakery, the sound of bicycles clattering over cobblestones, and the chatter of neighbors who seemed to know every detail of each other’s lives. Yet, hidden in the routine was a sense of quiet anticipation, as though the town itself was waiting for something unexpected to arrive—like a letter without a return address, or a stranger who carried stories no one had heard before."""
tokenized_text = word_tokenize(text)
lower_tokenized_text = [t.lower() for t in tokenized_text]
total_count = Counter(lower_tokenized_text)
print(total_count.most_common(10))

[('the', 8), (',', 6), ('a', 5), ('of', 5), ('in', 2), ('town', 2), ('who', 2), ('to', 2), ('.', 2), ('was', 2)]


### Preprocessing the text before NLP 

In [116]:
## Lets do the preprocessing of the text before NLP 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

## The first step involves the removal of the non alpha characters

only_alpha_characters = [t for t in lower_tokenized_text if t.isalpha()]

## Then we remove the english stops such as 'a', 'the', 'and' becuase they donot carry much meaning 

removal_of_english_characters = [t for t in only_alpha_characters if t not in stopwords.words('english')]

## Then we lemmatize or we simply count the two words with the same meaning as one such as mice and mouse is considered mouse

word_lemmatizer = WordNetLemmatizer()
## Now simply what we can do is lemmantize each word 

each_lemmantized_word = [word_lemmatizer.lemmatize(t) for t in removal_of_english_characters]

## bag of words 
bow = Counter(each_lemmantized_word)
print(bow.most_common(10))


[('town', 2), ('small', 1), ('surrounded', 1), ('rolling', 1), ('hill', 1), ('morning', 1), ('always', 1), ('began', 1), ('rhythm', 1), ('smell', 1)]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sujaladhikari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sujaladhikari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sujaladhikari/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Introduction to genism 

### Terms used:
### 1. Genism: Genism is the way of making computers understand the text efficiently 
### 2. Corpus: Corpus is the collection of books, or sentences that computer can read!


In [117]:
from gensim.corpora.dictionary import Dictionary


article = """Artificial intelligence (AI) is transforming healthcare by enhancing diagnostics, personalizing treatment, and streamlining administrative tasks. Machine learning algorithms can analyze large datasets, identifying patterns that may be missed by humans. AI-powered tools can assist in early disease detection, such as identifying tumors in medical imaging, and can even predict patient outcomes. However, ethical concerns around data privacy, algorithmic bias, and the role of AI in decision-making must be carefully managed. Despite these challenges, the potential benefits of AI in healthcare are immense, promising more accurate, efficient, and accessible medical care."""


## So now from this article what we can do is simply at first divide it into the tokens
tokenized_article = word_tokenize(article)

## lets reduce the whole words by replacing the ones with the same meaning as one

tokenized_alpha = [t for t in tokenized_article if t.isalpha()] ## Only alphabetical characters

## Then we remove the common english characters such as a , the , an 

common_english_removal = [t for t in tokenized_alpha if t not in stopwords.words('english')]

## lemanitizing the word 

word_lemmatizer = WordNetLemmatizer() 

each_lemmantized_word = [word_lemmatizer.lemmatize(t) for t in common_english_removal]


bow = Counter(each_lemmantized_word)
print(bow.most_common(10))

## Other way is to use the Dictionary 

dictionary = Dictionary([each_lemmantized_word])

corpus=[dictionary.doc2bow(each_lemmantized_word)]

doc = corpus[0]
print(doc)

[('AI', 3), ('healthcare', 2), ('identifying', 2), ('medical', 2), ('Artificial', 1), ('intelligence', 1), ('transforming', 1), ('enhancing', 1), ('diagnostics', 1), ('personalizing', 1)]
[(0, 3), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)]


### Use Counter for human understandable texts
### Use the doc2bow or dictionary for the 

---

### TfIdf Vectorizer



### Lets use the tfidf vectorizer in one random article of the Wikipedia and then figure out the weight of the article and also how much topic oriented the article is ! 



In [143]:
from gensim.models import TfidfModel

random_article = """So is this swearing or no swearing?” In a darkened soundstage on the outskirts of London, Abel Tesfaye is wondering if he can say “fuck” or not. Tesfaye, better known as breakout pop sensation the Weeknd, is at a rehearsal for Later...With Jools Holland, the BBC music show, about to soundcheck his smash hit “The Hills,” a four-minute horror-movie booty call featuring more than a dozen f-bombs. For Tesfaye, that’s relatively clean, but he knows the pensioners in Twickenham might disagree. So when the verdict comes back “no swearing,” he nods and smoothly pivots to a censored version — a small gesture that says a lot about the kind of professional he has become.

“The Hills” is currently enjoying its fourth straight week at Number One, a feat made even more impressive because it took the place of another Weeknd track, “Can’t Feel My Face” — Spotify’s official song of the summer, and the only song about cocaine ever to be lip-synced by Tom Cruise on network TV. Tesfaye is just the 12th artist in history to score back-to-back Number Ones, a group that includes Elvis Presley, the Beatles and Taylor Swift. His new album, Beauty Behind the Madness, has sold more than half a million copies in a couple of months, and he’s preparing to launch a national arena tour in November. “I’m still digesting it, to be honest with you,” Tesfaye says of his success. “But the screams keep getting louder, dude.”


Tesfaye comes over to say hi, dressed in black Levi’s and a Roots hoodie, his tsunami of hair piled high atop his head. “Sorry, I’m sick,” he says, as his handshake becomes a fist bump in midair. Since starting this promo tour a week ago, he’s been to Las Vegas, Paris, Berlin and now London. The cold caught up with him yesterday, during a signing for 500 squealing fans at the Oxford Circus HMV. (Overheard: “I wanted to hug him!” “You didn’t hug him? I kissed him!”)


Go behind the scenes of the Weeknd’s performance on Saturday Night Live in our exclusive mini-documentary below:

This scene would not have seemed possible in 2011, when the Weeknd appeared with a trio of cult-favorite mixtapes that established both his sonic template — drug-drenched, indie-rock-sampling, sex-dungeon R&B — and his mysterious, brooding persona. A press-shy Ethiopian kid from Toronto who has given only a handful of interviews, he has cultivated a near-mythical image as a bed-hopping, pill-popping, chart-topping cipher. “We live in an era when everything is so excessive, I think it’s refreshing for everybody to be like, ‘Who the fuck is this guy?'” Tesfaye says. “I think that’s why my career is going to be so long: Because I haven’t given people everything.”""" ## this is the article we will be using 


sentences = re.split(r'[.?!]', random_article)
sentences = [s.strip() for s in sentences if s.strip() != ""]
## at first lets tokenize the whole article

tokenized_article = [word_tokenize(t) for t in sentences]


## Lets remove non alphabetical characters

refined_tokenized_article = [[t for t in each if t.isalpha()] for each in tokenized_article]


## Lets remove the common english words such as no, or, the, and , an, a

removal_of_english_characters = [[t for t in each_refined if t not in stopwords.words('english')] for each_refined in refined_tokenized_article]



## word lemmatizer 

word_lemmatizer = WordNetLemmatizer()

lemmatized_words = [[word_lemmatizer.lemmatize(t) for t in each] for each in removal_of_english_characters]


## Now lets count the total occurance using the dictionary 

dictionary = Dictionary(lemmatized_words)

### lets give it id and make it bag of words

corpus = [dictionary.doc2bow(sentence) for sentence in lemmatized_words]
tfidf = TfidfModel(corpus)
tfidf_weights = tfidf[corpus[23]]

for word_id, weight in tfidf_weights:
    print(f"{dictionary[word_id]} -> {weight:.4f}")

[['So', 'is', 'this', 'swearing', 'or', 'no', 'swearing'], ['”', 'In', 'a', 'darkened', 'soundstage', 'on', 'the', 'outskirts', 'of', 'London', ',', 'Abel', 'Tesfaye', 'is', 'wondering', 'if', 'he', 'can', 'say', '“', 'fuck', '”', 'or', 'not'], ['Tesfaye', ',', 'better', 'known', 'as', 'breakout', 'pop', 'sensation', 'the', 'Weeknd', ',', 'is', 'at', 'a', 'rehearsal', 'for', 'Later'], ['With', 'Jools', 'Holland', ',', 'the', 'BBC', 'music', 'show', ',', 'about', 'to', 'soundcheck', 'his', 'smash', 'hit', '“', 'The', 'Hills', ',', '”', 'a', 'four-minute', 'horror-movie', 'booty', 'call', 'featuring', 'more', 'than', 'a', 'dozen', 'f-bombs'], ['For', 'Tesfaye', ',', 'that', '’', 's', 'relatively', 'clean', ',', 'but', 'he', 'knows', 'the', 'pensioners', 'in', 'Twickenham', 'might', 'disagree'], ['So', 'when', 'the', 'verdict', 'comes', 'back', '“', 'no', 'swearing', ',', '”', 'he', 'nods', 'and', 'smoothly', 'pivots', 'to', 'a', 'censored', 'version', '—', 'a', 'small', 'gesture', 'that'