In [1]:
filtered_tokens = ['Hello!','This','is','a','test','Let\'s','remove','punctuation!']

import string
punctuation_remove = ["".join(char for char in word if char not in string.punctuation) for word in filtered_tokens]
punctuation_remove = [word for word in punctuation_remove if word]
print(punctuation_remove)

['Hello', 'This', 'is', 'a', 'test', 'Lets', 'remove', 'punctuation']


In [2]:
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^A-Za-z\s]','',text)
    return cleaned_text

paragraph = "Here's a paragraph with numbers 123 and special characters! Like @#$%."
print(clean_text(paragraph))  # Output: "Here's a paragraph with numbers and special characters

Heres a paragraph with numbers  and special characters Like 


### **Remove Whitespace**

In [3]:
text_with_whitespace = " This is an example    sentences with   extra spaces. "

" ".join(text_with_whitespace.split())

'This is an example sentences with extra spaces.'

### **Tokensation**

In [4]:
import nltk

from nltk.tokenize import word_tokenize
sentence = "Natural Language Processing With NLTK is fun and exciting!"
word_tokenize(sentence)

['Natural',
 'Language',
 'Processing',
 'With',
 'NLTK',
 'is',
 'fun',
 'and',
 'exciting',
 '!']

In [5]:
print(word_tokenize(sentence))

['Natural', 'Language', 'Processing', 'With', 'NLTK', 'is', 'fun', 'and', 'exciting', '!']


In [6]:
from nltk.tokenize import sent_tokenize
sent_tokenize(sentence)

['Natural Language Processing With NLTK is fun and exciting!']

In [7]:
sentence1 = "**Sentence tokenization** is the process of dividing a text or document into individual sentences. It is a crucial step in natural language processing (NLP) that helps in understanding and analyzing text at the sentence level. By splitting text into manageable units, sentence tokenization enables various downstream tasks such as text summarization, sentiment analysis, and question answering. For instance, in machine translation or chatbots, identifying sentence boundaries ensures accurate translations or responses. Additionally, it is especially useful in preprocessing tasks like extracting keywords, performing entity recognition, or segmenting large paragraphs for detailed analysis. Tools like NLTK, spaCy, and Hugging Face libraries often include pre-built sentence tokenization methods for efficient implementation."
sent_tokenize(sentence1)

['**Sentence tokenization** is the process of dividing a text or document into individual sentences.',
 'It is a crucial step in natural language processing (NLP) that helps in understanding and analyzing text at the sentence level.',
 'By splitting text into manageable units, sentence tokenization enables various downstream tasks such as text summarization, sentiment analysis, and question answering.',
 'For instance, in machine translation or chatbots, identifying sentence boundaries ensures accurate translations or responses.',
 'Additionally, it is especially useful in preprocessing tasks like extracting keywords, performing entity recognition, or segmenting large paragraphs for detailed analysis.',
 'Tools like NLTK, spaCy, and Hugging Face libraries often include pre-built sentence tokenization methods for efficient implementation.']

In [8]:
import nltk
from nltk.tokenize import RegexpTokenizer

# Example text
text = "Hello!"

# Character tokenization using NLTK
char_tokenizer = RegexpTokenizer(r'.')  # '.' matches any character
character_tokens = char_tokenizer.tokenize(text)

# Print the result
print(character_tokens)


['H', 'e', 'l', 'l', 'o', '!']


In [9]:
# Example text
text = "Hello!"

# Character tokenization
character_tokens = list(text)

# Print the result
print(character_tokens)


['H', 'e', 'l', 'l', 'o', '!']


## **Stop-Words**

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "This is an example sentence, demonstrating the removal of stop words."
tokens = word_tokenize(text)

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['example', 'sentence', ',', 'demonstrating', 'removal', 'stop', 'words', '.']


In [11]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Example text
text = "Hello, this is a simple example!"

# Character tokenization using NLTK
char_tokenizer = RegexpTokenizer(r'.')  # '.' matches any character
character_tokens = char_tokenizer.tokenize(text)

# Load stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_tokens = [char for char in character_tokens if char.lower() not in stop_words and char.isalpha()]

# Print the result
print("Original Tokens:", character_tokens)
print("Filtered Tokens (Stopwords Removed):", filtered_tokens)


Original Tokens: ['H', 'e', 'l', 'l', 'o', ',', ' ', 't', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'i', 'm', 'p', 'l', 'e', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '!']
Filtered Tokens (Stopwords Removed): ['H', 'e', 'l', 'l', 'h', 'p', 'l', 'e', 'e', 'x', 'p', 'l', 'e']


## **Stemming and Lemmatization**

In [12]:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["running","runner","ran","went","easily","fairly"]
s = [stemmer.stem(word) for word in words]
print(s)

['run', 'runner', 'ran', 'went', 'easili', 'fairli']


In [13]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = ["running","runner","ran","went","easily","fairly"] 
l = [lemmatizer.lemmatize(word) for word in words]
print(l)

['running', 'runner', 'ran', 'went', 'easily', 'fairly']


## **Text Normalization and Segmentation**

In [14]:
paragraph = "This is an Example Pragraph with Mixed CASE letters"
print(paragraph.lower())

this is an example pragraph with mixed case letters


In [15]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Natural language is an intresting field. It has many applications in AI"

sent_tokenize(text)

['Natural language is an intresting field.', 'It has many applications in AI']

In [16]:
word = word_tokenize(text)
print(word)

['Natural', 'language', 'is', 'an', 'intresting', 'field', '.', 'It', 'has', 'many', 'applications', 'in', 'AI']


1. Text Normalization
- Definition:
Text normalization is the process of transforming raw text into a consistent, structured format. The key steps include:

1. Converting text to lowercase.
2. Removing punctuation and special characters.
3. Tokenizing the text into meaningful units (words or sentences).
4. Removing stopwords.
5. Stemming or lemmatizing words.

- Why Normalize?
Normalization helps reduce variability in textual data, ensuring that semantically similar text is treated consistently. For example, "running," "ran," and "runs" can be normalized to their root form "run."

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



# Sample text
text = "NLTK is a powerful library for natural language processing. It simplifies preprocessing!"

# Convert to lowercase
text = text.lower()

# Tokenize into words
tokens = word_tokenize(text)

# Remove punctuation and stopwords
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

# Stemming
stemmer = PorterStemmer()
normalized_tokens = [stemmer.stem(word) for word in tokens]

print("Normalized Tokens:", normalized_tokens)


Normalized Tokens: ['nltk', 'power', 'librari', 'natur', 'languag', 'process', 'simplifi', 'preprocess']


2. Text Segmentation
- Definition:
Text segmentation is the process of dividing text into smaller units, such as sentences or paragraphs (sentence segmentation) or words (tokenization).

- Types of Segmentation:
1. Sentence Segmentation: Breaking a text into individual sentences.
2. Word Segmentation: Splitting sentences into individual words.

- Why Segment?
Segmentation helps in analyzing text at different granularities. Sentence segmentation is crucial for tasks like summarization, while word segmentation is essential for tasks like sentiment analysis

## **PoS and NER**

In [18]:
import nltk
import numpy as nnp
from nltk.tokenize import word_tokenize

sentence = "The quick brown fox jumps over the lazy dog."

tokens = word_tokenize(sentence)
tag = nltk.pos_tag(tokens)
tag

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]

In [19]:
import nltk
from nltk import word_tokenize, pos_tag, RegexpParser, ne_chunk

sentence = "Apple is looking at buying U.K. startup for $1 billinon."

token = word_tokenize(sentence)
token

['Apple',
 'is',
 'looking',
 'at',
 'buying',
 'U.K.',
 'startup',
 'for',
 '$',
 '1',
 'billinon',
 '.']

In [20]:
tag = pos_tag(token)
tag

[('Apple', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('$', '$'),
 ('1', 'CD'),
 ('billinon', 'NN'),
 ('.', '.')]

In [21]:
ner = ne_chunk(tag)
print(ner)

(S
  (GPE Apple/NNP)
  is/VBZ
  looking/VBG
  at/IN
  buying/VBG
  U.K./NNP
  startup/NN
  for/IN
  $/$
  1/CD
  billinon/NN
  ./.)


## **One-Hot-Encoding**

In [22]:
import numpy as np 

def one_hot_encode(word,vocab):
    vector = np.zeros(len(vocab))
    vector[vocab.index(word)] = 1
    return vector

vocab = ["I","Love","Machine","Learning"]
one_hot = one_hot_encode("Love",vocab)
print(one_hot)

[0. 1. 0. 0.]


In [23]:
import pandas as pd

data = {'Color':['Red','Blue','Green','Red','Green']}
df = pd.DataFrame(data)

print(df)

   Color
0    Red
1   Blue
2  Green
3    Red
4  Green


In [24]:
one_hot_encode = pd.get_dummies(df,columns=['Color']).astype(int)
print(one_hot_encode)

   Color_Blue  Color_Green  Color_Red
0           0            0          1
1           1            0          0
2           0            1          0
3           0            0          1
4           0            1          0


In [25]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Define the vocabulary
vocabulary = ["apple", "banana", "orange"]

# Convert the vocabulary to a column vector
vocab_array = np.array(vocabulary).reshape(-1, 1)

print("Vocab Array:")
print(vocab_array)

# Initialize the OneHotEncoder
one_hot_encode = OneHotEncoder(sparse_output=False)

# Fit the encoder to the vocabulary
one_hot_encode.fit(vocab_array)

# Transform the vocabulary to one-hot encoded vectors
one_hot_vectors = one_hot_encode.transform(vocab_array)

# Display the one-hot encoded vectors
for word, vec in zip(vocabulary, one_hot_vectors):
    print(word, vec)


Vocab Array:
[['apple']
 ['banana']
 ['orange']]
apple [1. 0. 0.]
banana [0. 1. 0.]
orange [0. 0. 1.]


## **Bag of words**

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "I love machine learning",
    "Machine learning is awesome"
]

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()
feature_names

array(['awesome', 'is', 'learning', 'love', 'machine'], dtype=object)

In [27]:
dense = x.todense()
denselist = dense.tolist()

df = pd.DataFrame(denselist,columns=feature_names)
df

Unnamed: 0,awesome,is,learning,love,machine
0,0,0,1,1,1
1,1,1,1,0,1


## **TF-IDF**

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = [
    "I love machine learning",
    "Machine learning is awesome"
]

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
print(x)
print("---------------------------------")

feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print("------------------------------------")

dense = x.todense()
denselist = dense.tolist()

df = pd.DataFrame(denselist,columns=feature_names)
print(df)
print("-------------------------------------------------")

  (0, 3)	0.7049094889309326
  (0, 4)	0.5015489070943787
  (0, 2)	0.5015489070943787
  (1, 4)	0.40993714596036396
  (1, 2)	0.40993714596036396
  (1, 1)	0.5761523551647353
  (1, 0)	0.5761523551647353
---------------------------------
['awesome' 'is' 'learning' 'love' 'machine']
------------------------------------
    awesome        is  learning      love   machine
0  0.000000  0.000000  0.501549  0.704909  0.501549
1  0.576152  0.576152  0.409937  0.000000  0.409937
-------------------------------------------------


## **Word2Vec**

In [29]:
from gensim.models import Word2Vec

# Input sentences
sentences = [["The", "cat", "sat", "on", "the", "mat"],
             ["the", "dog", "barked", "at", "the", "cat"]]

# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=2, min_count=1, sg=0)

# Print the model details
print(model.wv['sat'])


[ 8.1681199e-03 -4.4430327e-03  8.9854337e-03  8.2536647e-03
 -4.4352221e-03  3.0310510e-04  4.2744912e-03 -3.9263200e-03
 -5.5599655e-03 -6.5123225e-03 -6.7073823e-04 -2.9592158e-04
  4.4630850e-03 -2.4740540e-03 -1.7260908e-04  2.4618758e-03
  4.8675989e-03 -3.0808449e-05 -6.3394094e-03 -9.2608072e-03
  2.6657581e-05  6.6618943e-03  1.4660227e-03 -8.9665223e-03
 -7.9386048e-03  6.5519023e-03 -3.7856805e-03  6.2549924e-03
 -6.6810320e-03  8.4796622e-03 -6.5163244e-03  3.2880199e-03
 -1.0569858e-03 -6.7875278e-03 -3.2875966e-03 -1.1614120e-03
 -5.4709399e-03 -1.2113475e-03 -7.5633135e-03  2.6466595e-03
  9.0701487e-03 -2.3772502e-03 -9.7651005e-04  3.5135616e-03
  8.6650876e-03 -5.9218528e-03 -6.8875779e-03 -2.9329848e-03
  9.1476962e-03  8.6626766e-04 -8.6784009e-03 -1.4469790e-03
  9.4794659e-03 -7.5494875e-03 -5.3580985e-03  9.3165627e-03
 -8.9737261e-03  3.8259076e-03  6.6544057e-04  6.6607012e-03
  8.3127534e-03 -2.8507852e-03 -3.9923131e-03  8.8979173e-03
  2.0896459e-03  6.24894

In [30]:
print("The End")

The End
