#09-02 ___ PROCESSING OUR LANGUAGE BEFORE FEEDING INTO OUR MODELS.

# Natural language processing ( NLP )


In [None]:
import string

In [None]:
# Sample sentence to clean: it mixes upper/lower case, punctuation and a hashtag.
text = "Ugh... The deliveries were DELAYED! I seriously hate waiting... #annoyed"

In [None]:
# 1. Normalise case: convert every letter to lower case (upper case would work
#    too, as long as the whole text ends up in one case).
text_lower = text.lower()
text_lower  # bare expression so the notebook displays the result

In [None]:
# 2. Strip punctuation: build a translation table that deletes every character
#    listed in string.punctuation, then apply it to the lower-cased text.
translator = str.maketrans("", "", string.punctuation)
clean_text = text_lower.translate(translator)

In [None]:
# Compare the raw text with its cleaned version.
print("Original: ", text)
print("Cleaned: ", clean_text)

# Tokenization


In [None]:
# We cannot read a whole sentence at once; we read it word by word, so the text has to be split up in the same way.
# Tokenization is the foundational process of breaking down raw text into smaller, meaningful units called tokens, such as words, sub-words, or characters.

In [None]:
# METHOD A: split the cleaned text on whitespace with str.split().
tokens = clean_text.split()

In [None]:
# Show the tokens and how many there are.
print("Tokens: ", tokens)
print("Token Count: ", len(tokens))

In [None]:
# Just as we cannot read a whole sentence in one go, we cannot read a long word like "microtransactional" in one go either, so a more sophisticated tokenizer is the main way to let the computer understand our language too.

In [None]:
# METHOD B:
from transformers import AutoTokenizer

In [None]:
# We use the BERT tokenizer (a standard in industry), loaded from the
# pretrained 'bert-base-uncased' model.
tokensizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# A sentence with long, uncommon words to show how the tokenizer handles them.
complex_text = "The microtransactional system was counterintuitive."

In [None]:
# Print the tokens the BERT tokenizer produces for the complex sentence.
print("Tokens: ",tokensizer.tokenize(complex_text))

In [None]:
# Let's try with a simple sentence now!
# NOTE(review): this loads the same 'bert-base-uncased' tokenizer as the earlier
# cell; the reload is redundant when that cell has already been run.
tokensizer = AutoTokenizer.from_pretrained('bert-base-uncased')
print("Tokens: ",tokensizer.tokenize(clean_text))

In [None]:
# The tokenizer above cannot understand our language (see the Hindi sentence
# below), so we use a library made for Indic languages instead.
# Using a tokenizer on such languages is costly, which is why many countries
# struggle with it.
# Uncomment the next line to install the library:
# !pip install indic-nlp-library
from indicnlp.tokenize import indic_tokenize

# NOTE: this overwrites the `text` and `tokens` variables set by earlier cells.
text = 'मैं आज खुश हूँ'
tokens = indic_tokenize.trivial_tokenize(text)
print(tokens)

# STOP WORD REMOVAL


In [None]:
# Stop words (THE, IS, I, AM, ...) carry little meaning on their own; they are
# mostly there to give a sentence its structure, so we filter them out.
import nltk
import string
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then keep the English list as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# METHOD A
# Rebuild the cleaned tokens: lower-case, strip punctuation, split on whitespace.
text = "Ugh... The deliveries were DELAYED! I seriously hate waiting... #annoyed"
text_lower = text.lower()
translator = str.maketrans("", "", string.punctuation)
clean_text = text_lower.translate(translator)
tokens = clean_text.split()

# Keep only the tokens that are not in the stop-word set.
filtered_tokens = [token for token in tokens if token not in stop_words]
print("Input: ", tokens)
print("Result: ", filtered_tokens)

# STEMMING VS. LEMMATIZATION.


In [None]:
# Stemming -> chops word endings by rule to reach a root form; the result may not be a real word.
# Lemmatization -> looks up the dictionary root form (lemma), so the result is a real word.
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["deliveries", "waiting", "delayed", "studies"]
# WordNet part of speech for each word above ("n" = noun, "v" = verb).
# lemmatize() treats every word as a noun unless told otherwise, which would
# leave the verbs "waiting" and "delayed" unchanged.
pos_tags = ["n", "v", "v", "n"]

# Print: word | stem | lemma
for w, pos in zip(words, pos_tags):
    print(w, "|", stemmer.stem(w), "|", lemmatizer.lemmatize(w, pos=pos))

# VECTORIZATION ( TEXT TO NUMBERS )


In [None]:
# Numbering words directly (label encoding) introduces bias, because the numbers suggest an order that is not really there.
# One-hot encoding avoids this, but we soon realise that it creates one column per word, which takes up a lot of space.

# HERE IT COMES -> BAG OF WORDS
# It assumes that sentences containing similar words carry similar meaning.
# BIG PROBLEMS OF BAG OF WORDS
# 1. Word order is ignored, so for this system "DOG BITES MAN" is the same as "MAN BITES DOG", although in reality they mean different things.

In [None]:
# VECTOR EMBEDDINGS:
# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer

# 1. Load the pretrained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Give it the sentences to embed
words = ["She is courageous",'She acts strong']
embeddings = model.encode(words)

# 3. See the vectors (only the first three values of each one).
#    Iterating over the pairs covers every sentence in `words`, not just two.
for sentence, vector in zip(words, embeddings):
    print("Word: ", sentence)
    print("Vector: ", vector[:3])

In [None]:
# COS_SIM (cosine similarity) IS USED TO TELL HOW SIMILAR OUR TWO SENTENCES ARE.
from sentence_transformers import util
# Compare the embeddings of the two sentences; the notebook displays the result.
util.cos_sim(embeddings[0],embeddings[1])

In [55]:
import json

# change filename if needed
nb_file = "NLP.ipynb"

# Load the notebook as plain JSON.
with open(nb_file, "r", encoding="utf-8") as f:
    nb = json.load(f)

# Remove the notebook-level widgets metadata if present. Using .get() avoids a
# KeyError on a notebook that has no "metadata" key at all.
nb.get("metadata", {}).pop("widgets", None)

# Also reset every cell: empty its metadata and drop all stored outputs
# (which includes any widget outputs).
for cell in nb.get("cells", []):
    # The nbformat v4 schema requires every cell to carry a "metadata" key,
    # so empty it rather than deleting it.
    cell["metadata"] = {}
    if "outputs" in cell:
        cell["outputs"] = []

with open("NLP_CLEAN.ipynb", "w", encoding="utf-8") as f:
    json.dump(nb, f, indent=2)

print("✅ Clean notebook saved as NLP_CLEAN.ipynb")

In [57]:
# List the working directory (presumably to confirm NLP_CLEAN.ipynb was written).
!ls