# 09-02 — Processing our language before feeding it into our models

# Natural language processing ( NLP )


In [13]:
import string

In [14]:
# Sample sentence with mixed case, punctuation and a hashtag, used by the cleaning steps below.
text = (
    "Ugh... The deliveries were DELAYED! "
    "I seriously hate waiting... #annoyed"
)

In [15]:
# 1. Normalise case: convert every letter to lower case (upper case would work too,
#    as long as the same case is applied to the whole text).
text_lower = text.lower()
text_lower

'ugh... the deliveries were delayed! i seriously hate waiting... #annoyed'

In [16]:
# 2. Remove punctuation using string translation.
#    str.maketrans('', '', chars) builds a table that deletes every character in `chars`;
#    here `chars` is string.punctuation.
translator = str.maketrans('','',string.punctuation)
clean_text = text_lower.translate(translator)

In [17]:
# Show the raw sentence next to its cleaned version.
for label, value in (("Original: ", text), ("Cleaned: ", clean_text)):
    print(label, value)

Original:  Ugh... The deliveries were DELAYED! I seriously hate waiting... #annoyed
Cleaned:  ugh the deliveries were delayed i seriously hate waiting annoyed


# Tokenization


In [12]:
# A whole sentence cannot be processed in one go; like us, a model reads it piece by piece, so the text first has to be split up.
# Tokenization is the foundational process of breaking raw text down into smaller, meaningful units called tokens, such as words, sub-words, or characters.

In [13]:
# METHOD A: plain whitespace split. str.split() with no arguments breaks the string on runs of whitespace.
tokens = clean_text.split()

In [14]:
# Show the whitespace-split tokens and how many there are.
token_count = len(tokens)
print("Tokens: ", tokens)
print("Token Count: ", token_count)

Tokens:  ['ugh', 'the', 'deliveries', 'were', 'delayed', 'i', 'seriously', 'hate', 'waiting', 'annoyed']
Token Count:  10


In [15]:
# Just as we cannot take in a whole sentence at once, we cannot read a long word like "microtransactional" in one go; we break it into parts. Sub-word tokenization is the more sophisticated way of letting a computer do the same with our language.

In [16]:
# METHOD B:
from transformers import AutoTokenizer

In [17]:
# We use the pretrained BERT tokenizer (described in this notebook as the industry standard).
# NOTE: the variable is spelled `tokensizer`; the later cells that call it use the same spelling.
tokensizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# A sentence with long, uncommon words, used to show how they are split into sub-words.
complex_text = "The microtransactional system was counterintuitive."

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
# Sub-word tokenization of the complex sentence; pieces starting with '##' continue the previous piece.
complex_tokens = tokensizer.tokenize(complex_text)
print("Tokens: ", complex_tokens)

Tokens:  ['the', 'micro', '##tra', '##ns', '##act', '##ional', 'system', 'was', 'counter', '##int', '##uit', '##ive', '.']


In [24]:
# Let's try the same BERT tokenizer on the simple, cleaned sentence now.
# The `tokensizer` loaded earlier in the notebook is reused here: loading
# 'bert-base-uncased' a second time would only rebuild an identical tokenizer.
print("Tokens: ",tokensizer.tokenize(clean_text))

Tokens:  ['u', '##gh', 'the', 'deliveries', 'were', 'delayed', 'i', 'seriously', 'hate', 'waiting', 'annoyed']


In [1]:
# The BERT tokenizer used above ('bert-base-uncased') does not understand our language,
# so for Hindi text we use the tokenizer from the Indic NLP library instead.
# Building a good tokenizer for a language is costly, which is why many countries
# still struggle to get proper support for their own languages.
# !pip install indic-nlp-library
from indicnlp.tokenize import indic_tokenize

# Dedicated names, so the English `text` / `tokens` used by the other cells are not
# silently overwritten with Hindi data when cells are re-run.
hindi_text = 'मैं आज खुश हूँ'
hindi_tokens = indic_tokenize.trivial_tokenize(hindi_text)
print(hindi_tokens)

['मैं', 'आज', 'खुश', 'हूँ']


# STOP WORD REMOVAL


In [12]:
# Stop words (THE, IS, I, AM, ...) carry little meaning on their own; they mostly
# exist to give a sentence its grammatical structure, so they are filtered out.
import nltk
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# METHOD A
# Same clean-up as in the earlier cells: lower-case, strip punctuation, whitespace split.
text = "Ugh... The deliveries were DELAYED! I seriously hate waiting... #annoyed"
text_lower = text.lower()
translator = str.maketrans('', '', string.punctuation)
clean_text = text_lower.translate(translator)
tokens = clean_text.split()

# Keep only the tokens that are not in the stop-word set.
filtered_tokens = list(filter(lambda word: word not in stop_words, tokens))
print("Input: ", tokens)
print("Result: ", filtered_tokens)

Input:  ['ugh', 'the', 'deliveries', 'were', 'delayed', 'i', 'seriously', 'hate', 'waiting', 'annoyed']
Result:  ['ugh', 'deliveries', 'delayed', 'seriously', 'hate', 'waiting', 'annoyed']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# STEMMING VS. LEMMATIZATION.


In [18]:
# Stemming      -> chops the end off a word to approximate its root; the result may not be a real word (e.g. "deliveri").
# Lemmatization -> looks up the dictionary base form of the word, so the result keeps its meaning (e.g. "delivery").
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["deliveries", "waiting", "delayed", "studies"]

# One row per word: original | stem | lemma.
# NOTE(review): lemmatize() is called without a part-of-speech argument, which is presumably
# why "waiting" and "delayed" come back unchanged — confirm against the NLTK docs.
for word in words:
    stem = stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word)
    print(word, "|", stem, "|", lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...


deliveries | deliveri | delivery
waiting | wait | waiting
delayed | delay | delayed
studies | studi | study


# VECTORIZATION ( TEXT TO NUMBERS )


In [None]:
# Numbering words with arbitrary integers (label encoding) introduces bias, because the numbers imply an order that does not exist.
# One-hot encoding avoids that, but we soon realise it creates one column per distinct word, which takes up a lot of space.

# HERE IT COMES -> BAG OF WORDS
# Assumption: sentences that contain similar words carry similar meaning.
# BIG PROBLEM WITH BAG OF WORDS
# 1. Word order is ignored: for this system "DOG BITES MAN" is the same as "MAN BITES DOG", although in reality the meanings are different.

In [28]:
# VECTOR EMBEDDINGS:
# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer

# 1. Load the pretrained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Give it the sentences to embed; embeddings[i] is the vector for words[i]
words = ["She is courageous",'She acts strong']
embeddings = model.encode(words)

# 3. See the vectors: pair every sentence with its own embedding rather than
#    hard-coding the number of inputs, and show only the first 3 values of each.
for sentence, vector in zip(words, embeddings):
  print("Word: ", sentence)
  print("Vector: ", vector[:3])

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Word:  She is courageous
Vector:  [ 0.02741979  0.05577177 -0.0109375 ]
Word:  She acts strong
Vector:  [ 0.00624397 -0.11964463  0.01222851]


In [29]:
# Cosine similarity tells us how similar the two embedded sentences are.
from sentence_transformers import util

first_embedding, second_embedding = embeddings[0], embeddings[1]
util.cos_sim(first_embedding, second_embedding)

tensor([[0.5867]])