# NLP 

In [20]:
# Extracting required Text from given data using Regular Expression

import re

text = """Employee phone number is 9876543210,(987)-654-4321 and his salary is 1500$ and his expenses are 500 dollars and you
        can contact them via email also. The email is abc_a@gmail.com, xyz_123@gmail.com, ccc@xyz.io and their order numbers 
        are order 41215, order id 2213, order # 1234"""

# All patterns are raw strings so that regex escapes such as \d and \( reach the
# regex engine unchanged instead of being treated as (invalid) string escapes.

# Phone numbers: ten consecutive digits, or the (ddd)-ddd-dddd form.
pattern1 = r"\d{10}|\(\d{3}\)-\d{3}-\d{4}"
# Money amounts: digits followed by "$", or digits followed by " dollars".
pattern2 = r"\d+\$|\d+ dollars"
# E-mail addresses: the dot before the top-level domain is escaped so it only
# matches a literal "."; "+" keeps the local part, domain and TLD non-empty.
pattern3 = r"[a-zA-Z0-9_]+@[a-zA-Z0-9_]+\.[a-zA-Z]+"
# Order numbers: the first run of digits that follows the word "order".
pattern4 = r"order\D*(\d+)"

match1 = re.findall(pattern1, text)
match2 = re.findall(pattern2, text)
match3 = re.findall(pattern3, text)
match4 = re.findall(pattern4, text)
match1

['9876543210', '(987)-654-4321']

In [21]:
match2

['1500$', '500 dollars']

In [22]:
match3

['abc_a@gmail.com', 'xyz_123@gmail.com', 'ccc@xyz.io']

In [23]:
match4

['41215', '2213', '1234']

In [19]:
data = '''Follow our leader Elon musk on Twitter 
        here: https://twitter.com/elonmusk, more information 
        on Tesla's products can be found at https://www.tesla.com/. 
        Also here are leading influencers for tesla-related news, 
        https://twitter.com/teslarati
        https://twitter.com/dummy_tesla
        https://twitter.com/dummy_2_tesla'''

# Capture the handle that follows a twitter.com URL.
# Raw string: ":" and "/" need no escaping, and the "." in the host name is
# escaped so it matches only a literal dot. "+" rejects an empty handle.
pattern = r"https://twitter\.com/([a-zA-Z0-9_]+)"
match = re.findall(pattern, data)
match

['elonmusk', 'teslarati', 'dummy_tesla', 'dummy_2_tesla']

In [44]:
text = '''Name: Marta Sharapova Date: 5/11/2022

    Address: 9 tennis court, new Russia, DC

    Prednisone 20 mg
    Lialda 2.4 gram

    Directions:

    Prednisone, Taper 5 mg every 3 days,
    Finish in 2.5 weeks a
    Lialda - take 2 pill everyday for 1 month

    Refill: 2 times'''

# One pattern per field of the prescription.
pattern1 = "Name:(.*)Date"                    # patient name, same line as the date
pattern2 = "Address:(.*)"                     # rest of the address line
pattern3 = "Address[^\n]*(.*)Directions"      # medicines: between the address line and "Directions"
pattern4 = "Directions:[^\n]*(.*)Refill"      # directions: between "Directions:" and "Refill"
pattern5 = "Refill:(.*)"                      # rest of the refill line

# Single-line fields use the default mode, where "." stops at a newline.
match1 = re.compile(pattern1).findall(text)
match2 = re.compile(pattern2).findall(text)
match5 = re.compile(pattern5).findall(text)

# Multi-line fields need DOTALL so that "." also spans newlines.
match3 = re.compile(pattern3, re.DOTALL).findall(text)
match4 = re.compile(pattern4, re.DOTALL).findall(text)

match1[0].strip()

'Marta Sharapova'

In [45]:
match2[0].strip()

'9 tennis court, new Russia, DC'

In [46]:
match3[0].strip()

'Prednisone 20 mg\n    Lialda 2.4 gram'

In [48]:
match4[0].strip()

'Prednisone, Taper 5 mg every 3 days,\n    Finish in 2.5 weeks a\n    Lialda - take 2 pill everyday for 1 month'

In [50]:
match5[0].strip()

'2 times'

In [6]:
# Tokenization in spacy

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Python is most widely used language in the world at no.1 position.")

for token in doc:
    print(token)

Python
is
most
widely
used
language
in
the
world
at
no.1
position
.


In [15]:
text = '''virat@gmail.com
         virat kohli
         rohit@email.com
         Rohit sharma'''

doc = nlp(text)
# Collect the text of every token that spaCy flags as e-mail-like.
email = [tok.text for tok in doc if tok.like_email]
email

['virat@gmail.com', 'rohit@email.com']

In [22]:
# Extracting all urls from a given text

text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)

# Collect the text of every token that spaCy flags as URL-like. The token text
# is stored as-is, so any punctuation spaCy left attached to it is kept.
url = [tok.text for tok in doc if tok.like_url]

url

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [23]:
data_websites = [token.text for token in doc if token.like_url]
data_websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [20]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc = nlp(transactions)
# Print each number-like token that is immediately followed by a currency symbol.
# The last token has no successor, so iterate over doc[:-1]; otherwise the
# look-ahead doc[token.i + 1] raises IndexError when the text ends in a number.
for token in doc[:-1]:
    following = doc[token.i + 1]
    if token.like_num and following.is_currency:
        print(token.text, following.text)

two $
500 €


In [31]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [26]:
# NLP Pipeline

doc = nlp("Python is most widely used language in the world at no.1 position.")

for token in doc:
    print(token, "|",token.pos_,"|",token.lemma_)

Python | PROPN | Python
is | AUX | be
most | ADV | most
widely | ADV | widely
used | VERB | use
language | NOUN | language
in | ADP | in
the | DET | the
world | NOUN | world
at | ADP | at
no.1 | ADJ | no.1
position | NOUN | position
. | PUNCT | .


In [35]:
doc = nlp("Tesla Inc bought twitter for $45 billion")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Tesla Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [40]:
text = "Microsoft was founded by Bill Gates and Paul Allen in Albuquerque on April 4, 1975."
doc = nlp(text)

for ent in doc.ents:
    print(ent.text,"|", ent.label_,"|", spacy.explain(ent.label_))

Microsoft | ORG | Companies, agencies, institutions, etc.
Bill Gates | PERSON | People, including fictional
Paul Allen | PERSON | People, including fictional
Albuquerque | GPE | Countries, cities, states
April 4, 1975 | DATE | Absolute or relative dates or periods


In [47]:
# Stemming using NLTK

from nltk.stem import PorterStemmer

words = ["eating","eats","eat","ate","adjustable","rafting","ability","meeting"]
stemmer = PorterStemmer()

for word in words:
    print(word, "|", stemmer.stem(word))

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [52]:
# lemmatization using spacy

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")

for token in doc:
    print(token, "|", token.lemma_)

eating | eat
eats | eat
eat | eat
ate | eat
adjustable | adjustable
rafting | raft
ability | ability
meeting | meeting
better | well


In [54]:
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

for token in doc:
    print(token, "|", token.lemma_)

Bro | bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [58]:
ar = nlp.get_pipe("attribute_ruler")
ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]], {"LEMMA":"Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

for token in doc:
    print(token, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [62]:
# NLP Parts Of Speech Tagging

ca = nlp("India has succesfully launched chandrayaan-3 from satish dhawan space center")

for token in ca:
    print(token, "|", token.pos_, "|", spacy.explain(token.pos_))

India | PROPN | proper noun
has | AUX | auxiliary
succesfully | ADV | adverb
launched | VERB | verb
chandrayaan-3 | PROPN | proper noun
from | ADP | adposition
satish | PROPN | proper noun
dhawan | PROPN | proper noun
space | NOUN | noun
center | NOUN | noun


In [66]:
doc = nlp("Wow! India won the cricket worldcup and became no.1")

for token in doc:
    print(token, "|", token.pos_, "|", token.tag_, "|", spacy.explain(token.tag_))

Wow | INTJ | UH | interjection
! | PUNCT | . | punctuation mark, sentence closer
India | PROPN | NNP | noun, proper singular
won | VERB | VBD | verb, past tense
the | DET | DT | determiner
cricket | NOUN | NN | noun, singular or mass
worldcup | NOUN | NN | noun, singular or mass
and | CCONJ | CC | conjunction, coordinating
became | VERB | VBD | verb, past tense
no.1 | ADJ | JJ | adjective (English), other noun-modifier (Chinese)


In [70]:
learning_text="""Microsoft Corp. today announced the following results for the quarter ended December 31, 2021, as compared to the corresponding period of last fiscal year:

·         Revenue was $51.7 billion and increased 20%
·         Operating income was $22.2 billion and increased 24%
·         Net income was $18.8 billion and increased 21%
·         Diluted earnings per share was $2.48 and increased 22%
“Digital technology is the most malleable resource at the world’s disposal to overcome constraints and reimagine everyday work and life,” said Satya Nade"lla, chairman and chief executive officer of Microsoft. “As tech as a percentage of global GDP continues to increase, we are innovating and investing across diverse and growing markets, with a common underlying technology stack and an operating model that reinforces a common strategy, culture, and sense of purpose.”
“Solid commercial execution, represented by strong bookings growth driven by long-term Azure commitments, increased Microsoft Cloud revenue"""

doc = nlp(learning_text)

# Drop tokens tagged as punctuation, whitespace or "X"; keep everything else.
filtered_tokens = [tok for tok in doc if tok.pos_ not in ("PUNCT", "SPACE", "X")]

In [74]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 10,
 92: 40,
 100: 23,
 90: 9,
 85: 13,
 93: 13,
 97: 23,
 98: 1,
 84: 17,
 103: 10,
 87: 6,
 99: 4,
 89: 11,
 86: 2,
 94: 3,
 95: 2}

In [76]:
for k,v in count.items():
    print(doc.vocab[k].text, "|", v)

PROPN | 10
NOUN | 40
VERB | 23
DET | 9
ADP | 13
NUM | 13
PUNCT | 23
SCONJ | 1
ADJ | 17
SPACE | 10
AUX | 6
SYM | 4
CCONJ | 11
ADV | 2
PART | 3
PRON | 2


In [78]:
# Extracting NOUN and NUM parts of speech from text

text = '''Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the 
          economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based 
          measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for 
          an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the 
          summer of 1982.\n\nRemoving volatile food and ene'''

doc = nlp(text)

# Split the tokens of interest by coarse part-of-speech tag.
Noun_tokens = [tok for tok in doc if tok.pos_ == "NOUN"]
Num_tokens = [tok for tok in doc if tok.pos_ == "NUM"]

In [79]:
Num_tokens[:10]

[8.3, 8.1, 1982]

In [80]:
Noun_tokens[:10]

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices]

In [82]:
count = doc.count_by(spacy.attrs.POS)
count

{92: 25,
 100: 9,
 86: 4,
 85: 11,
 96: 7,
 97: 9,
 90: 12,
 95: 2,
 87: 3,
 89: 4,
 103: 6,
 84: 6,
 93: 3,
 94: 1,
 98: 1}

In [86]:
for k,v in count.items():
    print(doc.vocab[k].text, "|", v)

NOUN | 25
VERB | 9
ADV | 4
ADP | 11
PROPN | 7
PUNCT | 9
DET | 12
PRON | 2
AUX | 3
CCONJ | 4
SPACE | 6
ADJ | 6
NUM | 3
PART | 1
SCONJ | 1


In [1]:
# Named Entity Recognition in NLP using spacy


import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("Mukesh Ambani from India is the chairman of Reliance Industries earning $82 billion")

for ent in doc.ents:
    print(ent, "|", ent.label_, "|", spacy.explain(ent.label_))

Mukesh Ambani | PERSON | People, including fictional
India | GPE | Countries, cities, states
Reliance Industries | ORG | Companies, agencies, institutions, etc.
$82 billion | MONEY | Monetary values, including unit


In [4]:
from spacy import displacy

displacy.render(doc, style="ent")

In [103]:
text = """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal food, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc = nlp(text)

# Keep only the named entities labelled GPE.
locations = [entity for entity in doc.ents if entity.label_ == "GPE"]

print(locations)
print(len(locations))

[India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]
7


In [104]:
# Text representation - Bag of Words(BOW)

import pandas as pd
import numpy as np

In [107]:
import os
from pathlib import Path

# The dataset location can be overridden with the SPAM_HAM_CSV environment
# variable so the notebook also runs on other machines. When the variable is
# not set, the original author-local path is used unchanged.
SPAM_CSV_PATH = Path(
    os.environ.get("SPAM_HAM_CSV", r"C:\Users\boppu\OneDrive\Desktop\spam_ham_dataset.csv")
)
df = pd.read_csv(SPAM_CSV_PATH)
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [109]:
df.label.value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

In [118]:
df.shape

(5171, 3)

In [110]:
# Binary target: 1 where the label is "spam", 0 for every other label.
df["spam"] = (df["label"] == "spam").astype("int64")

In [111]:
df.head()

Unnamed: 0,label,text,spam
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [112]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the e-mails for evaluation. A fixed random_state makes the
# split, and therefore every metric computed from it, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.text, df.spam, test_size=0.2, random_state=42
)

In [120]:
x_train.shape

(4136,)

In [121]:
x_test.shape

(1035,)

In [130]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)

In [131]:
x_train_cv.shape

(4136, 44966)

In [133]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [134]:
from sklearn.metrics import classification_report

x_test_cv = cv.transform(x_test)

y_pred = model.predict(x_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       742
           1       0.97      0.95      0.96       293

    accuracy                           0.98      1035
   macro avg       0.98      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [137]:
emails = [
    "Let us play cricket",
    "upto 20% discount $10000, hurry up offer closes soon"
]

emails_count = cv.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [142]:
# sklearn pipeline

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())
])

In [143]:
clf.fit(x_train, y_train)

In [145]:
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       742
           1       0.97      0.95      0.96       293

    accuracy                           0.98      1035
   macro avg       0.98      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [2]:
# Stop words and stop-word removal

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

len(STOP_WORDS)

326

In [4]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [34]:
def preprocess(text):
    """Return ``text`` with stop words and punctuation tokens removed.

    The text is tokenised with the module-level ``nlp`` pipeline; the original
    text of each remaining token is joined with single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return " ".join(kept)

In [35]:
preprocess("We just opened our wings, the flying part is coming soon")

'opened wings flying coming soon'

In [12]:
import pandas as pd
import numpy as np

import os
from pathlib import Path

# The dataset location can be overridden with the COMBINED_JSON environment
# variable so the notebook also runs on other machines. When the variable is
# not set, the original author-local path is used unchanged.
COMBINED_JSON_PATH = Path(
    os.environ.get("COMBINED_JSON", r"C:\Users\boppu\OneDrive\Desktop\combined.json")
)
# lines=True: the file is read as one JSON object per line.
df = pd.read_json(COMBINED_JSON_PATH, lines=True)
df.shape

(13087, 6)

In [13]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [18]:
df = df[df["topics"].str.len()!=0]
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [19]:
df.shape

(4688, 6)

In [22]:
# Work on the first 100 rows only. Take an explicit copy so that adding a
# column to this subset later does not trigger pandas' SettingWithCopyWarning.
df = df.head(100).copy()
df.shape

(100, 6)

In [28]:
len(df["contents"].iloc[4])

5504

In [36]:
df["contents_new"] = df["contents"].apply(preprocess)
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [37]:
len(df["contents_new"].iloc[4])

4217

In [38]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

doc = nlp(text)

# Every token in the doc counts towards the total (no token type is excluded).
total_words_count = len(doc)
# Number of those tokens that spaCy flags as stop words.
stop_words_count = sum(1 for token in doc if token.is_stop)

In [41]:
# Interpolate the counts inside the f-strings. Passing {count} as a separate
# argument built a one-element set, which printed as "{40}" instead of "40".
print(f"Total stop words in text are {stop_words_count}")
print(f"Total words_count in text are {total_words_count}")

Total stop words in text are {40}
Total words_count in text are {160}


In [44]:
percentage_of_stop_words = (stop_words_count/total_words_count)*100
percentage_of_stop_words

25.0

In [53]:
# Text Representation using Bag of n-Grams

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(ngram_range=(1,3))
v.fit(["India has launched chandrayaan3 successfully"])
v.vocabulary_

{'india': 5,
 'has': 2,
 'launched': 8,
 'chandrayaan3': 0,
 'successfully': 11,
 'india has': 6,
 'has launched': 3,
 'launched chandrayaan3': 9,
 'chandrayaan3 successfully': 1,
 'india has launched': 7,
 'has launched chandrayaan3': 4,
 'launched chandrayaan3 successfully': 10}

In [54]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [55]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return the lemmas of ``text`` with stop words and punctuation removed.

    The text is tokenised with the module-level ``nlp`` pipeline; the lemma of
    each remaining token is joined with single spaces.
    """
    lemmas = [tok.lemma_ for tok in nlp(text) if not (tok.is_stop or tok.is_punct)]
    return " ".join(lemmas)

In [56]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [58]:
corpus_processed = [preprocess(text) for text in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [62]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [65]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [70]:
v.transform(["Hulk eat pizza "]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [78]:
# Text representation using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

v = TfidfVectorizer()
tranfformed_output = v.fit_transform(corpus)
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [83]:
all_feature_names = v.get_feature_names_out()

# Feature names come back ordered by vocabulary index, which is also the order
# of v.idf_, so the two can be walked in parallel.
for word, idf_score in zip(all_feature_names, v.idf_):
    print(f"{word} {idf_score}")

already 2.386294361119891
am 2.386294361119891
amazon 2.386294361119891
and 2.386294361119891
announcing 1.2876820724517808
apple 2.386294361119891
are 2.386294361119891
ate 2.386294361119891
biryani 2.386294361119891
dot 2.386294361119891
eating 1.9808292530117262
eco 2.386294361119891
google 2.386294361119891
grapes 2.386294361119891
iphone 2.386294361119891
ironman 2.386294361119891
is 1.1335313926245225
loki 2.386294361119891
microsoft 2.386294361119891
model 2.386294361119891
new 1.2876820724517808
pixel 2.386294361119891
pizza 2.386294361119891
surface 2.386294361119891
tesla 2.386294361119891
thor 2.386294361119891
tomorrow 1.2876820724517808
you 2.386294361119891


In [85]:
tranfformed_output.toarray()[:2]

array([[0.24266547, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24266547, 0.        , 0.        ,
        0.40286636, 0.        , 0.        , 0.        , 0.        ,
        0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
        0.        , 0.        , 0.72799642, 0.        , 0.        ,
        0.24266547, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.5680354 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5680354 ,
        0.        , 0.26982671, 0.        , 0.        , 0.        ,
        0.30652086, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30652086, 0.        ]])

In [89]:
# Text Representation using Word Embeddings

import spacy

nlp = spacy.load("en_core_web_lg")

In [93]:
doc = nlp("Dog cat eat banana asjdk")

for token in doc:
    print(token.text, "Vector:", token.has_vector, "OOV:", token.is_oov)

Dog Vector: True OOV: False
cat Vector: True OOV: False
eat Vector: True OOV: False
banana Vector: True OOV: False
asjdk Vector: False OOV: True


In [94]:
doc[0].vector.shape

(300,)

In [102]:
# Reference text that the other words are compared against in the next cell.
base_token = nlp("bread")
# Show the vector shape of the object just created, not of the `doc` left over
# from the previous cell.
base_token.vector.shape

(300,)

In [103]:
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    print(f"{token.text} <-> {base_token.text} :", token.similarity(base_token))

bread <-> bread : 1.0
sandwich <-> bread : 0.6341067010130894
burger <-> bread : 0.47520687769584247
car <-> bread : 0.06451532596945217
tiger <-> bread : 0.04764611272488976
human <-> bread : 0.2151154210812192
wheat <-> bread : 0.615036141030184


In [104]:
def printsimilarity(base_word, words_to_compare):
    """Print the similarity of each token in ``words_to_compare`` to ``base_word``.

    Both arguments are strings processed with the module-level ``nlp`` pipeline.
    One line is printed per token of ``words_to_compare``; nothing is returned.
    """
    reference = nlp(base_word)
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text} :", score)

In [105]:
printsimilarity("iphone", "samsung apple iphone car human cat")

samsung <-> iphone : 0.6708590303423401
apple <-> iphone : 0.4387907748060368
iphone <-> iphone : 1.0
car <-> iphone : 0.25586206064273004
human <-> iphone : 0.009960269592946424
cat <-> iphone : 0.11430552710784589


In [106]:
# Word vectors in Gensim

import gensim.downloader as api

In [107]:
wv = api.load("word2vec-google-news-300")

In [113]:
wv.similarity(w1="great", w2="good")

0.72915095

In [115]:
wv.most_similar("good")

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836092472076416),
 ('excellent', 0.6442928910255432),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728850364685),
 ('solid', 0.5806034207344055),
 ('lousy', 0.5764203071594238)]

In [116]:
wv.most_similar(positive=["king","woman"], negative=["man"])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]