# Introduction To NLP

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp("Tea is healthy and calming, don't you think?")

In [4]:
for token in doc:
    print(token)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


In [5]:
print(f"Token \t\tLemma \t\tStopword".format('Token', 'Lemma', 'Stopword'))
print("-"*50)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

Token 		Lemma 		Stopword
--------------------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calm		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


In [6]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr = "LOWER")

In [7]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel', 'OnePlus 8']
patterns = [nlp(text) for text in terms]
matcher.add('TerminologyList', patterns)

In [8]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3"
              "OnePlus 8 ")

matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


In [9]:
match_id, start, end = matches[3]
print(nlp.vocab.strings[match_id], text_doc[start:end])

TerminologyList Google Pixel


# Text classification with spacy

In [1]:
import pandas as pd
spam = pd.read_csv('./spam.csv', encoding='latin-1')
spam.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [11]:
spam = spam.rename(columns={'v1': 'label', 'v2': 'text'}, inplace= False)
spam.head(10)

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [12]:
spam = spam.dropna(axis=1)

In [13]:
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
nlpbow = spacy.blank("en")

In [15]:
textcat = nlpbow.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "bow"})

In [16]:
nlpbow.add_pipe(textcat)

In [17]:
textcat.add_label("ham")
textcat.add_label("spam")

1

In [18]:
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham', 'spam': label == 'spam'}} for label in spam['label']]

In [19]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [20]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [21]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'parser': 0.0, 'ner': 0.0, 'tagger': 513.5710656429874}
{'parser': 0.0, 'ner': 0.0, 'tagger': 859.0025583050738}
{'parser': 0.0, 'ner': 0.0, 'tagger': 1176.2175928199836}
{'parser': 0.0, 'ner': 0.0, 'tagger': 1510.19163517229}
{'parser': 0.0, 'ner': 0.0, 'tagger': 1866.3296843353053}
{'parser': 0.0, 'ner': 0.0, 'tagger': 2181.8595039984457}
{'parser': 0.0, 'ner': 0.0, 'tagger': 2453.119332459169}
{'parser': 0.0, 'ner': 0.0, 'tagger': 2702.3588100482807}
{'parser': 0.0, 'ner': 0.0, 'tagger': 3004.55781212769}
{'parser': 0.0, 'ner': 0.0, 'tagger': 3205.1112501060443}


In [26]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    p
# Use textcat to get the scores for each doc
tagger = nlp.get_pipe('tagger')
scores, _ = tagger.predict(docs)

print(scores)

[array([37, 23, 23, 37, 11, 23, 23,  5,  5,  5,  5,  5, 37, 42, 23, 35, 37,
       23], dtype=int64), array([23, 23, 15, 11, 22, 37, 23, 23, 22], dtype=int64)]
