In [34]:
import spacy
from spacy.lang.en import English
from spacy import displacy
import pandas as pd

In [35]:
nlp = spacy.load('en_core_web_md')

In [37]:
test_sent = "Pakistan got independence in 1947. Karachi, Lahore and Islamabad are few of the major cities of Pakistan."

parsed_sent = nlp(test_sent)
spacy.displacy.render(parsed_sent, style='ent',jupyter=True)

In [38]:
df_token = pd.DataFrame()

for i, token in enumerate(parsed_sent):
    df_token.loc[i, 'text'] = token.text
    df_token.loc[i, 'lemma'] = token.lemma_,
    df_token.loc[i, 'pos'] = token.pos_
    df_token.loc[i, 'tag'] = token.tag_
    df_token.loc[i, 'dep'] = token.dep_
    #df_token.loc[i, 'shape'] = token.shape_
    #df_token.loc[i, 'is_alpha'] = token.is_alpha
    df_token.loc[i, 'is_stop'] = token.is_stop
    
print(df_token)

            text            lemma    pos  tag    dep is_stop
0       Pakistan         Pakistan  PROPN  NNP  nsubj   False
1            got           (get,)   VERB  VBD   ROOT   False
2   independence  (independence,)   NOUN   NN   dobj   False
3             in            (in,)    ADP   IN   prep    True
4           1947          (1947,)    NUM   CD   pobj   False
5              .             (.,)  PUNCT    .  punct   False
6        Karachi       (Karachi,)  PROPN  NNP  nsubj   False
7              ,             (,,)  PUNCT    ,  punct   False
8         Lahore        (Lahore,)  PROPN  NNP   conj   False
9            and           (and,)  CCONJ   CC     cc    True
10     Islamabad     (Islamabad,)  PROPN  NNP   conj   False
11           are            (be,)    AUX  VBP   ROOT    True
12           few           (few,)    ADJ   JJ   attr    True
13            of            (of,)    ADP   IN   prep    True
14           the           (the,)    DET   DT    det    True
15         major        

In [39]:
import nltk
from nltk.stem import PorterStemmer
porter = PorterStemmer()

In [40]:
print(porter.stem("stabilize"))
print(porter.stem("destabilize"))
print(porter.stem("football"))
print(porter.stem("studies"))
print(porter.stem("studying"))
print(porter.stem("beautiful"))
print(porter.stem("beauty"))

stabil
destabil
footbal
studi
studi
beauti
beauti


In [41]:
test_token = "stablize destablize football studies studying beautiful beauty"
parsed_sent = nlp(test_token)
for token in parsed_sent:
    print(token.text, token.lemma_)

stablize stablize
destablize destablize
football football
studies study
studying study
beautiful beautiful
beauty beauty


Textacy

In [42]:
import textacy

In [43]:
sentence = 'The author is writing a new book.'
patterns = [{"POS": "AUX"}, {"POS": "VERB"}]

In [44]:
doc = nlp(sentence)
lists = textacy.extract.token_matches(doc, patterns)
for list in lists:
    print(list.text)

is writing


In [45]:
sentence2 = "The talk will introduce reader about use cases of Natural Language Processing in \
            Fintech. It will make use of interesting examples along the way."
doc = nlp(sentence2)
lists = textacy.extract.token_matches(doc, patterns)
for list in lists:
    print(list.text)

will introduce
will make


In [46]:
for n in doc.noun_chunks:
    print(n)

The talk
reader
use
cases
Natural Language Processing
Fintech
It
use
interesting examples
the way


In [47]:
abc3 = textacy.extract.basics.entities(doc)
for a in abc3:
    print(a)

Natural Language Processing
Fintech


In [48]:
d1 = "My favorite dog is fluffy and tan"
d2 = "The dog is brown and cat is brown"
d3 = "My favorite hat is brown and coat is pink"

In [49]:
docs = [d1.lower(), d2.lower(), d3.lower()]
print(docs)

['my favorite dog is fluffy and tan', 'the dog is brown and cat is brown', 'my favorite hat is brown and coat is pink']


In [51]:
stopword = set("is and the my".split())
print(stopword)
s = []
for doc in docs: 
    s_list = [word for word in doc.split() if word not in stopword]
    print(s_list)
    str_ = ' '.join(s_list)   
    s.append(str_) 
print(s) 

{'my', 'the', 'is', 'and'}
['favorite', 'dog', 'fluffy', 'tan']
['dog', 'brown', 'cat', 'brown']
['favorite', 'hat', 'brown', 'coat', 'pink']
['favorite dog fluffy tan', 'dog brown cat brown', 'favorite hat brown coat pink']


In [52]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
cnt_vectorizer = CountVectorizer() #(binary=True)
cnt_vectorizer.fit(s)
print(cnt_vectorizer.vocabulary_)

{'favorite': 4, 'dog': 3, 'fluffy': 5, 'tan': 8, 'brown': 0, 'cat': 1, 'hat': 6, 'coat': 2, 'pink': 7}


In [55]:
print(cnt_vectorizer.get_feature_names_out())

['brown' 'cat' 'coat' 'dog' 'favorite' 'fluffy' 'hat' 'pink' 'tan']


In [57]:
vec1 = cnt_vectorizer.transform(s).toarray()
print(vec1.shape)
print(vec1)

(3, 9)
[[0 0 0 1 1 1 0 0 1]
 [2 1 0 1 0 0 0 0 0]
 [1 0 1 0 1 0 1 1 0]]


In [61]:
bigram_vectorizer = CountVectorizer(binary=True,ngram_range=(1, 2))
bigram_vectorizer.fit(s)
print(bigram_vectorizer.get_feature_names_out())

['brown' 'brown cat' 'brown coat' 'cat' 'cat brown' 'coat' 'coat pink'
 'dog' 'dog brown' 'dog fluffy' 'favorite' 'favorite dog' 'favorite hat'
 'fluffy' 'fluffy tan' 'hat' 'hat brown' 'pink' 'tan']


In [62]:
vec2 = bigram_vectorizer.transform(s).toarray()
print(vec2.shape)
print(vec2)

(3, 19)
[[0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 1]
 [1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 0]]


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
tfidf_vectorizer = TfidfVectorizer(stop_words=['is', 'and', 'the', 'my']) #,binary=True)
vec4 = tfidf_vectorizer.fit_transform(s).toarray()
print(vec4)
print(tfidf_vectorizer.get_feature_names_out())

[[0.         0.         0.         0.42804604 0.42804604 0.5628291
  0.         0.         0.5628291 ]
 [0.77100584 0.50689001 0.         0.38550292 0.         0.
  0.         0.         0.        ]
 [0.37302199 0.         0.49047908 0.         0.37302199 0.
  0.49047908 0.49047908 0.        ]]
['brown' 'cat' 'coat' 'dog' 'favorite' 'fluffy' 'hat' 'pink' 'tan']


In [66]:
# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun FAC
Recode ORG
earlier this week DATE


In [67]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
