In [1]:
# -U, --upgrade  
# !pip install -U spacy

In [2]:
# !pip install -U spacy-lookups-data

In [3]:
# -m mod : run library module as a script (terminates option list)
# !python -m spacy download en_core_web_sm

# Tokenization

In [4]:
import spacy

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
help(nlp)

Help on English in module spacy.lang.en object:

class English(spacy.language.Language)
 |  English(vocab: Union[spacy.vocab.Vocab, bool] = True, *, max_length: int = 1000000, meta: Dict[str, Any] = {}, create_tokenizer: Union[Callable[[ForwardRef('Language')], Callable[[str], spacy.tokens.doc.Doc]], NoneType] = None, batch_size: int = 1000, **kwargs) -> None
 |  
 |  A text-processing pipeline. Usually you'll load this once per process,
 |  and pass the instance around your application.
 |  
 |  Defaults (class): Settings, data and factory methods for creating the `nlp`
 |      object and processing pipeline.
 |  lang (str): IETF language code, such as 'en'.
 |  
 |  DOCS: https://spacy.io/api/language
 |  
 |  Method resolution order:
 |      English
 |      spacy.language.Language
 |      builtins.object
 |  
 |  Data and other attributes defined here:
 |  
 |  Defaults = <class 'spacy.lang.en.EnglishDefaults'>
 |      Language data defaults, available via Language.Defaults. Can be


In [7]:
text = "Apple is looking for buying. a U.K. startup for $1 billion"

In [8]:
doc = nlp(text)
doc

Apple is looking for buying. a U.K. startup for $1 billion

In [9]:
# Note how '$' and the sentence-final '.' come out as separate tokens,
# while the periods inside 'U.K.' stay part of a single token.

for i,token in enumerate(doc):
    print(i, token)

0 Apple
1 is
2 looking
3 for
4 buying
5 .
6 a
7 U.K.
8 startup
9 for
10 $
11 1
12 billion


## Part-of-Speech (POS) Tagging (Kelime Türü Etiketleme)


- Alphabetical listing

- ADJ: adjective
- ADP: adposition
- ADV: adverb
- AUX: auxiliary
- CCONJ: coordinating conjunction
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other

In [10]:
# Turkish display names for the coarse-grained POS tags found in `token.pos_`.
# Built once at definition time instead of on every call.
_POS_LABELS_TR = {
    "ADJ": "sıfat",
    "ADP": "adpozisyon",
    "ADV": "zarf",
    "AUX": "yardımcı",
    "CCONJ": "koordine edici bağlantı",
    "DET": "belirleyici",
    "INTJ": "ünlem",
    "NOUN": "isim",
    "NUM": "sayı",
    "PART": "parçacık",
    "PRON": "zamir",
    "PROPN": "özel isim",
    "PUNCT": "noktalama",
    "SCONJ": "bağımlı bağlaç",
    "SYM": "sembol",
    "VERB": "fiil",
    "X": "diğer",
    # Outside the 17 universal tags, but listed in spaCy's glossary;
    # SPACE is what whitespace-only tokens are tagged with.
    "SPACE": "boşluk",
    "EOL": "satır sonu",
}


def POS(pos):
    """Return the upper-cased Turkish name of a coarse-grained POS tag.

    Usage:

        for token in doc:
            print(POS(token.pos_))

    Args:
        pos: Tag string such as "NOUN" or "PROPN" (the value of `token.pos_`).

    Returns:
        The Turkish label in upper case, e.g. "ISIM" for "NOUN". A tag that
        has no translation here (including the empty string) is returned
        as-is, upper-cased, rather than raising KeyError.

    Note:
        `str.upper()` is locale-independent, so a dotted "i" becomes "I"
        and not the Turkish "İ" ("isim" -> "ISIM").
    """
    # Fall back to the tag itself so one unexpected tag does not abort a
    # whole printing loop.
    return _POS_LABELS_TR.get(pos, pos).upper()

In [11]:
# Coarse POS tag of every token (noun, verb, adjective, conjunction, etc.),
# printed next to its Turkish name from POS().

for token in doc:
    
    print(f"Text : {token.text:{15}}, POS : ({token.pos_:{7}}, {POS(token.pos_)})")

Text : Apple          , POS : (PROPN  , ÖZEL ISIM)
Text : is             , POS : (AUX    , YARDIMCI)
Text : looking        , POS : (VERB   , FIIL)
Text : for            , POS : (ADP    , ADPOZISYON)
Text : buying         , POS : (NOUN   , ISIM)
Text : .              , POS : (PUNCT  , NOKTALAMA)
Text : a              , POS : (DET    , BELIRLEYICI)
Text : U.K.           , POS : (PROPN  , ÖZEL ISIM)
Text : startup        , POS : (VERB   , FIIL)
Text : for            , POS : (ADP    , ADPOZISYON)
Text : $              , POS : (SYM    , SEMBOL)
Text : 1              , POS : (NUM    , SAYI)
Text : billion        , POS : (NUM    , SAYI)


# Visualization

In [12]:
doc

Apple is looking for buying. a U.K. startup for $1 billion

In [13]:
from spacy import displacy

In [14]:
displacy.render(doc, style = "ent")

In [15]:
# Dependency-parse view of `doc`; `options` carries the rendering settings.
displacy.render(
    doc,
    style="dep",
    options={"distance": 100, "color": "purple"},
)

## Named Entity Recognition NER

In [16]:
doc

Apple is looking for buying. a U.K. startup for $1 billion

In [17]:
doc.ents

(Apple, U.K., $1 billion)

In [18]:
# Print each named entity of `doc` next to its label.
for ent in doc.ents:

    print(f"Text : {ent.text:{15}}, NER : {ent.label_}")

Text : Apple          , NER : ORG
Text : U.K.           , NER : GPE
Text : $1 billion     , NER : MONEY


In [19]:
doc2 = nlp("Apple is looking for buying a UK startup for $1 billion in 2020")
doc2

Apple is looking for buying a UK startup for $1 billion in 2020

In [20]:
from spacy import displacy

displacy.render(doc2,style = "ent")

## Sentence Segmentation

In [21]:
text_1 = "Apple is looking for buying a UK startup. Government has given permission"
text_2 = "Apple is looking for buying a U.K. startup. Government has given permission"

In [22]:
import spacy

In [23]:
nlp = spacy.load("en_core_web_sm")

In [24]:
doc_1 = nlp(text_1)
doc_2 = nlp(text_2)

In [25]:
# Print the sentences detected in doc_1 (the "UK" spelling, no inner periods).
for sent in doc_1.sents:
    
    print(sent.text)

Apple is looking for buying a UK startup.
Government has given permission


In [26]:
# The sentences are split correctly here too: the periods inside "U.K."
# were not taken as sentence boundaries.

for sent in doc_2.sents:

    print(sent.text)

Apple is looking for buying a U.K. startup.
Government has given permission


## Rule-Based Matching (Token Patterns)

In [27]:
from spacy.matcher import Matcher
from spacy.tokens import Span
import spacy
from spacy import displacy

In [28]:
nlp = spacy.load("en_core_web_sm")

In [29]:
text = "Hello, world! hello world"

In [30]:
doc_3 = nlp(text)
doc_3

Hello, world! hello world

In [31]:
# Show every sentence found in doc_3, one per line.
for i in doc_3.sents:
    print(i.text)

Hello, world!
hello world


In [32]:
# Print text and label of every entity in doc_3; the recorded run shows
# no output, i.e. no entities were found for this text.
for i in doc_3.ents:
    print(i.text,i.label_)

In [33]:
# A list with one token pattern: "hello" (compared in lower case), then one
# punctuation token, then "world" (compared in lower case).
pattern = [[ {"LOWER" : "hello"}, {"IS_PUNCT" : True}, {"LOWER" : "world"} ]]

In [34]:
# The matcher shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Register the pattern list under the string key "hw".
matcher.add("hw",pattern)

In [35]:
# Each result is a (match_id, start, stop) tuple; start/stop are token
# indices into the doc, used below as the slice doc_3[start:stop].

matches = matcher(doc_3)
matches

[(17790654416186116455, 0, 3)]

In [36]:
# Report every match: numeric id, the string key it resolves to, the token
# range, and the matched text.
for match_id ,start, stop in matches:
    
    string_id = nlp.vocab.strings[match_id]  # look the hash up to get the key back
    span = doc_3[start:stop].text  # matched tokens as plain text
    print("Match Id : {}\nString Id : {}\nStart : {}\nStop : {}\nSpan : {}".format(match_id,
                                                                                   string_id,
                                                                                   start,
                                                                                   stop,
                                                                                   span))

Match Id : 17790654416186116455
String Id : hw
Start : 0
Stop : 3
Span : Hello, world


## Regular Expressions

In [37]:
text = "my phone number is 123. ohh its wrong one. correct one is 1234567890"

In [38]:
import re

In [39]:
re.search(r"\d",text)

<re.Match object; span=(19, 20), match='1'>

In [40]:
re.search(r"\d+",text)

<re.Match object; span=(19, 22), match='123'>

In [41]:
re.search(r"\d{10}",text)

<re.Match object; span=(58, 68), match='1234567890'>

In [42]:
re.search(r"\d{3}",text)

<re.Match object; span=(19, 22), match='123'>

In [43]:
# \w+ matches each run of word characters, so this prints the words and
# numbers of `text` one per line, without the periods and spaces.
for i in re.findall(r"\w+",text):
    print(i)

my
phone
number
is
123
ohh
its
wrong
one
correct
one
is
1234567890


In [44]:
re.findall(r"ph..",text),re.findall(r"ph...",text)


(['phon'], ['phone'])

In [45]:
re.findall(r"[\d]+", text)

['123', '1234567890']

In [46]:
re.findall(r"[\d]{3,10}",text)

['123', '1234567890']

In [47]:
re.findall(r"\d{3,10}",text)

['123', '1234567890']

In [48]:
re.findall(r"[143]+",text)

['1', '3', '1', '34']

In [49]:
# Letters vs. everything that is not a letter, a period or a space.
# The two ranges are written separately as A-Z and a-z: a single `A-z` range
# would also cover the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
re.findall(r"[A-Za-z]+", text), re.findall(r"[^A-Za-z. ]+", text)

(['my',
  'phone',
  'number',
  'is',
  'ohh',
  'its',
  'wrong',
  'one',
  'correct',
  'one',
  'is'],
 ['123', '1234567890'])

In [50]:
re.findall(r"c......",text)

['correct']

In [51]:
# Contrast with the previous cell: inside [...] a dot is a literal '.', and
# repeating it adds nothing, so this class behaves like [c.]+ and only
# matches runs of 'c' and '.' characters.
re.findall(r"[c.....]+",text)

['.', '.', 'c', 'c']

## Preprocessing Pipeline in Spacy

In [52]:
import spacy

from warnings import filterwarnings
filterwarnings("ignore")

In [53]:
texts = ["net income was $9.4 million compared to the prior year of 2.7$ million",
         "revenue exceeds twelve billion dollars with a loss of $1b."]

In [54]:
nlp = spacy.load("en_core_web_sm")

In [55]:
# Text ---> nlp pipe [tokenizer, tagger , parser , ner , ....] ---> Doc
# `disable` names the pipeline components to leave out for this call.
# NOTE(review): `docs` is reassigned in the next cell before it is ever
# iterated, so this assignment has no visible effect here.

docs = nlp.pipe(texts, disable = ["tagger","parser"])

In [56]:
%%time 

# Timed run with the tagger and parser disabled: print the entities of each
# text, with a blank line between texts.
docs = nlp.pipe(texts, disable = ["tagger","parser"])

# NOTE: the loop rebinds the notebook-level name `doc`.
for doc in docs:
    
    for ent in doc.ents:
        print(f"Text : {ent.text:{25}}, ENT : {ent.label_}")
    print()

Text : $9.4 million             , ENT : MONEY
Text : the prior year           , ENT : DATE
Text : 2.7$ million             , ENT : MONEY

Text : twelve billion dollars   , ENT : MONEY
Text : 1b                       , ENT : MONEY

CPU times: user 6.81 ms, sys: 717 µs, total: 7.53 ms
Wall time: 7.08 ms


In [57]:
%%time

# Timed run with the full pipeline (nothing disabled).
docs = nlp.pipe(texts)

for doc in docs:
    for ent in doc.ents:
        print(f"Text : {ent.text:{25}}, Ent : {ent.label_}")
        
    print()
    
# Compare the timings: the previous cell skipped some pipeline components,
# and with the full pipeline the time goes up here.
# NOTE(review): both runs are single measurements of a few milliseconds on
# two short texts, so the gap may partly be noise - confirm with %%timeit.

Text : $9.4 million             , Ent : MONEY
Text : the prior year           , Ent : DATE
Text : 2.7$ million             , Ent : MONEY

Text : twelve billion dollars   , Ent : MONEY
Text : 1b                       , Ent : MONEY

CPU times: user 8.35 ms, sys: 1.21 ms, total: 9.56 ms
Wall time: 8.77 ms


## Hashtags and Emoji Detection

In [58]:
import spacy
from spacy.matcher import Matcher
from spacy import displacy

In [59]:
nlp = spacy.load("en_core_web_sm")

In [60]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

In [61]:
# One single-token pattern per emoji, keyed on the token's exact text (ORTH).
pos = [
    [{"ORTH": symbol}]
    for symbol in pos_emoji
]
neg = [
    [{"ORTH": symbol}]
    for symbol in neg_emoji
]

In [62]:
hashtags = [[ {"TEXT" : "#"}, {"IS_ASCII" : True} ]]

In [63]:
# NOTE(review): here `matches` is the Matcher object itself, whereas earlier
# in the notebook the same name held the list of match tuples. Later cells
# call it as `matches(doc)`, so the name is kept.
matches = Matcher(nlp.vocab)

# Register the three pattern lists under their string keys.
matches.add("hashtags",hashtags)
matches.add("pos",pos)
matches.add("neg",neg)

In [64]:
doc = nlp("Hello guys 😀😂 😞 #kgptalkie")
doc

Hello guys 😀😂 😞 #kgptalkie

In [65]:
matches(doc)

[(12715761412317525803, 2, 3),
 (12715761412317525803, 3, 4),
 (425, 4, 5),
 (12116372008979164942, 5, 7)]

In [66]:
# Report every match in `doc`: numeric id, the string key it resolves to
# ("pos", "neg" or "hashtags"), the token range, and the matched span.
for match_id,start,stop in matches(doc):
    
    string_id = nlp.vocab.strings[match_id]  # look the hash up to get the key back
    span = doc[start:stop]  # matched tokens
    
    print("Match Id : {}\nString Id : {}\nStart : {}\nStop : {}\nSpan : {}".format(match_id,
                                                                                   string_id,
                                                                                   start,
                                                                                   stop,
                                                                                   span))
    print()
    
    

Match Id : 12715761412317525803
String Id : pos
Start : 2
Stop : 3
Span : 😀

Match Id : 12715761412317525803
String Id : pos
Start : 3
Stop : 4
Span : 😂

Match Id : 425
String Id : neg
Start : 4
Stop : 5
Span : 😞

Match Id : 12116372008979164942
String Id : hashtags
Start : 5
Stop : 7
Span : #kgptalkie



In [67]:
text = "Hello, world! hello world"

doc_text = nlp(text)


# 'OP': '?' makes the punctuation token optional (it may be present or
# absent), so both "Hello, world" and "hello world" can match.

pattern = [[{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]]

# NOTE: rebinds `matches` to a fresh Matcher holding only this pattern.
matches = Matcher(nlp.vocab)

matches.add("hw!",pattern)

In [68]:
# Report every match in `doc_text`: numeric id, the string key it resolves
# to, the token range, and the matched span, followed by a blank line.
for match_id,start,stop in matches(doc_text):
    
    string_id = nlp.vocab.strings[match_id]  # look the hash up to get the key back
    span = doc_text[start:stop]  # matched tokens
    
    print("Match Id : {}\nString Id : {}\nStart : {}\nStop : {}\nSpan : {}".format(match_id,
                                                                                   string_id,
                                                                                   start,
                                                                                   stop,
                                                                                   span))
    
    print()

Match Id : 15881058849434767646
String Id : hw!
Start : 0
Stop : 3
Span : Hello, world

Match Id : 15881058849434767646
String Id : hw!
Start : 4
Stop : 6
Span : hello world

