# Lab 7.2. Advanced Text Processing

## This lab will cover:


1. Use of spaCy


## 0. Install required packages

In [None]:
!pip install -U spacy

In [None]:
!python -m spacy download en_core_web_sm

In [44]:
!pip install spacy-langdetect

Collecting spacy-langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting pytest
  Downloading pytest-6.0.2-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 3.4 MB/s eta 0:00:01
[?25hCollecting langdetect==1.0.7
  Downloading langdetect-1.0.7.zip (998 kB)
[K     |████████████████████████████████| 998 kB 9.8 MB/s eta 0:00:01
Collecting toml
  Downloading toml-0.10.1-py2.py3-none-any.whl (19 kB)
Collecting iniconfig
  Downloading iniconfig-1.0.1-py3-none-any.whl (4.2 kB)
Collecting py>=1.8.2
  Downloading py-1.9.0-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 8.8 MB/s  eta 0:00:01
[?25hCollecting pluggy<1.0,>=0.12
  Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting more-itertools>=4.0.0
  Downloading more_itertools-8.5.0-py3-none-any.whl (44 kB)
[K     |████████████████████████████████| 44 kB 3.9 MB/s  eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel f

In [51]:
!pip install spacy-readability

Collecting spacy-readability
  Downloading spacy_readability-1.4.1-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 1.8 MB/s eta 0:00:01
[?25hCollecting syllapy<1,>=0
  Downloading syllapy-0.7.1-py3-none-any.whl (27 kB)
Collecting ujson<2.0,>=1.35
  Downloading ujson-1.35.tar.gz (192 kB)
[K     |████████████████████████████████| 192 kB 5.2 MB/s eta 0:00:01
Building wheels for collected packages: ujson
  Building wheel for ujson (setup.py) ... [?25ldone
[?25h  Created wheel for ujson: filename=ujson-1.35-cp37-cp37m-linux_x86_64.whl size=69583 sha256=6f55a903e3c966632e3a6fed109fa625d07721a10304677eff6bcbc924e1396f
  Stored in directory: /home/jovyan/.cache/pip/wheels/55/e8/7e/e36b183f3e654b73fc04eb1b656ad3c2773077dd531cb35c4d
Successfully built ujson
Installing collected packages: ujson, syllapy, spacy-readability
Successfully installed spacy-readability-1.4.1 syllapy-0.7.1 ujson-1.35


## 1. Let's import spaCy and load the English language model

In [53]:
import spacy
from spacy import displacy
from spacy_readability import Readability


In [54]:
# Load the small English pipeline. `nlp` is the callable every later cell uses
# to turn raw text into a processed Doc.
nlp = spacy.load("en_core_web_sm")

## 2. Let's do some text processing using spaCy

In [23]:
# Run the pipeline on a sample sentence; the resulting Doc is inspected
# token by token in the cells below.
doc = nlp("This is a really long sentence with many words and arguments.")

In [24]:
# The string the Doc was built from
doc.text

'This is a really long sentence with many words and arguments.'

In [25]:
# A Doc is indexable: positions 0 and 1 are its first two tokens
first_token, second_token = doc[0], doc[1]

In [26]:
# Display the token taken from position 0
first_token

This

In [27]:
# Display the token taken from position 1
second_token

is

In [28]:
# Slicing works like a list: this keeps the tokens at positions 5, 6 and 7
some_tokens = doc[5:8]

In [29]:
# Display the sliced tokens
some_tokens

sentence with many

## 3. Let's extract some linguistic entities

In [30]:
# Sample headline used for the tagging, parsing and entity examples
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the pipeline once; the following cells all read from this Doc
AppleText = nlp(text)

In [31]:
# Token at position 4 of the headline
AppleText[4]

Apple

In [32]:
# The token's text as a plain Python string
AppleText[4].text

'Apple'

In [33]:
# Predicted part-of-speech tag of the token
AppleText[4].pos_

'PROPN'

In [34]:
# Predicted dependency label of the token
AppleText[4].dep_

'nsubj'

In [35]:
# One row per token: text, part-of-speech tag and dependency label,
# left-aligned in fixed-width columns so the rows line up as a table.
for tok in AppleText:
    print(f"{tok.text:<12}{tok.pos_:<10}{tok.dep_:<10}")

It          PRON      nsubj     
’s          VERB      ccomp     
official    ADJ       dobj      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [36]:
# Draw the dependency parse of the headline
displacy.render(AppleText, style="dep")

## 4. Named Entity Recognition

In [37]:
# Iterate over the predicted entities
for ent in AppleText.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [38]:
# Highlight the predicted named entities inside the headline text
displacy.render(AppleText, style="ent")

## 5. Let's compute text similarity

In [41]:
# Two sentences on the same topic: expect a comparatively high score.
# NOTE(review): the recorded output carries a truncated warning; with the small
# model this is presumably spaCy's notice that no word vectors are loaded, in
# which case the score is only a rough estimate — confirm, and consider a model
# that ships with vectors.
apple_doc = nlp("Apple manufactures mobile phones")
samsung_doc = nlp("Samsung also manufactures mobile phones")
apple_doc.similarity(samsung_doc)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7463466493989511

In [42]:
# Two sentences on unrelated topics: expect a lower score than for the
# phone-maker pair.
# NOTE(review): the recorded output carries a truncated warning; with the small
# model this is presumably spaCy's notice that no word vectors are loaded, in
# which case the score is only a rough estimate — confirm.
phones_doc = nlp("Apple manufactures mobile phones")
kangaroo_doc = nlp("This is a text about kangaroos")
phones_doc.similarity(kangaroo_doc)

  This is separate from the ipykernel package so we can avoid doing imports until


0.4482861727480409

## 6. Let's do language detection

In [47]:
from spacy_langdetect import LanguageDetector

# Register the detector only once: adding a pipe whose name is already in the
# pipeline raises an error, which would break this cell whenever it is re-run.
if 'language_detector' not in nlp.pipe_names:
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

text = 'This is an english text.'
doc = nlp(text)
# Document-level detection: one overall guess for the whole text
print(doc._.language)
# Sentence-level detection: one guess per sentence
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.999996849812629}
This is an english text. {'language': 'en', 'score': 0.9999984805872498}


In [48]:
# Mixed-language input: the second sentence switches to Spanish
text = 'This is an english text. Also some texto en castellano '
doc = nlp(text)
# Overall guess for the whole document first ...
print(doc._.language)
# ... then a separate guess for each sentence
for sentence in doc.sents:
    print(sentence, sentence._.language)

{'language': 'en', 'score': 0.9999977116350056}
This is an english text. {'language': 'en', 'score': 0.9999975603692192}
Also some texto en castellano {'language': 'es', 'score': 0.5714270296109615}


In [49]:
# Input written entirely in Spanish
text = 'Hola, Cómo estás?'
doc = nlp(text)
# Overall guess for the whole document first ...
print(doc._.language)
# ... then a separate guess for each sentence
for sentence in doc.sents:
    print(sentence, sentence._.language)

{'language': 'es', 'score': 0.9999971454663774}
Hola, Cómo estás? {'language': 'es', 'score': 0.9999987453443455}


## 7. Let's compute the readability of the text

In [57]:

# Register the readability component only once: adding a pipe whose name is
# already in the pipeline raises an error, which would break this cell whenever
# it is re-run. The name is passed explicitly so the guard can rely on it.
if "readability" not in nlp.pipe_names:
    nlp.add_pipe(Readability(), name="readability", last=True)

doc = nlp("I am some really difficult text to read because I use obnoxiously large words.")

# The scores are read from custom attributes under `doc._`.
# NOTE(review): smog and forcast print 0 in the recorded run — presumably the
# sample is too short for those two formulas; confirm against the
# spacy-readability documentation.
print(doc._.flesch_kincaid_grade_level)
print(doc._.flesch_kincaid_reading_ease)
print(doc._.dale_chall)
print(doc._.smog)
print(doc._.coleman_liau_index)
print(doc._.automated_readability_index)
print(doc._.forcast)

8.412857142857145
59.68214285714288
7.714471428571429
0
8.96571428571428
7.1014285714285705
0


In [58]:
# A short, simple sentence for comparison with the harder example
doc = nlp("Birds are animals than can fly.")

# Print each score on its own line, in a fixed order
for metric in (
    "flesch_kincaid_grade_level",
    "flesch_kincaid_reading_ease",
    "dale_chall",
    "smog",
    "coleman_liau_index",
    "automated_readability_index",
    "forcast",
):
    print(getattr(doc._, metric))

2.4833333333333343
87.94500000000002
0.2976
0
3.7666666666666657
1.1950000000000003
0
