# Lab 8. Advanced Text Processing

### This lab will cover:


1. Use of spaCy


### Install required packages

In [1]:
!pip install -U spacy==2.2.3



In [2]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.5 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
!pip install spacy_langdetect



In [4]:
!pip install spacy-readability



### Let's import spaCy and load the English language model

In [5]:
import spacy
from spacy import displacy
from spacy_readability import Readability
import spacy_langdetect
import en_core_web_sm


In [6]:
# Confirm which spaCy version the kernel actually imported.
print(spacy.__version__)

2.2.3


In [7]:
# Build the English pipeline from the model package imported earlier.
nlp = en_core_web_sm.load()

### 8.1. Let's do some text processing using spacy

In [8]:
# Run the pipeline on a sample sentence; the processed result is kept in `doc`.
doc = nlp("This is a really long sentence with many words and arguments.")

In [9]:
# The text held by the processed document.
doc.text

'This is a really long sentence with many words and arguments.'

In [10]:
# A processed document can be indexed like a sequence to get single tokens.
first_token, second_token = doc[0], doc[1]

In [11]:
# Display the token taken from index 0.
first_token

This

In [12]:
# Display the token taken from index 1.
second_token

is

In [13]:
# Slice the document: tokens at indices 5, 6 and 7 (the end index is exclusive).
some_tokens=doc[5:8]

In [14]:
# Display the sliced tokens.
some_tokens

sentence with many

### 8.2. Let's extract some linguistic entities

In [15]:
# Sample headline used by the tagging, parsing and entity cells that follow.
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the pipeline on the headline; `AppleText` holds the processed result.
AppleText = nlp(text)

In [16]:
# Look at the token at index 4 of the processed headline.
AppleText[4]

Apple

In [17]:
# The same token as a plain Python string.
AppleText[4].text

'Apple'

In [18]:
# Part-of-speech tag predicted for that token.
AppleText[4].pos_

'PROPN'

In [19]:
# Dependency label predicted for that token.
AppleText[4].dep_

'nsubj'

In [20]:
# Tabulate every token with its part-of-speech tag and dependency label.
for token in AppleText:
    # Left-align into fixed-width columns (12, 10 and 10 characters wide).
    print(f"{token.text:<12}{token.pos_:<10}{token.dep_:<10}")

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [21]:
# Visualise the dependency parse of the headline.
displacy.render(AppleText, style="dep")

### 8.3 Named Entity Recognition

In [22]:
# Iterate over the predicted entities
# List each named entity the model predicted, followed by its entity type.
for entity in AppleText.ents:
    print(entity.text, entity.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [23]:
# Visualise the predicted entities highlighted inline in the headline.
displacy.render(AppleText, style="ent")

In [24]:
# Longer sample text for entity recognition.
# NOTE(review): several sentences are joined without a space after the
# punctuation (e.g. "transition.Forty-four"); this looks like paragraphs pasted
# together and may affect tokenisation — confirm whether that is intended.
obamaSpeech="My fellow citizens:I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors. I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition.Forty-four Americans have now taken the presidential oath. The words have been spoken during rising tides of prosperity and the still waters of peace. Yet, every so often the oath is taken amidst gathering clouds and raging storms. At these moments, America has carried on not simply because of the skill or vision of those in high office, but because We the People have remained faithful to the ideals of our forbearers, and true to our founding documents.So it has been. So it must be with this generation of Americans.That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet."

In [25]:
# Run the pipeline over the speech excerpt.
obamaSpeechProcessed = nlp(obamaSpeech)

In [26]:
# Visualise the entities predicted in the speech excerpt.
displacy.render(obamaSpeechProcessed, style="ent")

### 8.4 Let's compute text similarity

In [27]:
# Score two closely related sentences against each other.
# NOTE(review): the truncated warning in this cell's recorded output is
# presumably spaCy cautioning that the small model has no word vectors loaded —
# confirm, and treat these scores as a rough indication only.
doc1 = nlp("Apple manufactures mobile phones")
doc2 = nlp("Samsung also manufactures mobile phones")
doc1.similarity(doc2)

  "__main__", mod_spec)


0.7999694922967239

In [28]:
# Score the same first sentence against one on an unrelated topic.
doc1 = nlp("Apple manufactures mobile phones")
doc3 = nlp("This is a text about kangaroos")
doc1.similarity(doc3)

  "__main__", mod_spec)


0.4047539563901164

In [29]:
# Score the first sentence against a paraphrase that swaps only the verb.
doc1 = nlp("Apple manufactures mobile phones")
doc4 = nlp("Apple produces mobile phones")
doc1.similarity(doc4)

  "__main__", mod_spec)


0.9259198889238178

### 8.5 Let's do language detection

In [30]:
from spacy_langdetect import LanguageDetector

# Register the detector as the last pipeline step. The guard keeps this cell
# safe to re-run: adding a component under a name that is already in the
# pipeline raises an error.
if 'language_detector' not in nlp.pipe_names:
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [31]:

text = 'This is an english text.'
doc = nlp(text)

# One overall language guess for the whole document.
print(doc._.language)

# A separate guess for every sentence.
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.9999980396541821}
This is an english text. {'language': 'en', 'score': 0.9999937210821516}


In [32]:
text = 'This is an english text. Also some texto en castellano '
doc = nlp(text)

# One overall language guess for the whole (mixed-language) document.
print(doc._.language)

# A separate guess for every sentence.
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.9999974430092486}
This is an english text. {'language': 'en', 'score': 0.9999985930259829}
Also some texto en castellano {'language': 'es', 'score': 0.8571387429286993}


In [33]:
text = 'Hola, Cómo estás?'
doc = nlp(text)

# One overall language guess for the whole document.
print(doc._.language)

# A separate guess for every sentence.
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'es', 'score': 0.9999957887632133}
Hola, Cómo estás? {'language': 'es', 'score': 0.9999967477976015}


In [34]:
# NOTE(review): the recorded output labels this Chinese sentence as 'ko';
# detection on very short inputs appears unreliable — confirm before relying on it.
text = '我是西班牙人'
doc = nlp(text)

# One overall language guess for the whole document.
print(doc._.language)

# A separate guess for every sentence.
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'ko', 'score': 0.8571414769575563}
我是西班牙人 {'language': 'ko', 'score': 0.9999980584531791}


### 8.6 Let's compute the readability of the text

In [35]:

# Register the readability component once. The guard keeps this cell safe to
# re-run: adding a component under a name that is already in the pipeline
# raises an error. The name is passed explicitly so the guard and the
# registration always agree.
if "readability" not in nlp.pipe_names:
    nlp.add_pipe(Readability(), name="readability")

doc = nlp("I am some really difficult text to read because I use obnoxiously large words.")

# Print each readability score exposed on the document, one per line.
# NOTE(review): smog and forcast come out as 0 in the recorded output —
# presumably this text is too short for those formulas; confirm in the
# spacy-readability documentation.
print(doc._.flesch_kincaid_grade_level)
print(doc._.flesch_kincaid_reading_ease)
print(doc._.dale_chall)
print(doc._.smog)
print(doc._.coleman_liau_index)
print(doc._.automated_readability_index)
print(doc._.forcast)

8.412857142857145
59.68214285714288
7.714471428571429
0
8.96571428571428
7.1014285714285705
0


In [36]:
# NOTE(review): "than" looks like a typo for "that"; the string is left as-is
# so the recorded scores still correspond to this exact input.
doc = nlp("Birds are animals than can fly.")

# Print the same seven readability scores, in the same order, one per line.
for metric in (
    "flesch_kincaid_grade_level",
    "flesch_kincaid_reading_ease",
    "dale_chall",
    "smog",
    "coleman_liau_index",
    "automated_readability_index",
    "forcast",
):
    print(getattr(doc._, metric))

2.4833333333333343
87.94500000000002
0.2976
0
3.7666666666666657
1.1950000000000003
0


# Further References:

#### Free tutorials online

https://course.spacy.io/en/

http://spacy.pythonhumanities.com/intro.html
