## FOR GOOGLE COLAB USERS: Import spaCy

In [1]:
import spacy

## Load the English model, then convert text to lower case

In [2]:
try:
    nlp = spacy.load('en_core_web_sm')
except:
    # install english language model
    !python -m spacy download en
    nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample passage, written as two adjacent literals (one per sentence) that
# Python joins into a single string.
text = (
    "Vienna is the national capital, largest city, and one of nine states of Austria. "
    "Vienna is Austria's most populous city, with about 1.9 million inhabitants"
)

# str.lower() returns a lower-cased copy; `text` itself is left unchanged.
lower_text = text.lower()
print(lower_text)

vienna is the national capital, largest city, and one of nine states of austria. vienna is austria's most populous city, with about 1.9 million inhabitants


## Word Tokenize
Tokenize the text, i.e. break its sentences into individual words and punctuation tokens.

In [None]:
text = "Vienna is the national capital, largest city, and one of nine states of Austria. Vienna is Austria's most populous city, with about 1.9 million inhabitants"

# Run the pipeline, then collect the surface string of every token in order.
doc = nlp(text)
words = []
for token in doc:
    words.append(token.text)
print(words)

['Vienna', 'is', 'the', 'national', 'capital', ',', 'largest', 'city', ',', 'and', 'one', 'of', 'nine', 'states', 'of', 'Austria', '.', 'Vienna', 'is', 'Austria', "'s", 'most', 'populous', 'city', ',', 'with', 'about', '1.9', 'million', 'inhabitants']


## Sentence tokenize
Split the text into a list of sentences when it contains more than one sentence.

In [None]:
text = "Vienna is the national capital, largest city, and one of nine states of Austria. Vienna is Austria's most populous city, with about 1.9 million inhabitants"

# Parse the text once and keep `text` as a plain string; the sentence spans
# are read from the parsed Doc.
doc = nlp(text)

# doc.sents is a generator of sentence spans; list() materializes it so the
# notebook displays every sentence.
list(doc.sents)

[Vienna is the national capital, largest city, and one of nine states of Austria.,
 Vienna is Austria's most populous city, with about 1.9 million inhabitants]

## Stop-Word removal
Remove uninformative words such as "is", "the", and "a" (and punctuation) using spaCy's built-in stop-word flags, since they carry little information.

In [None]:
text = "Vienna is the national capital, largest city, and one of nine states of Austria. Vienna is Austria's most populous city, with about 1.9 million inhabitants"
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
words = []
for token in doc:
    if token.is_stop or token.is_punct:
        continue
    words.append(token.text)
print(words)

['Vienna', 'national', 'capital', 'largest', 'city', 'states', 'Austria', 'Vienna', 'Austria', 'populous', 'city', '1.9', 'million', 'inhabitants']


## Get word frequency
Count how often each word occurs using `collections.Counter`.


In [None]:
from collections import Counter

text = "Vienna is the national capital, largest city, and one of nine states of Austria. Vienna is Austria's most populous city, with about 1.9 million inhabitants"
doc = nlp(text)

# Drop stop words and punctuation before counting.
words = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct)
]

# most_common() with no argument returns every (word, count) pair,
# ordered from the most frequent to the least frequent.
word_freq = Counter(words)
common_words = word_freq.most_common()
print(common_words)

[('Vienna', 2), ('city', 2), ('Austria', 2), ('national', 1), ('capital', 1), ('largest', 1), ('states', 1), ('populous', 1), ('1.9', 1), ('million', 1), ('inhabitants', 1)]


## Part of Speech tags
Part-of-speech (POS) tags tell us the grammatical category of each word, e.g. whether it is a noun, an adjective, etc.

In [None]:
# Reuse the `nlp` pipeline loaded at the top of the notebook; loading the
# model a second time only costs time and memory.
text = "The dogs are barking outside."

doc = nlp(text)

# Print each token next to its coarse-grained part-of-speech tag.
for token in doc:
    print(token.text, token.pos_)

The DET
dogs NOUN
are AUX
barking VERB
outside ADV
. PUNCT


## NER (Named Entity Recognition)

| Label    | Description                                          |
|----------|------------------------------------------------------|
| ORG      | Companies, agencies, institutions.                   |
| GPE      | Geopolitical entity, i.e. countries, cities, states. |
| CARDINAL | Numerals                                             |

In [None]:
text = "Vienna is the national capital, largest city, and one of nine states of Austria. Vienna is Austria's most populous city, with about 1.9 million inhabitants"
doc = nlp(text)

# One line per detected entity: its text followed by its label.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Vienna GPE
one CARDINAL
nine CARDINAL
Austria GPE
Vienna GPE
Austria GPE
about 1.9 million CARDINAL


## Word Vector Representation

In [None]:
# Run the pipeline on a single word and read the vector of the resulting Doc.
city = nlp('Vienna')
city_vector = city.vector

# Show the shape of the vector first, then its values.
print(city_vector.shape)
print(city_vector)

(96,)
[-1.0961218  -0.22902218 -0.4975809   0.9610567   0.7078363  -0.06464957
  0.73181033  0.7445235  -1.2915039  -0.09090728  1.4697149   0.09945044
 -1.3506333  -0.13108441 -0.7924895   0.05922782 -0.6520083  -0.35899088
  1.2024351  -0.52779263 -1.159215    0.53939533 -0.6297082   0.14621311
  0.5931368   0.03357325  0.790095    1.5684465  -0.12552348  0.29643065
  0.02728534  0.15686297  0.8964345   1.0861708  -1.2775282  -1.2620009
  0.40376115  1.0572989   0.89938     1.5239228  -1.276994    0.15016714
 -0.30887002 -0.2136845  -0.39376312 -0.93562853 -1.3808439   1.8952878
  0.61209774 -0.47402984  0.4551257  -0.812488    0.03708351 -0.24509734
 -0.5069572  -0.9935806   1.3590736  -0.6163687   0.69572055  0.5491389
 -0.5353222  -0.9912694   0.37881336 -0.41703197  1.7358744  -0.02423835
  0.11495821 -0.94645905  0.63233984 -0.79578835  0.19647892  0.08197635
  1.4766746   0.03564269  0.7181915   0.0255273  -0.4215235  -0.5941889
 -0.82184887 -1.1261017  -0.02957836 -0.55367756 

## Conclusion
Voilà! Now you know the basics of NLP with spaCy.