## Using nltk

### Import required modules

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

### Enter the paragraph

In [2]:
doc = """According to a media report, Mumbai has the highest density of cars in India. Pune is in second place. 
         The density of private cars in Mumbai has gone up by 18% in 2 years. There are 510 cars per km of road as opposed to 
         430 cars per km in 2016. This is almost five times that of Delhi (108 cars per km). Despite having fewer cars than 
         Delhi, Mumbai is more congested due to low road space. Mumbai has 2,000 km of roads compared to the national capital, 
         which has 28,000 km of roadways. There are 10.2 lakh private cars in Mumbai. That is 28% of the total number of 
         vehicles in the city, which stands at 36 lakh. According to RTO officials, the western suburbs have the highest number 
         of registered cars (5 lakh). There are 3.3 lakh private cars in the island city and 1.7 lakh in the eastern suburbs.
         Pune has 359 cars per km and Kolkata is the third most congested city with 319 cars per km. Chennai comes in fourth 
         with 297 cars per km followed by Bangalore with 149 cars per km."""

### Tokenize the paragraph

In [3]:
word_tokens = nltk.word_tokenize(doc)
print(word_tokens)

['According', 'to', 'a', 'media', 'report', ',', 'Mumbai', 'has', 'the', 'highest', 'density', 'of', 'cars', 'in', 'India', '.', 'Pune', 'is', 'in', 'second', 'place', '.', 'The', 'density', 'of', 'private', 'cars', 'in', 'Mumbai', 'has', 'gone', 'up', 'by', '18', '%', 'in', '2', 'years', '.', 'There', 'are', '510', 'cars', 'per', 'km', 'of', 'road', 'as', 'opposed', 'to', '430', 'cars', 'per', 'km', 'in', '2016', '.', 'This', 'is', 'almost', 'five', 'times', 'that', 'of', 'Delhi', '(', '108', 'cars', 'per', 'km', ')', '.', 'Despite', 'having', 'fewer', 'cars', 'than', 'Delhi', ',', 'Mumbai', 'is', 'more', 'congested', 'due', 'to', 'low', 'road', 'space', '.', 'Mumbai', 'has', '2,000', 'km', 'of', 'roads', 'compared', 'to', 'the', 'national', 'capital', ',', 'which', 'has', '28,000', 'km', 'of', 'roadways', '.', 'There', 'are', '10.2', 'lakh', 'private', 'cars', 'in', 'Mumbai', '.', 'That', 'is', '28', '%', 'of', 'the', 'total', 'number', 'of', 'vehicles', 'in', 'the', 'city', ',', 'wh

In [4]:
 type(word_tokens)

list

### Tag Parts Of Speech (POS)

In [5]:
postags = nltk.pos_tag(word_tokens)
print(postags)

[('According', 'VBG'), ('to', 'TO'), ('a', 'DT'), ('media', 'NNS'), ('report', 'NN'), (',', ','), ('Mumbai', 'NNP'), ('has', 'VBZ'), ('the', 'DT'), ('highest', 'JJS'), ('density', 'NN'), ('of', 'IN'), ('cars', 'NNS'), ('in', 'IN'), ('India', 'NNP'), ('.', '.'), ('Pune', 'NNP'), ('is', 'VBZ'), ('in', 'IN'), ('second', 'JJ'), ('place', 'NN'), ('.', '.'), ('The', 'DT'), ('density', 'NN'), ('of', 'IN'), ('private', 'JJ'), ('cars', 'NNS'), ('in', 'IN'), ('Mumbai', 'NNP'), ('has', 'VBZ'), ('gone', 'VBN'), ('up', 'RP'), ('by', 'IN'), ('18', 'CD'), ('%', 'NN'), ('in', 'IN'), ('2', 'CD'), ('years', 'NNS'), ('.', '.'), ('There', 'EX'), ('are', 'VBP'), ('510', 'CD'), ('cars', 'NNS'), ('per', 'IN'), ('km', 'NN'), ('of', 'IN'), ('road', 'NN'), ('as', 'IN'), ('opposed', 'VBN'), ('to', 'TO'), ('430', 'CD'), ('cars', 'NNS'), ('per', 'IN'), ('km', 'NN'), ('in', 'IN'), ('2016', 'CD'), ('.', '.'), ('This', 'DT'), ('is', 'VBZ'), ('almost', 'RB'), ('five', 'CD'), ('times', 'NNS'), ('that', 'IN'), ('of', 'I

### Find words that belong to Proper Noun (NNP)

In [6]:
nouns = ne_chunk(postags)
locations = []
for x in str(nouns).split('\n'):
    if '/NNP' in x:
        locations.append(x)
        print(str(x))

  (PERSON Mumbai/NNP)
  (GPE India/NNP)
  Pune/NNP
  (GPE Mumbai/NNP)
  (GPE Delhi/NNP)
  (PERSON Delhi/NNP)
  (PERSON Mumbai/NNP)
  (PERSON Mumbai/NNP)
  (GPE Mumbai/NNP)
  (ORGANIZATION RTO/NNP)
  (PERSON Pune/NNP)
  (PERSON Kolkata/NNP)
  (PERSON Chennai/NNP)
  (ORGANIZATION Bangalore/NNP)


### Clean the data

In [7]:
type(x)

str

In [13]:
import re 
string = """(PERSON Mumbai/NNP)
  (GPE India/NNP)
  Pune/NNP
  (GPE Mumbai/NNP)
  (GPE Delhi/NNP)
  (PERSON Delhi/NNP)
  (PERSON Mumbai/NNP)
  (PERSON Mumbai/NNP)
  (GPE Mumbai/NNP)
  (ORGANIZATION RTO/NNP)
  (PERSON Pune/NNP)
  (PERSON Kolkata/NNP)
  (PERSON Chennai/NNP)
  (ORGANIZATION Bangalore/NNP)"""
regex = r'\w+/NNP'            
match = re.findall(regex, string)  
print(match) 

['Mumbai/NNP', 'India/NNP', 'Pune/NNP', 'Mumbai/NNP', 'Delhi/NNP', 'Delhi/NNP', 'Mumbai/NNP', 'Mumbai/NNP', 'Mumbai/NNP', 'RTO/NNP', 'Pune/NNP', 'Kolkata/NNP', 'Chennai/NNP', 'Bangalore/NNP']


In [9]:
p = []
[p.append(x) for x in match if x not in p] 
print (str(p))

['Mumbai/NNP', 'India/NNP', 'Pune/NNP', 'Delhi/NNP', 'RTO/NNP', 'Kolkata/NNP', 'Chennai/NNP', 'Bangalore/NNP']


## Using spacy

In [10]:
import spacy
from spacy import displacy

In [11]:
if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    doc = nlp("""According to a media report, Mumbai has the highest density of cars in India. Pune is in second place. 
         The density of private cars in Mumbai has gone up by 18% in 2 years. There are 510 cars per km of road as opposed to 
         430 cars per km in 2016. This is almost five times that of Delhi (108 cars per km). Despite having fewer cars than 
         Delhi, Mumbai is more congested due to low road space. Mumbai has 2,000 km of roads compared to the national capital, 
         which has 28,000 km of roadways. There are 10.2 lakh private cars in Mumbai. That is 28% of the total number of 
         vehicles in the city, which stands at 36 lakh. According to RTO officials, the western suburbs have the highest number 
         of registered cars (5 lakh). There are 3.3 lakh private cars in the island city and 1.7 lakh in the eastern suburbs.
         Pune has 359 cars per km and Kolkata is the third most congested city with 319 cars per km. Chennai comes in fourth 
         with 297 cars per km followed by Bangalore with 149 cars per km.""")
    for entity in doc.ents:
        print(entity.text, entity.label_)

Mumbai GPE
India GPE
second ORDINAL
Mumbai GPE
18% PERCENT
2 years DATE
510 CARDINAL
430 CARDINAL
2016 DATE
almost five CARDINAL
Delhi GPE
108 CARDINAL
Delhi GPE
Mumbai GPE
Mumbai GPE
2,000 km QUANTITY
28,000 km QUANTITY
10.2 CARDINAL
Mumbai GPE
28% PERCENT
36 CARDINAL
RTO ORG
5 CARDINAL
3.3 CARDINAL
1.7 CARDINAL
Pune GPE
359 CARDINAL
Kolkata PERSON
third ORDINAL
319 CARDINAL
fourth ORDINAL
297 CARDINAL
Bangalore GPE
149 CARDINAL


In [12]:
displacy.serve(doc, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Using geotext

In [13]:
from geotext import GeoText
places = GeoText("""According to a media report, Mumbai has the highest density of cars in India. Pune is in second place. 
         The density of private cars in Mumbai has gone up by 18% in 2 years. There are 510 cars per km of road as opposed to 
         430 cars per km in 2016. This is almost five times that of Delhi (108 cars per km). Despite having fewer cars than 
         Delhi, Mumbai is more congested due to low road space. Mumbai has 2,000 km of roads compared to the national capital, 
         which has 28,000 km of roadways. There are 10.2 lakh private cars in Mumbai. That is 28% of the total number of 
         vehicles in the city, which stands at 36 lakh. According to RTO officials, the western suburbs have the highest number 
         of registered cars (5 lakh). There are 3.3 lakh private cars in the island city and 1.7 lakh in the eastern suburbs.
         Pune has 359 cars per km and Kolkata is the third most congested city with 319 cars per km. Chennai comes in fourth 
         with 297 cars per km followed by Bangalore with 149 cars per km.""")
cities = places.cities
countries = places.countries

In [14]:
cities, countries

(['Mumbai',
  'Pune',
  'Mumbai',
  'Delhi',
  'Delhi',
  'Mumbai',
  'Mumbai',
  'Mumbai',
  'Pune',
  'Kolkata',
  'Chennai',
  'Bangalore'],
 ['India'])

#### Removing duplicate words and getting final lists of location

In [15]:
loc = [] 
[loc.append(x) for x in cities if x not in loc] 
print (str(countries) + str(loc))

['India']['Mumbai', 'Pune', 'Delhi', 'Kolkata', 'Chennai', 'Bangalore']
