In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag

In [2]:
import ner

In [3]:
# Sample paragraph fed to the sentence/word tokenizers in the cells below.
# Adjacent string literals (one sentence each) are concatenated by Python;
# every piece except the last keeps its trailing space.
text = (
    "Machine learning is the science of getting computers to act without being explicitly programmed. "
    "In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. "
    "Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. "
    "Many researchers also think it is the best way to make progress towards human-level AI. "
    "In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. "
    "More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. "
    "Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."
)

In [4]:
# Split the paragraph into a list of sentence strings.
sentences = sent_tokenize(text)

In [5]:
# Preview the first two sentences.
sentences[:2]

['Machine learning is the science of getting computers to act without being explicitly programmed.',
 'In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome.']

In [6]:
# Break the whole paragraph into a flat list of word-level tokens.
words = word_tokenize(text)

In [7]:
# Preview the first ten tokens.
words[:10]

['Machine',
 'learning',
 'is',
 'the',
 'science',
 'of',
 'getting',
 'computers',
 'to',
 'act']

In [8]:
# Tag every token with its part of speech; yields (token, tag) pairs.
postags = pos_tag(words)

In [9]:
# Print the Penn Treebank tag reference (tag, description, example words).
# NOTE(review): the full listing is long; the NLTK docs describe an optional
# tag-pattern argument for narrowing it to a few tags — confirm before use.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [10]:
# Preview the first twenty (token, tag) pairs.
postags[:20]  # NN = noun, JJ = adjective

[('Machine', 'NN'),
 ('learning', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('science', 'NN'),
 ('of', 'IN'),
 ('getting', 'VBG'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('act', 'VB'),
 ('without', 'IN'),
 ('being', 'VBG'),
 ('explicitly', 'RB'),
 ('programmed', 'VBN'),
 ('.', '.'),
 ('In', 'IN'),
 ('the', 'DT'),
 ('past', 'JJ'),
 ('decade', 'NN'),
 (',', ',')]

In [11]:
word1 = "Google Incorporation Limited is starting a new office at Hyderabad"

In [12]:
word1 = word_tokenize(word1)

In [16]:
pos = pos_tag(word1)

In [17]:
# Display the (token, tag) pairs for the example sentence.
pos

[('Google', 'NNP'),
 ('Incorporation', 'NNP'),
 ('Limited', 'NNP'),
 ('is', 'VBZ'),
 ('starting', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('office', 'NN'),
 ('at', 'IN'),
 ('Hyderabad', 'NNP')]

In [18]:
import ner

In [19]:
# Lower-cased example sentence mentioning a company, a country and a city.
# NOTE(review): this rebinds `text`, replacing the paragraph assigned earlier,
# and no later cell visible here reads it — confirm whether it is still needed.
text = " google chief is visiting india during the month of may, they are going to setup a new R&D facility at hyderabad"

In [20]:
# Show the kernel's current working directory.
%pwd

'C:\\Users\\HP\\Desktop\\New folder'

In [21]:
import os

In [24]:
# Switch the working directory so the relative 'sample.txt' opened in the
# next cell resolves against this folder.
# NOTE(review): hard-coded, machine-specific absolute path — the notebook
# will not run elsewhere without editing this line.
os.chdir('F:\\Library\\Analytics Path\\01-Python\\02-Datasets\\Text Mining NLTK')

In [25]:
import nltk

# Read the whole document into memory as one string.
with open('sample.txt', 'r') as sample_file:
    sample = sample_file.read()

# Pipeline: sentence strings -> token lists -> (token, tag) lists,
# one entry per sentence at every stage.
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
tagged_sentences = list(map(nltk.pos_tag, tokenized_sentences))

# Named-entity chunking: produces one chunk tree per tagged sentence.
# binary=True asks for a single entity label ('NE', the label the extraction
# code below checks for) rather than per-type labels — see the NLTK docs.
# NOTE(review): the result may be a one-shot iterator depending on the NLTK
# version, so it should be consumed only once — confirm.
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    """Collect the named-entity strings contained in a chunk tree.

    The tree is walked depth-first, left to right. A node whose ``label()``
    equals ``'NE'`` contributes one string: the first element of each of its
    children, joined with single spaces. Any other labelled node is searched
    recursively through its children.

    Args:
        t: A tree node that exposes a callable ``label`` and can be iterated
            over its children, or a leaf such as a ``(token, tag)`` tuple.

    Returns:
        A list of entity-name strings in the order they are encountered.
        Empty when ``t`` is a leaf or contains no ``'NE'`` node.
    """
    # Leaves have no callable ``label``; testing callability (rather than
    # mere attribute presence) also keeps objects that carry a plain
    # ``label`` attribute from blowing up on the call below.
    if not callable(getattr(t, 'label', None)):
        return []

    if t.label() == 'NE':
        return [' '.join(child[0] for child in t)]

    entity_names = []
    for child in t:
        entity_names.extend(extract_entity_names(child))
    return entity_names

# Gather the entity names from every chunked sentence, in document order.
entity_names = []
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

# Print unique entity names: dict.fromkeys drops repeated names while
# keeping the order in which each name was first seen.
print(list(dict.fromkeys(entity_names)))

['Satya Nadella', 'Microsoft Corporation Limited', 'Andhra Pradesh', 'Sundar Pichai', 'Google Incorporation Limited', 'Hyderabad', 'Mark Zukerberg', 'Facebook']


In [26]:
# Split the loaded sample document into sentences again for display.
sentences = nltk.sent_tokenize(sample)

In [28]:
# Echo every sentence of the sample document, one print call each.
for sentence in sentences:
    print(sentence)

Satya Nadella is the CEO of the Microsoft Corporation Limited.
He will be visiting
back-trodden place Anatapur in Andhra Pradesh to see what he 
can do.
Sundar Pichai is the CEO of Google Incorporation Limited.
He will visit Hyderabad to inaugurate new R&D facility.
Mark Zukerberg, who is the CEO of Facebook, developed the company on someone's idea.
