In [1]:
# importy
import nltk
import wikipedia
from collections import Counter
from string import punctuation

### POS tagging

In [2]:
text = None
with open('text.txt', 'r') as file:
    text = file.read()
tokens = nltk.word_tokenize(text)
pos = nltk.pos_tag(tokens)

count = Counter(pos)
sort_pos = sorted(count.items(), key=lambda count:count[1], reverse=True)
print('POS Top 20')
print(sort_pos[:20])
print()

nopunc = [token for token in tokens if token not in punctuation]
tagged = nltk.pos_tag(nopunc)

count = Counter(tagged)
sort_tagged = sorted(count.items(), key=lambda count:count[1], reverse=True)
print('POS Top 20 without punctuation')
print(sort_tagged[:20])

POS Top 20
[((',', ','), 753), (('.', '.'), 731), (('the', 'DT'), 684), (('and', 'CC'), 488), (('to', 'TO'), 435), (('of', 'IN'), 377), (('in', 'IN'), 254), (('a', 'DT'), 218), (('is', 'VBZ'), 203), (('this', 'DT'), 143), (('will', 'MD'), 138), (('we', 'PRP'), 137), (('for', 'IN'), 123), (('We', 'PRP'), 111), (('I', 'PRP'), 107), (('our', 'PRP$'), 94), (('be', 'VB'), 92), (('that', 'IN'), 91), (('are', 'VBP'), 90), (('have', 'VBP'), 86)]

POS Top 20 without punctuation
[(('the', 'DT'), 684), (('and', 'CC'), 488), (('to', 'TO'), 435), (('of', 'IN'), 377), (('in', 'IN'), 254), (('a', 'DT'), 218), (('is', 'VBZ'), 203), (('this', 'DT'), 143), (('will', 'MD'), 138), (('we', 'PRP'), 137), (('for', 'IN'), 123), (('We', 'PRP'), 111), (('I', 'PRP'), 107), (('our', 'PRP$'), 94), (('be', 'VB'), 92), (('that', 'IN'), 91), (('are', 'VBP'), 90), (('have', 'VBP'), 86), (('Russia', 'NNP'), 79), (('must', 'MD'), 78)]


### NER with entity classification (using nltk.ne_chunk)

In [3]:
text = None
with open('text.txt', 'r') as file:
    text = file.read()
    
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)

ne_chunked = nltk.ne_chunk(tagged)
ner = {}
for entity in ne_chunked:
    if isinstance(entity, nltk.tree.Tree):
        text = " ".join([word for word, tag in entity.leaves()])
        ent = entity.label()
        ner[text] = ent
    else:
        continue

sort_ner = sorted(ner.items(), key=lambda entity: entity[1][1], reverse=True)
print('NER Top 20')
print(sort_ner[:20])

NER Top 20
[('Yekaterinburg', 'GSP'), ('Syria', 'GSP'), ('Federation Council', 'ORGANIZATION'), ('State Duma', 'ORGANIZATION'), ('Address', 'ORGANIZATION'), ('GDP', 'ORGANIZATION'), ('Particular', 'ORGANIZATION'), ('Technical', 'ORGANIZATION'), ('Crimean Bridge', 'ORGANIZATION'), ('Azov', 'ORGANIZATION'), ('Arctic', 'ORGANIZATION'), ('Spatial Development Strategy', 'ORGANIZATION'), ('Extreme North', 'ORGANIZATION'), ('Popular', 'ORGANIZATION'), ('Medical', 'ORGANIZATION'), ('Volunteers', 'ORGANIZATION'), ('NPOs', 'ORGANIZATION'), ('Sciences', 'ORGANIZATION'), ('Council', 'ORGANIZATION'), ('Science', 'ORGANIZATION')]


### NER with custom patterns

In [4]:
text = None
with open('text.txt', 'r') as file:
    text = file.read()
    
text_pos = nltk.pos_tag(nltk.word_tokenize(text))
grammar = "NP: {<DT>?<JJ>*<NN|NNS>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(text_pos)
custom_ner = {}
for entity in result:
    if isinstance(entity, nltk.tree.Tree):
        text = " ".join([word for word, tag in entity.leaves()])
        ent = entity.label()
        custom_ner[text] = ent
    else:
        continue
sort_custom_ner = sorted(custom_ner.items(), key=lambda entity: entity[1][1], reverse=True)
print('Custom NER Top 20')
print(sort_custom_ner[:20])

Custom NER Top 20
[('Citizens', 'NP'), ('members', 'NP'), ('special landmark', 'NP'), ('event', 'NP'), ('the times', 'NP'), ('the choices', 'NP'), ('every step', 'NP'), ('the future', 'NP'), ('country', 'NP'), ('decades', 'NP'), ('points', 'NP'), ('time', 'NP'), ('ability', 'NP'), ('new territories', 'NP'), ('build cities', 'NP'), ('conquer', 'NP'), ('space', 'NP'), ('major discoveries', 'NP'), ('This unwavering forward-looking drive', 'NP'), ('traditions', 'NP')]


In [11]:
text = None
with open('text.txt', 'r') as file:
    text = file.read()
    
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entity = []
custom_ner = []
for tagged_entry in tagged:
    if(tagged_entry[1].startswith("NN") or (entity and tagged_entry[1].startswith("IN"))):
        entity.append(tagged_entry)
    else:
        if(entity) and entity[-1][1].startswith("IN"):
            entity.pop()
        if(entity and " ".join(e[0] for e in entity)[0].isupper()):
            custom_ner.append(" ".join(e[0] for e in entity))
        entity = []
print(custom_ner)
count = Counter(custom_ner)
sort_custom_ner = sorted(count.items(), key=lambda count:count[1], reverse=True)

['Citizens of Russia', 'Federation Council', 'State Duma', 'Today', 'Address', 'Russia', 'Today', 'Russia', 'Therefore', 'President', 'Colleagues', 'Poverty', 'Today', 'Government', 'People', 'Russia', 'GDP', 'Russia', 'Russia', 'Life expectancy levels', 'Today', 'Russia', 'Japan', 'France', 'Germany', 'Colleagues', 'Cities like Kazan', 'Vladivostok', 'Sochi', 'Change', 'Initiatives', 'Address', 'People', 'Today', 'Russia', 'Russiaís', 'Colleagues', 'Russia', 'Next', 'December', 'Technical Inventory Bureau', 'Colleagues', 'Russiaís', 'Crimean Bridge', 'Black Sea region', 'Europe ñ Asia-Pacific corridor', 'Kazakhstani partners', 'Baikal-Amur Mainline', 'Railway', 'Russia', 'Asia', 'Soviet Union', 'Soviet Union', 'Azov', 'Black Sea', 'Northern Sea Route', 'Russian Arctic', 'Far East', 'Northern Sea Route', 'Arctic', 'Russiaís interests', 'Russia', 'Spatial Development Strategy', 'Government', 'Russia', 'Likewise', 'Extreme North', 'Siberia', 'Russian Far East', 'Colleagues', 'A', 'GDP', 

### Custom entity classification

In [5]:
def wiki(name):
    try:
        page = wikipedia.page(name)
        summary = page.summary
    except:
        return ""
    return nltk.sent_tokenize(summary)[0]

def wikidescription(name):
    sent = wiki(name)
    if sent == "":
        return "a Thing"
    
    text_pos = nltk.pos_tag(nltk.word_tokenize(sent))
    
    grammar = "NP: {<DT>?<JJ>*<NN|NNS>}"
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(text_pos)
    data = {}
    for entity in result:
        if isinstance(entity, nltk.tree.Tree):
            text = " ".join([word for word, tag in entity.leaves()])
            ent = entity.label()
            data[text] = ent
        else:
            continue
    str = ""
    for data in data:
        str+=data
        if (str[-1] != ' '):
            str += ' '
    return str
    

### wikipedia-based classification using nltk entities as the input

In [6]:
c = 0
for i in sort_ner:
    print(i[0], ' - ', wikidescription(i[0]))
    c+= 1
    if c == 20:
        break

Yekaterinburg  -  Екатеринбу́рг [ jɪkətʲɪrʲɪnˈburk the fourth-largest city the administrative centre east the middle the Eurasian continent the Asian side the boundary 
Syria  -  الجمهورية السورية‎ al-Jumhūrīyah al-ʻArabīyah as-Sūrīyah a country the southwest the west the north the east the south 
Federation Council  -  Сове́т Федера́ции common abbreviation Совфед the upper house the parliament 
State Duma  -  Госуда́рственная ду́ма tr 
Address  -  An address a collection information format the location a building apartment other structure a plot land political boundaries street names references other identifiers house numbers 
GDP  -  domestic product a monetary measure the market value the final goods services a period time capita reflect differences the cost living the inflation rates the countries a basis power parity differences standards nations 
Particular  -  metaphysics particulars concrete spatiotemporal entities entities properties numbers 




  lis = BeautifulSoup(html).find_all('li')


Technical  -  a Thing
Crimean Bridge  -  Крымский мост tr 
Azov  -  Азов the stress the second syllable a town kilometers mi name the town 
Arctic  -  a polar region part 
Spatial Development Strategy  -  a document 
Extreme North  -  Крайний Север a large part enormous mineral natural resources 
Popular  -  a Thing
Medical  -  the science practice the diagnosis prognosis treatment prevention disease 
Volunteers  -  a Thing
NPOs  -  a Thing
Sciences  -  Science word scientia knowledge a systematic enterprise the form testable explanations predictions roots science 
Council  -  A council a group people decisions 
Science  -  Science word scientia knowledge a systematic enterprise the form testable explanations predictions roots science 


### wikipedia-based classification using custom patterns as the input

In [7]:
c = 0
for i in sort_custom_ner:
    print(i[0], ' - ', wikidescription(i[0]))
    c+= 1
    if c == 20:
        break

Citizens  -  Citizenship the status a person the custom law a legal member a sovereign state belonging a nation 
members  -  a Thing
special landmark  -  a private college disabilities attention disorders autism 
event  -  a Thing
the times  -  a British daily national newspaper 
the choices  -  decision making 
every step  -  Every Little Step American singer Edmonds 
the future  -  The future the time 
country  -  A country a region a distinct entity political geography 
decades  -  A decade a period years 
points  -  a Thing
time  -  the indefinite continued progress existence events irreversible succession the past the future 
ability  -  a Thing
new territories  -  新界 Cantonese Yale Sān'gaai main regions 
build cities  -  SimCity a city-building simulation mobile game 
conquer  -  a Thing
space  -  Space the boundless three-dimensional extent objects events relative position direction 
major discoveries  -  The timeline the date publication possible major scientific theories disco