## Text Pre-processing. POS Tagging. Sentence parsing

### Task1 

In [1]:
import nltk

text = '''London is considered to be one of the world's most important global cities 
and has been termed the world's most powerful, most desirable, most influential, most visited, most expensive, 
innovative, sustainable, most investment friendly, most popular for work, and the most vegetarian friendly city in the world. 
London exerts a considerable impact upon the arts, commerce, education, entertainment, fashion, finance, healthcare, media, 
professional services, research and development, tourism and transportation. London ranks 26 out of 300 major cities for economic  
performance.'''

#### Разбиение текста на предложения

In [2]:
simple_sentences = nltk.tokenize.sent_tokenize(text)

#### Разбиение текста на слова

In [3]:
sentences = [nltk.tokenize.word_tokenize(sentence) for sentence in simple_sentences]

#### Stemmer Porter (Snowball)

In [4]:
stemmer = nltk.stem.SnowballStemmer(language="english")
print([stemmer.stem(word) for word in sentences[0]])

['london', 'is', 'consid', 'to', 'be', 'one', 'of', 'the', 'world', "'s", 'most', 'import', 'global', 'citi', 'and', 'has', 'been', 'term', 'the', 'world', "'s", 'most', 'power', ',', 'most', 'desir', ',', 'most', 'influenti', ',', 'most', 'visit', ',', 'most', 'expens', ',', 'innov', ',', 'sustain', ',', 'most', 'invest', 'friend', ',', 'most', 'popular', 'for', 'work', ',', 'and', 'the', 'most', 'vegetarian', 'friend', 'citi', 'in', 'the', 'world', '.']


#### Lemmatizer WordNet 

In [5]:
# nltk.download("wordnet") - thesaurus, ontology
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t) for t in sentences[0]])

['London', 'is', 'considered', 'to', 'be', 'one', 'of', 'the', 'world', "'s", 'most', 'important', 'global', 'city', 'and', 'ha', 'been', 'termed', 'the', 'world', "'s", 'most', 'powerful', ',', 'most', 'desirable', ',', 'most', 'influential', ',', 'most', 'visited', ',', 'most', 'expensive', ',', 'innovative', ',', 'sustainable', ',', 'most', 'investment', 'friendly', ',', 'most', 'popular', 'for', 'work', ',', 'and', 'the', 'most', 'vegetarian', 'friendly', 'city', 'in', 'the', 'world', '.']


#### Stemmer Lancaster

In [6]:
tokens = text.split(' ')
lancaster = nltk.stem.LancasterStemmer() # nltk.LancasterStemmer()
print([lancaster.stem(t) for t in tokens])

['london', 'is', 'consid', 'to', 'be', 'on', 'of', 'the', "world's", 'most', 'import', 'glob', 'city', '\nand', 'has', 'been', 'term', 'the', "world's", 'most', 'powerful,', 'most', 'desirable,', 'most', 'influential,', 'most', 'visited,', 'most', 'expensive,', '\ninnovative,', 'sustainable,', 'most', 'invest', 'friendly,', 'most', 'popul', 'for', 'work,', 'and', 'the', 'most', 'veget', 'friend', 'city', 'in', 'the', 'world.', '\nlondon', 'exert', 'a', 'consid', 'impact', 'upon', 'the', 'arts,', 'commerce,', 'education,', 'entertainment,', 'fashion,', 'finance,', 'healthcare,', 'media,', '\nprofessional', 'services,', 'research', 'and', 'development,', 'tour', 'and', 'transportation.', 'london', 'rank', '26', 'out', 'of', '300', 'maj', 'city', 'for', 'econom', '', '\nperformance.']


### Task2 

In [7]:
import nltk
from nltk.corpus import stopwords, reuters

def content_fraction(text):
    stopwords_en = stopwords.words("english")
    content = [w for w in text if w.lower() not in stopwords_en]
    return len(content) / len(text)

# nltk.download("reuters")
print(content_fraction(reuters.words()))

0.735240435097661


### Task3 

In [8]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
tagged_sentence = nltk.pos_tag(sentences[0])
print(tagged_sentence)

tree = nltk.ne_chunk(tagged_sentence)
tree.draw()

[('London', 'NNP'), ('is', 'VBZ'), ('considered', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ("'s", 'VBZ'), ('most', 'RBS'), ('important', 'JJ'), ('global', 'JJ'), ('cities', 'NNS'), ('and', 'CC'), ('has', 'VBZ'), ('been', 'VBN'), ('termed', 'VBN'), ('the', 'DT'), ('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), (',', ','), ('most', 'JJS'), ('desirable', 'JJ'), (',', ','), ('most', 'JJS'), ('influential', 'JJ'), (',', ','), ('most', 'JJS'), ('visited', 'VBN'), (',', ','), ('most', 'RBS'), ('expensive', 'JJ'), (',', ','), ('innovative', 'JJ'), (',', ','), ('sustainable', 'JJ'), (',', ','), ('most', 'JJS'), ('investment', 'NN'), ('friendly', 'RB'), (',', ','), ('most', 'JJS'), ('popular', 'JJ'), ('for', 'IN'), ('work', 'NN'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('most', 'RBS'), ('vegetarian', 'JJ'), ('friendly', 'JJ'), ('city', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.')]


### Task4 

In [9]:
import math
from collections import Counter

# TF-IDF
def compute_tfidf(corpus):
    def compute_tf(text):
        tf_text = Counter(text)
        for i in tf_text:
            tf_text[i] = tf_text[i] / float(len(tf_text))
        return tf_text

    def compute_idf(word, corpus):
        return math.log10(len(corpus) / sum([1.0 for i in corpus if word in i]))

    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf(word, corpus)
        documents_list.append(tf_idf_dictionary)    
    return documents_list   

corpus = [
    ['pasta', 'la', 'vista', 'baby', 'la', 'vista'], 
    ['hasta', 'siempre', 'comandante', 'baby', 'la', 'siempre'], 
    ['siempre', 'comandante', 'baby', 'la', 'siempre']
]    

print(compute_tfidf(corpus))

[{'pasta': 0.11928031367991561, 'la': 0.0, 'vista': 0.23856062735983122, 'baby': 0.0}, {'hasta': 0.09542425094393249, 'siempre': 0.0704365036222725, 'comandante': 0.03521825181113625, 'baby': 0.0, 'la': 0.0}, {'siempre': 0.08804562952784062, 'comandante': 0.04402281476392031, 'baby': 0.0, 'la': 0.0}]


### Task5 

In [10]:
from nltk.stem.snowball import EnglishStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

english_stemmer = EnglishStemmer()

class StemmedCountVectorizer(TfidfVectorizer):
    def build_anyalyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

stem_vectorizer = StemmedCountVectorizer(stop_words='english')   

print(stem_vectorizer.get_stop_words())
print("***")
print(stem_vectorizer.fit_transform(simple_sentences))

frozenset({'mill', 'when', 'others', 'whether', 'again', 'whereas', 'yourself', 'co', 'for', 'throughout', 'even', 'seems', 'toward', 'once', 'both', 'herself', 'if', 'call', 'well', 'and', 'been', 'rather', 'here', 'too', 'etc', 'will', 'after', 'except', 'must', 'now', 'anyhow', 'might', 'thru', 'yours', 'back', 'over', 'what', 'only', 'these', 'do', 'whatever', 'anything', 'keep', 'name', 'find', 'them', 'forty', 'yet', 'go', 'however', 'hereby', 'sometime', 'within', 'us', 'see', 'found', 'several', 'become', 'amount', 'as', 'our', 'whereby', 'thence', 'whither', 'cannot', 'everywhere', 'you', 'himself', 'further', 'mine', 'much', 'between', 'neither', 'top', 'alone', 'beside', 'onto', 'about', 'take', 'along', 'therein', 'ourselves', 'hasnt', 'that', 'six', 'she', 'then', 'out', 'together', 'almost', 'ours', 'never', 'hers', 'or', 'ten', 'whole', 'no', 'am', 'whenever', 'wherein', 'eg', 'off', 'describe', 'was', 'via', 'beforehand', 're', 'somewhere', 'give', 'least', 'noone', 'so

### Task6

In [11]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "Steve Key, Mark Norkin and John Spenser are working in the Westminster Christian Academy"

from nltk.tag import PerceptronTagger
from nltk.data import find

PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = "file:" + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))

tagger = PerceptronTagger(load = False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

print(ne_chunk(pos_tag(word_tokenize(sentence))))

(S
  (PERSON Steve/NNP)
  (ORGANIZATION Key/NNP)
  ,/,
  (PERSON Mark/NNP Norkin/NNP)
  and/CC
  (PERSON John/NNP Spenser/NNP)
  are/VBP
  working/VBG
  in/IN
  the/DT
  (ORGANIZATION Westminster/NNP Christian/NNP Academy/NNP))


### Task7

In [12]:
import re
IN = re.compile(r'.*\bin\b(?!\b.+ing)')

# nltk.download('ieer')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus = 'ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


### Task8

In [13]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)  
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)   # { named_entity: subtree[1] }
                current_chunk = []
        else:
            continue
    return continuous_chunk

text = "Mark Norkin and John Spenser are working in New York."

print(get_continuous_chunks(text))

['Mark Norkin', 'John Spenser', 'New York']


### Task9 

In [14]:
sentence = "Mark Norkin and John Spenser are working in the Westminster Christian Academy"

for sent in nltk.sent_tokenize(sentence):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label(), ''.join(c[0] for c in chunk))

PERSON Mark
ORGANIZATION Norkin
PERSON JohnSpenser
ORGANIZATION WestminsterChristianAcademy
