In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Corpus creation 

- Requirements
    - Tokenization
        - Translate sentences into meaningful blocks
    - Build Bag-of-words
        - Vectorized dictionary
- Algorithms to explore
    - Skip-gram
    - Word2vec
    - TF-IDF
    - Hierarchical Softmax
    - Negative Sampling
    - Naïve Softmax
    - Morphological segmentation
    - Part-of-speech tagging
    - Sentence breaking
        - Topic detection
    - Requirements
        - Stop Words
        - Regular expression tokenizer
        - Snowball stemmer
    - Algorithms to consider
        - Latent Dirichlet Allocation (LDA) topic
        - Skip Gram (context prediction)
        - Multinomial Naïve Bayes


In [None]:
import pandas as pd
from nltk.tokenize import sent_tokenize

In [None]:
# Load the flattened Hansard spreadsheet. The raw-string prefix keeps the
# Windows-style backslashes literal: "\w" and "\H" are invalid escape
# sequences in a normal string literal and trigger a warning.
df = pd.read_excel(r".\word_tree\HansardFlatDatasetUpdated.xlsx")

In [None]:
df.head()

In [None]:
# Keep only the id and transcript columns, discarding any row that has a
# missing value in either of them.
dfTemp = df[['qTextID', 'talkerTranscript']].dropna()

In [None]:
import nltk

# Sentence-tokenizer models used by sent_tokenize / word_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to keep the notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

## Sentence Tokenize

In [None]:
# Speaker metadata plus the transcript text that gets sentence-tokenized.
dfTemp = df[['qtalkKind', 'textID', 'talkerRole', 'talkerName', 'talkerTranscript']]

# One entry per row of dfTemp, in row order, so positions stay aligned.
sentTokens = []
skippedRows = 0

for transcript in dfTemp['talkerTranscript']:
    if isinstance(transcript, str):
        # Real tokenizer failures (e.g. missing NLTK data) are allowed to
        # propagate instead of being silently replaced by a placeholder.
        sentTokens.append(sent_tokenize(transcript))
    else:
        # Non-text cells (such as NaN for a missing transcript) cannot be
        # tokenized. The placeholder is a one-item list so that every entry
        # of sentTokens is a list of sentences.
        sentTokens.append(["blank"])
        skippedRows += 1

# Summary instead of printing every tokenized transcript.
print("tokenized %d transcripts, skipped %d non-text rows"
      % (len(sentTokens) - skippedRows, skippedRows))


In [None]:
df.columns

## Word Tokenize - Treebank tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Word-tokenize the FIRST sentence of every transcript.
# NOTE(review): later sentences are not tokenized — confirm this is intended.
wordTokens = []
for sentences in sentTokens:
    # A transcript with no sentences gets an empty token list, which keeps
    # wordTokens aligned with sentTokens instead of raising IndexError.
    wordTokens.append(word_tokenize(sentences[0]) if sentences else [])

# Show a small sample rather than printing every token list.
wordTokens[:5]

## Filter Stopwords in a tokenized sentence

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
english_stops = set(stopwords.words('english'))

In [None]:
# Remove English stop words from every token list.
# The NLTK stop-word list is lower case, so each token is lower-cased for the
# membership test; otherwise capitalised forms such as "The" would be kept.
# The surviving tokens keep their original casing.
listOfWordsWithoutStops = [
    [word for word in listOfWords if word.lower() not in english_stops]
    for listOfWords in wordTokens
]

In [None]:
listOfWordsWithoutStops

## sysnets and hypernyms

In [None]:
from nltk.corpus import wordnet

# The WordNet corpus has to be installed before any synset lookup;
# nltk.download skips the fetch when the data is already up to date.
nltk.download('wordnet')

# First (most common) synset for 'police'.
syn = wordnet.synsets('police')[0]

In [None]:
syn.name()

In [None]:
syn.definition()

In [None]:
wordnet.synsets('police')[0].examples()

In [None]:
syn.hypernyms()

In [None]:
syn.hypernym_paths()

### Part of speech Tag
Noun = n
Adjective = a
Adverb = r
Verb = v

In [None]:
syn.pos()

## Lemmas and synonyms
#### A lemma (in linguistics), is the canonical form or morphological form of a word.

In [None]:
lemmas = syn.lemmas()
loops = len(lemmas)

In [None]:
# Print the name of each lemma belonging to the synset.
for lemma in lemmas:
    print(lemma.name())

In [None]:
lemmas[0].synset() == lemmas[1].synset()

In [None]:
# Collect the lemma names of every synset of 'politician'.
# The synset loop variable is deliberately not called `syn`: that name holds
# the 'police' synset used by other cells and must not be overwritten here.
synonyms = [
    lemma.name()
    for politician_synset in wordnet.synsets('politician')
    for lemma in politician_synset.lemmas()
]
len(synonyms)

In [None]:
synonyms

## Antonyms

In [None]:
gn2 = wordnet.synset('good.n.02')
gn2.definition()

In [None]:
evil = gn2.lemmas()[0].antonyms()[0]

In [None]:
# `name` is a method, so it needs call parentheses; returning both values as
# a tuple makes the cell display the antonym's name next to its definition.
evil.name(), evil.synset().definition()

In [None]:
ga1 = wordnet.synset('good.a.01')
ga1.definition()

In [None]:
bad = ga1.lemmas()[0].antonyms()[0]
bad.name()

In [None]:
bad.synset().definition()

## Calculating Similarity - Wu-Palmer Similarity

In [None]:
from nltk.corpus import wordnet


In [None]:
cb = wordnet.synset('police.n.01')

In [None]:
ib = wordnet.synset('constabulary.n.01')

In [None]:
cb.wup_similarity(ib)

In [None]:
# `ref` is the reference synset both words are measured against.
# NOTE(review): `ref` was not defined anywhere; the first hypernym of `cb` is
# used as the shared parent — confirm this is the intended reference.
ref = cb.hypernyms()[0]
cb.shortest_path_distance(ref)

In [None]:
# Distance from `ib` to the reference synset, taken as the first hypernym of
# `cb` and looked up inline so this cell does not rely on a separate `ref`.
# NOTE(review): confirm that cb's first hypernym is the intended reference.
ib.shortest_path_distance(cb.hypernyms()[0])

In [None]:
cb.shortest_path_distance(ib)

## Discover word collocations - (need to run pre stopwords) - also need to pick a larger text blob to get themes for the day

In [None]:
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
nltk.download('webtext')

In [None]:
words = [w.lower() for w in webtext.words('grail.txt')]

In [None]:
# pre stop words
# Bigram collocations on the word tokens BEFORE stop-word removal.
# The finder needs a sequence of words, so this iterates wordTokens;
# sentTokens holds whole sentence strings and would pair sentences instead.
listOfWordsWithoutStopsBigramCollocationFinder = []
for listOfWords in wordTokens:
    bcf = BigramCollocationFinder.from_words(listOfWords)
    # Four strongest bigrams of this token list by likelihood ratio.
    listOfWordsWithoutStopsBigramCollocationFinder.append(
        bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
    )

listOfWordsWithoutStopsBigramCollocationFinder

In [None]:
# post stop words
# For every stop-word-filtered token list, keep its four best bigrams
# (likelihood ratio). `bcf` stays bound to the finder of the last list.
listOfWordsWithoutStopsBigramCollocationFinder = []
for tokens in listOfWordsWithoutStops:
    keptTokens = list(filter(lambda word: word not in english_stops, tokens))
    bcf = BigramCollocationFinder.from_words(keptTokens)
    topBigrams = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
    listOfWordsWithoutStopsBigramCollocationFinder += [topBigrams]

listOfWordsWithoutStopsBigramCollocationFinder

In [None]:
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))


def filter_stops(w):
    """Return True for words shorter than three characters or found in `stopset`."""
    return len(w) < 3 or w in stopset


# `bcf` is whatever finder the preceding collocation loop built last.
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)

## TrigramCollocationFinder

In [None]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Flatten every stop-word-filtered token list into one lower-cased word
# sequence for the trigram finder. All words of each list are included (not
# just the first one), and empty lists simply contribute nothing.
# A comprehension also avoids rebinding the notebook-level names `loops` and
# `words`, which other cells use for different values.
allWords = [
    w.lower()
    for tokens in listOfWordsWithoutStops
    for w in tokens
]


In [None]:
allWords

In [None]:
# Build a trigram collocation finder over the words collected in allWords.
tcf = TrigramCollocationFinder.from_words(allWords)
# Drop trigrams containing a word rejected by filter_stops (defined in an earlier cell).
tcf.apply_word_filter(filter_stops)
# Drop trigrams that occur fewer than three times.
tcf.apply_freq_filter(3)
# Four best remaining trigrams ranked by likelihood ratio.
tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
    

### Stemming words

### Lemmatizing words with WordNet

### Replacing words matching regular expressions

### Removing repeating characters

### Spelling correction with Enchant

### Replacing synonyms

### Replacing negations with antonyms

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('police')

In [None]:
stemmer.stem('politician')

## Lancaster stemming algorithm

In [None]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('running')

In [None]:
stemmer.stem('runner')

## RegexpStemmer

In [None]:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
stemmer.stem('running')

In [None]:
stemmer.stem('runner')

In [None]:
stemmer.stem('Adelaide')

## SnowballStemmer

In [None]:
from nltk.stem import SnowballStemmer

# The Snowball stemmer is created for English, so the variable is named to match.
english_stemmer = SnowballStemmer('english')
english_stemmer.stem('howdy')

## Lemmatizing words with WordNet

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('running')

In [None]:
lemmatizer.lemmatize('running', pos='v')

In [None]:
lemmatizer.lemmatize('runners')

## Important example for difference between stemming and lemmatization 

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('believes')

In [None]:
lemmatizer.lemmatize('believes')

## Combining stemming with lemmatization

In [None]:
stemmer.stem('buses')

In [None]:
lemmatizer.lemmatize('buses')

In [None]:
stemmer.stem('bus')

For the five-character word 'buses': stemming saves one character, lemmatization saves two
characters, and stemming the lemma saves a total of three characters out of five. That is
a 60% compression rate! This level of word compression over many thousands of words,
while unlikely to always produce such high gains, can still make a huge difference.

## Spell correct 

In [None]:
# import enchant
from nltk.metrics import edit_distance
# class SpellingReplacer(object):
#     def __init__(self, dict_name='en', max_dist=2):
#         self.spell_dict = enchant.Dict(dict_name)
#         self.max_dist = max_dist
    
#     def replace(self, word):
#         if self.spell_dict.check(word):
#             return word
#         suggestions = self.spell_dict.suggest(word)
    
#         if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
#             return suggestions[0]
#         else:
#             return word

In [None]:
# from replacers import SpellingReplacer
# replacer = SpellingReplacer()
# replacer.replace('word')

## Replacing synonyms

In [None]:
class WordReplacer(object):
    """Swap words for their replacements using a fixed lookup table."""

    def __init__(self, word_map):
        """Store `word_map`, a mapping from a word to its replacement."""
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped replacement for `word`, or `word` itself if unmapped."""
        fallback = word
        return self.word_map.get(word, fallback)

In [None]:
replacer = WordReplacer({'sapol': 'police'})
replacer.replace('sapol')

# csv replacement


In [None]:
# requires a csv with 2 columns: the word to look for, then the word that replaces it
# word     |    replacement
# sapol    |     police

In [None]:
import csv
class CsvWordReplacer(WordReplacer):
    """WordReplacer whose lookup table is loaded from a two-column CSV file.

    The first column of each row is the word to look up and the second column
    is the value `replace` returns for it.
    """

    def __init__(self, fname):
        """Read `fname` and build the word map.

        Raises:
            ValueError: if a non-empty row does not have exactly two columns.
        """
        word_map = {}
        # `with` closes the file even when a row is malformed; newline='' is
        # the mode the csv module expects for files it reads.
        with open(fname, newline='') as csv_file:
            for line in csv.reader(csv_file):
                if not line:
                    # csv.reader yields an empty list for a blank line
                    continue
                word, syn = line
                word_map[word] = syn
        super(CsvWordReplacer, self).__init__(word_map)

# replace words with  Antonym 

In [None]:
from nltk.corpus import wordnet
class AntonymReplacer(object):
    """Replace words, and 'not <word>' pairs, with an unambiguous WordNet antonym."""

    def replace(self, word, pos=None):
        """Return the antonym of `word` when exactly one distinct antonym exists.

        Antonym names are gathered from every lemma of every synset of `word`
        (optionally restricted to part of speech `pos`). None is returned
        when there is no antonym or more than one distinct antonym name.
        """
        found = {
            antonym.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for antonym in lemma.antonyms()
        }
        return found.pop() if len(found) == 1 else None

    def replace_negations(self, sent):
        """Return a new word list where 'not X' is collapsed into X's antonym.

        `sent` is a sequence of words. A 'not' followed by a word for which
        `replace` returns a truthy value is replaced, together with that
        word, by the returned value; everything else is copied unchanged.
        """
        result = []
        skip_next = False
        total = len(sent)
        for position, word in enumerate(sent):
            if skip_next:
                # this word was consumed by the preceding 'not'
                skip_next = False
                continue
            if word == 'not' and position + 1 < total:
                opposite = self.replace(sent[position + 1])
                if opposite:
                    result.append(opposite)
                    skip_next = True
                    continue
            result.append(word)
        return result

In [None]:
replacer = AntonymReplacer()
replacer.replace('nice')

In [None]:
replacer.replace('uglify')

In [None]:
sent = ["lets", 'not', 'uglify', 'america', 'great', 'again']

In [None]:
replacer.replace_negations(sent)

In [None]:
class AntonymWordReplacer(WordReplacer, AntonymReplacer):
    """Negation replacer whose antonyms come from a custom word map.

    `replace` is inherited from WordReplacer (it is first in the MRO) and
    returns the word itself when it has no mapping. The negation logic is
    therefore overridden here so that only words with an explicit entry in
    `word_map` can absorb a preceding 'not'.
    """

    def replace_negations(self, sent):
        """Return a new word list where 'not X' becomes `word_map[X]`.

        A 'not' is left in place when the following word has no (truthy)
        entry in `word_map`, so the negation is never silently dropped.
        """
        words = []
        i, n = 0, len(sent)
        while i < n:
            word = sent[i]
            if word == 'not' and i + 1 < n:
                # .get returns None for unmapped words, unlike self.replace
                ant = self.word_map.get(sent[i + 1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words

In [None]:
replacer = AntonymWordReplacer({'evil': 'good'})
replacer.replace_negations(['good', 'is', 'not', 'evil'])

# Creating the Corpora

In [None]:
import os, os.path
# Per-user NLTK data directory, with "~" expanded to the home directory.
path = os.path.expanduser('~/nltk_data')

# Create the directory on first use.
if not os.path.exists(path):
    os.mkdir(path)

# Not the last expression of the cell, so this result is not displayed.
os.path.exists(path)

import nltk.data
# Displays whether the directory is one of NLTK's data search paths.
path in nltk.data.path

In [None]:
from nltk.corpus.reader import WordListCorpusReader

In [None]:
# NOTE(review): the second argument of a corpus reader is normally a list of
# file names (or a file-name pattern) under the root directory; here the
# tokenized word lists are passed instead — confirm this is intended.
reader = WordListCorpusReader(".",wordTokens)

In [None]:
a = list(reader.fileids())

In [None]:
# Write one entry of `a` per line. The encoding is fixed to UTF-8 so the file
# does not depend on the platform default (NLTK corpus readers decode as
# UTF-8 unless told otherwise). The `with` block closes the file on exit.
with open('words.txt', 'w', encoding='utf-8') as f:
    for item in a:
        # wrap in a one-tuple so an `item` that is itself a tuple formats safely
        f.write("%s\n" % (item,))

In [None]:
from nltk.tokenize import line_tokenize
# line_tokenize(reader)

In [None]:
import nltk
nltk.download('names')
nltk.download('words')

In [None]:
from nltk.corpus import names
names.fileids()

In [None]:
from nltk.corpus import words
words.fileids()

In [None]:
from nltk.corpus.reader import TaggedCorpusReader

In [None]:
reader = TaggedCorpusReader('.', r'.*\.txt')

In [None]:
reader

In [None]:
reader.tagged_words()

In [None]:
reader.sents()

In [None]:
reader.tagged_sents()

In [None]:
# reader.paras()

In [None]:
# reader.tagged_paras()

In [None]:
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader('.', r'.*\.txt', word_tokenizer=SpaceTokenizer())
reader.words()

In [None]:
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader('.', r'.*\.txt', sent_tokenizer=LineTokenizer())
reader.sents()

# universal tagset

In [None]:
import nltk
nltk.download('universal_tagset')

In [None]:
# Read every .txt file in the current directory, declaring its tags as the
# 'en-brown' tagset.
reader = TaggedCorpusReader('.', r'.*\.txt', tagset='en-brown')
# Request the tagged words mapped to the 'universal' tagset; the result is
# stored in `kk` and not displayed.
kk = reader.tagged_words(tagset='universal')