In [1]:
import nltk
import re
import string
from pprint import pprint
# Sample documents used by the cells below. They mix contractions,
# punctuation, a currency amount and symbol noise (@@, **, parentheses).
corpus = ["The brown fox wasn't that quick and he couldn't win the race",
          "Hey that's a great deal! I just bought a phone for $199",
          "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]

In [2]:
#tokenizing
def tokenize_text(text):
    """Split ``text`` into sentences, then each sentence into word tokens.

    Returns a list with one entry per sentence reported by
    ``nltk.sent_tokenize``; each entry is the token list that
    ``nltk.word_tokenize`` produces for that sentence.
    """
    tokens_per_sentence = []
    for sentence in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sentence))
    return tokens_per_sentence

In [6]:
# One entry per document; each entry holds that document's per-sentence token lists.
token_list = list(map(tokenize_text, corpus))

In [12]:
#removing special characters
def remove_characters_after_tokenization(tokens):
    """Strip punctuation from every token and drop tokens left empty.

    Each character listed in ``string.punctuation`` is deleted from each
    token; tokens that consisted only of punctuation disappear.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
        A real list is returned (rather than the result of ``filter``) so
        the value can be printed, indexed and truth-tested on Python 3,
        where ``filter`` yields a lazy iterator that is always truthy.
    """
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped_tokens = (pattern.sub('', token) for token in tokens)
    return [token for token in stripped_tokens if token]

In [13]:
# Clean each sentence's tokens and drop sentences that end up with no tokens.
# Explicit lists (instead of nested filter() calls) keep the result printable
# and the emptiness check meaningful on Python 3 as well as Python 2.
filtered_list_1 = [
    [cleaned for cleaned in
     (list(remove_characters_after_tokenization(tokens)) for tokens in sentence_tokens)
     if cleaned]
    for sentence_tokens in token_list
]
print(filtered_list_1)

[[['The', 'brown', 'fox', 'was', 'nt', 'that', 'quick', 'and', 'he', 'could', 'nt', 'win', 'the', 'race']], [['Hey', 'that', 's', 'a', 'great', 'deal'], ['I', 'just', 'bought', 'a', 'phone', 'for', '199']], [['You', 'll', 'learn', 'a', 'lot', 'in', 'the', 'book'], ['Python', 'is', 'an', 'amazing', 'language']]]


In [15]:
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    """Delete unwanted characters from a raw sentence before tokenizing it.

    Leading and trailing whitespace is stripped first. With
    ``keep_apostrophes`` set, only the symbols listed in the character class
    are removed, so apostrophes and sentence punctuation survive. Otherwise
    every character that is not an ASCII letter, a digit or a space is removed.
    """
    # NOTE: '|' is a literal inside a character class, so pipes are removed too.
    symbols_only = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
    non_alphanumeric = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
    chosen_pattern = symbols_only if keep_apostrophes else non_alphanumeric
    return re.sub(chosen_pattern, r'', sentence.strip())

In [16]:
# Default mode: keep only letters, digits and spaces in every document.
filtered_list_2 = [remove_characters_before_tokenization(sentence)
                   for sentence in corpus]

# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(filtered_list_2)

['The brown fox wasnt that quick and he couldnt win the race', 'Hey thats a great deal I just bought a phone for 199', 'Youll learn a lot in the book Python is an amazing language']


In [17]:
# Remove only the listed symbols so apostrophes survive for contraction expansion.
cleaned_corpus = [remove_characters_before_tokenization(sentence, keep_apostrophes=True)
                  for sentence in corpus]

# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language!"]


In [18]:
from contractions import CONTRACTION_MAP

In [20]:
def expand_contractions(sentence, contraction_mapping):
    """Replace every contraction in ``sentence`` with its expanded form.

    Args:
        sentence: text to process.
        contraction_mapping: dict mapping a contraction (e.g. "wasn't") to
            its expansion (e.g. "was not"). Matching is case-insensitive.

    Returns:
        The sentence with all known contractions expanded. If the matched
        contraction starts with an upper-case letter, the expansion's first
        letter is upper-cased as well; otherwise the expansion is inserted
        exactly as stored in the mapping.
    """
    # Longest keys first so that e.g. "he'd've" is tried before "he'd";
    # regex alternation otherwise takes the first alternative that matches.
    alternatives = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    if not alternatives:
        # An empty alternation would match the empty string at every position.
        return sentence

    # Lower-cased view of the mapping so a case-insensitive match always
    # resolves, whatever the casing of the keys.
    lowered_mapping = dict((key.lower(), value)
                           for key, value in contraction_mapping.items())

    # Keys are literal text, not regular expressions, so escape them.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # Nothing usable to substitute: leave the text as it was.
            return match
        # Carry over capitalisation only. Splicing the contraction's first
        # character onto expansion[1:] corrupts expansions that begin with a
        # different letter ("ain't" -> "is not" became "as not").
        if match[0].isupper():
            expanded_contraction = (expanded_contraction[0].upper()
                                    + expanded_contraction[1:])
        return expanded_contraction

    return contractions_pattern.sub(expand_match, sentence)

In [21]:
# Expand contractions in the documents that still carry their apostrophes.
expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP)
                   for sentence in cleaned_corpus]
# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(expanded_corpus)

['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language!']


In [22]:
#removing stopwords
def remove_stopwords(tokens, ignore_case=False):
    """Return the tokens that are not in NLTK's English stop-word list.

    Args:
        tokens: iterable of string tokens.
        ignore_case: when True, each token is lower-cased before the lookup.
            The default (False) keeps the original exact-match behaviour,
            under which the recorded output below retains capitalised
            tokens such as 'The', 'I' and 'You'.

    Returns:
        A list of the surviving tokens in their original order and casing.
    """
    # A set gives constant-time membership tests; the list form was rescanned
    # from the start for every token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    if ignore_case:
        return [token for token in tokens if token.lower() not in stopword_set]
    return [token for token in tokens if token not in stopword_set]

In [23]:
# Tokenize the contraction-expanded documents, then drop stop words
# sentence by sentence, keeping the document/sentence nesting.
expanded_corpus_tokens = list(map(tokenize_text, expanded_corpus))
filtered_list_3 = []
for sentence_tokens in expanded_corpus_tokens:
    filtered_list_3.append([remove_stopwords(tokens) for tokens in sentence_tokens])

In [24]:
# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(filtered_list_3)

[[['The', 'brown', 'fox', 'quick', 'could', 'win', 'race']], [['Hey', 'great', 'deal', '!'], ['I', 'bought', 'phone', '199']], [['You', 'learn', 'lot', 'book', '.'], ['Python', 'amazing', 'language', '!']]]


In [29]:
# Demonstration: repeatedly delete one character of a doubled pair until no
# doubled character is left. Without a dictionary check this overshoots the
# correct spelling (see the final word printed below).
old_word = 'finalllyyyy'
# The greedy first group makes each substitution consume the whole word and
# drop a single character from the last doubled pair.
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

new_word = repeat_pattern.sub(match_substitution, old_word)
while new_word != old_word:
    # print(...) with one pre-formatted string is identical on Python 2 and 3
    print('Step: {} Word: {}'.format(step, new_word))
    step += 1 #update step
    #update old word to last substituted state
    old_word = new_word
    #remove one repeated character
    new_word = repeat_pattern.sub(match_substitution, old_word)

# The substitution changed nothing: no doubled characters remain.
print('Final word: {}'.format(new_word))

Step: 1 Word: finalllyyy
Step: 2 Word: finalllyy
Step: 3 Word: finallly
Step: 4 Word: finally
Step: 5 Word: finaly
Final word: finaly


In [30]:
from nltk.corpus import wordnet
# Same demonstration, but stop as soon as WordNet recognises the candidate,
# so the reduction does not overshoot a valid spelling.
old_word = 'finalllyyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

while True:
    #check for semantically correct word
    if wordnet.synsets(old_word):
        # print(...) with one pre-formatted string is identical on Python 2 and 3
        print('Final correct word: {}'.format(old_word))
        break
    #remove one repeated character
    new_word = repeat_pattern.sub(match_substitution, old_word)

    if new_word == old_word:
        # Nothing left to collapse and WordNet never recognised the word.
        print('Final word: {}'.format(new_word))
        break

    print('Step: {} Word: {}'.format(step, new_word))
    step += 1 #update step
    #update old word to last substituted state
    old_word = new_word

Step: 1 Word: finalllyyy
Step: 2 Word: finalllyy
Step: 3 Word: finallly
Step: 4 Word: finally
Final correct word: finally


In [31]:
def remove_repeated_characters(tokens):
    """Collapse repeated characters in each token until WordNet knows the word.

    For every token, one character of a doubled pair is removed per pass.
    Reduction stops as soon as ``wordnet.synsets`` returns a non-empty result
    for the candidate, or when no doubled character remains. Returns a list
    with one result per input token, in order.
    """
    doubled_char = re.compile(r'(\w*)(\w)\2(\w*)')

    def collapse(word):
        # Iterative form of the reduction: shrink until recognised or stable.
        while not wordnet.synsets(word):
            shortened = doubled_char.sub(r'\1\2\3', word)
            if shortened == word:
                break
            word = shortened
        return word

    return [collapse(token) for token in tokens]

In [32]:
sample_sentence = 'My schooool is realllllyyy amaaazingggg'
# tokenize_text returns one token list per sentence; take the first sentence's tokens.
sample_sentence_tokens = tokenize_text(sample_sentence)[0]
# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(sample_sentence_tokens)

['My', 'schooool', 'is', 'realllllyyy', 'amaaazingggg']


In [33]:
# Call form of print behaves the same on Python 2 and also runs on Python 3.
print(remove_repeated_characters(sample_sentence_tokens))

['My', 'school', 'is', 'really', 'amazing']
