In [1]:
import numpy as np
import nltk
import re
import unicodedata 
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


In [16]:

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list of words.

    Each token is NFKD-normalized first, so an accented letter decomposes
    into a base letter plus combining marks; anything that still cannot be
    encoded as ASCII is then discarded.

    Returns a new list with exactly one entry per input token (an entry may
    be the empty string if the whole token was non-ASCII).
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens that end up empty.

    A character counts as punctuation when it is neither a word character
    (``\\w``) nor whitespace (``\\s``). Returns a new list.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every all-digit token in a list of tokenized words.

    Tokens for which ``str.isdigit()`` is true are passed to the inflect
    engine's ``number_to_words``; every other token is kept unchanged.
    Returns a new list of the same length as `words`.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the tokens that are not in NLTK's 'english'
        stop-word list, in their original order. Matching is an exact,
        case-sensitive string comparison.
    """
    # Fetch the stop-word list once per call and keep it in a set; the list
    # does not change between tokens, so there is no reason to re-request it
    # (and linearly scan it) for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in `words`, as a new list."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize each token in `words` as a verb and return the new list.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with
    ``pos='v'``, i.e. it is treated as a verb regardless of its real role.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run a list of tokens through the text-cleaning pipeline.

    Steps, applied in order: strip non-ASCII characters, lowercase, remove
    punctuation, spell out all-digit tokens. Returns the resulting list.
    """
    # Stop words are kept: remove_stopwords is not applied in this pipeline.
    pipeline = (remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers)
    for step in pipeline:
        words = step(words)
    return words


In [17]:
def stem_and_lemmatize(words):
    """Return the tuple ``(stems, lemmas)`` for a list of tokens.

    ``stems`` comes from ``stem_words`` and ``lemmas`` from
    ``lemmatize_verbs``; both are computed from the same input `words`.
    """
    return stem_words(words), lemmatize_verbs(words)


In [28]:
def processtext(textfile):
    """Split lines of text into normalized, tokenized sentences.

    Args:
        textfile: iterable of strings, e.g. the list returned by
            ``f.readlines()``.

    Returns:
        A list with one entry per '.'-delimited fragment of each line, in
        input order. Each entry is the result of calling ``normalize`` on
        the fragment's whitespace-separated tokens. Fragments without any
        tokens (such as the text after a trailing '.') are still passed to
        ``normalize`` and kept in the result. An empty `textfile` yields an
        empty list.
    """
    # Initialize the result before any loop so that empty input returns []
    # instead of raising UnboundLocalError.
    n_sents = []
    for line in textfile:
        for sentence in line.split('.'):
            n_sents.append(normalize(sentence.split()))
    return n_sents

In [18]:
# Read orig_taske.txt into a list of lines (line endings are kept).
with open('./corpus-20090418/orig_taske.txt') as f:
    textfile = list(f)

In [19]:
# Flatten the file into raw '.'-delimited fragments (no normalization yet).
# processtext() performs this same split internally on its own input.
sentences = []
for line in textfile:
    sentences.extend(line.split('.'))

In [34]:
# Tokenize and normalize the lines read into `textfile`; the result is a list
# of token lists, one per '.'-delimited fragment.
srctxt = processtext(textfile)

In [35]:
# Word -> integer id mapping, and the next id to hand out.
vocab = {}
curr = 0

In [36]:
# Give every distinct token in srctxt the next free integer id, in order of
# first appearance.
for s in srctxt:
    for w in s:
        if w in vocab:
            continue
        vocab[w] = curr
        curr += 1

In [78]:
# Index every sentence of srctxt under a coarse fingerprint of the form
# "<token count>_<sum of the tokens' vocab ids>". Sentences that share a
# fingerprint are collected in the same list.
sentdict = {}

for line in srctxt:
    num = 0
    for w in line:
        num = num + vocab[w]
    key = '_'.join([str(len(line)), str(num)])
    sentdict.setdefault(key, []).append(line)

In [79]:
# Number of entries in the word -> id mapping.
len(vocab)

227

In [91]:
# Read g4pE_taske.txt into a list of lines (line endings are kept).
with open('./corpus-20090418/g4pE_taske.txt') as f:
    textfiletest = list(f)

In [92]:
# Tokenize and normalize the lines read into `textfiletest`, using the same
# processtext() call that produced srctxt.
test0 = processtext(textfiletest)

In [86]:
# Fraction of the words in test0 that belong to sentences whose fingerprint
# ("<token count>_<sum of vocab ids>") also occurs in sentdict.
cpycnt = 0  # words in sentences that matched a fingerprint in sentdict
totcnt = 0  # all words in test0
for line in test0:
    totcnt += len(line)
    # Every word of every indexed sentence is in vocab, so a sentence with an
    # out-of-vocabulary word cannot match; skipping it also avoids a KeyError.
    if any(word not in vocab for word in line):
        continue
    # Sum the ids of the words of THIS sentence (not of the whole document),
    # then build the key once the sum is complete.
    num = sum(vocab[word] for word in line)
    key = str(len(line)) + '_' + str(num)
    # NOTE: this is a fingerprint match only; sentdict[key] holds the
    # candidate sentences if an exact comparison is ever needed.
    if key in sentdict:
        cpycnt += len(line)
# Guard against an empty test document.
cpycnt / totcnt if totcnt else 0.0

0.0