In [14]:
import spacy
import nltk
from nltk.corpus import wordnet 
import os 
import collections
from unidecode import unidecode

Loading Corpus:

In [15]:
nlp = spacy.load('en_core_web_sm')

In [16]:
def handle_input_from_file(filename):
    if os.path.isfile(filename) :
        result = []
        with open(filename) as f:
            lines = (f.readlines())
            f.close()
        for line in lines:
            if(line[0]!='#' and line!="" and line != " " and line != "\n"):
                result.append(line.strip())
        return result, False
    raise Exception("Something went wrong trying to read the file " + filename + ". The program will now exit.")


Create Paragraphs:

In [17]:
paragraphs = handle_input_from_file("./antioedipus.txt")


In [18]:
named_entity_blacklist = []

Tokenizer

In [19]:
docs = []
for paragraph in paragraphs[0]: 
    docs.append(nlp(unidecode(paragraph)))
    docs.append('\n')

words = []
named_entities = []
for doc in docs:
    if(doc != '\n' and len(doc.ents)>0):
        for ent in doc.ents:
            for word in ent:
                word.ent_id = 1
    for word in doc: 
        if doc != '\n' and word.ent_id_!='' and word.text.lower() not in named_entity_blacklist:
            # Ensures that named entities are added to an exception list.
            # Not entirely sure this should be done, should be determined 
            # Do I really want the end result to just be named entities 
            # and blanks? 
            named_entities.append(word.text) 
            words.append(word)
        else: 
            words.append(word)


In [20]:
sorted(collections.Counter(named_entities).items(), reverse=True, key=lambda x:x[1])

[('one', 3), ('Lenz', 3), ('Schreber', 2), ('Buchner', 1)]

In [21]:
def synonym(inputdoc, used_words):
    input = inputdoc.text # Collects text from tokens
    for syn in wordnet.synsets(input): # For each synonym for the text...
        if syn.pos().lower() == inputdoc.pos_.lower()[0]:   # Makes sure that the synonym 
                                                            # is of same meaning by comparing 
                                                            # part-of-speech tags
            for i in syn.lemmas():  
                word = i.name().replace('_', ' ') # Extracts the actual text of the synonym
                if word.lower()!=input.lower() and (
                        '*'+word.lower()+'*' not in used_words) and (
                        '*'+word+'*' not in used_words): # quite inefficient comparisons
                    return '*'+ word + '*'  # The "*"s marks replaced words. Not sure 
                                            # this should be kept in the final version.

    return '_' # If no viable synonym is found, a blank is returned. 

In [22]:
stops =     [',', '.', '!', '-', 
            '\"', '\'', ')', '(', 
            ':', ';']

In [23]:
stopwords =     ['a', 'me', 'the', 'you', 
                'i', 'and', 'or', 'is', 
                'of', 'to', 'in', 'that', 
                'not', 'can' , 'it', 'from'
                'in', 'be', 'no', 'about',
                'as', 'there', 'was', 'this',
                'are', 'malevich', 'duchamp',
                '\'s', 'we']    # This is based on Mladen Stilinovic and Vlado Marteks
                                # "Work: Praise of Laziness", which claims that Artists
                                # should embrace Laziness because otherwise they can't be
                                # artists but instead become producers. 
                                # TODO: I think it should be organically generated based on 
                                # each specific text, or maybe they shouldn't be protected 
                                # from removal by the algorithm at all.

In [24]:
stop_and_words = stops + stopwords

Main Algorithm:

In [25]:
def uniqueizer(words):
    result = []
    filtered = []
    for word in words:
        if type(word) ==str:
            result.append(word)
            continue
        word_text = word.text
        if (word_text.lower() not in stop_and_words) and (word_text in result or word_text.lower() in result and word_text.lower() not in named_entities): 
            new_word = synonym(word, result)
            if new_word == None:
                new_word = '_'
            result.append(new_word)
            filtered.append(word_text.lower())
        else:
            result.append(word_text)
    return result, filtered


In [26]:
prel_result, filtered_out = uniqueizer(words)

In [27]:
sorted(collections.Counter(filtered_out).items(), reverse=True, key=lambda x:x[1])

[('machines', 12),
 ('machine', 12),
 ('with', 7),
 ('other', 6),
 ('all', 6),
 ('his', 6),
 ('nature', 5),
 ('at', 4),
 ('produces', 3),
 ('an', 3),
 ('himself', 3),
 ('he', 3),
 ('have', 3),
 ('everywhere', 2),
 ('something', 2),
 ('for', 2),
 ('lenz', 2),
 ('relationship', 2),
 ('on', 2),
 ('without', 2),
 ('father', 2),
 ('every', 2),
 ('into', 2),
 ('one', 2),
 ('process', 2),
 ('times', 1),
 ('ones', 1),
 ('mouth', 1),
 ('organ', 1),
 ('energy', 1),
 ('judge', 1),
 ('schreber', 1),
 ('by', 1),
 ('walk', 1),
 ('stroll', 1),
 ('outdoors', 1),
 ('gods', 1),
 ('mother', 1),
 ('what', 1),
 ('than', 1),
 ('?', 1),
 ('body', 1),
 ('has', 1),
 ('time', 1),
 ('before', 1),
 ('dichotomy', 1),
 ('does', 1),
 ('such', 1),
 ('man', 1),
 ('schizophrenic', 1),
 ('life', 1),
 ('self', 1),
 ('outside', 1),
 ('any', 1)]

In [28]:
def flatten(words):
    result = ' '
    for word in words:
        result += word+' '
    return result

In [29]:
# When recombining the tokens, this method helps ensure that there are 
# no spaces before stopsigns ".", ",", "!" etc. This would be more efficient
# if it was combined with the method above.
def remove_space_before_stop(text):
    for char in stops:
        text = text.replace(' '+char, char)
    text = text.replace('( ', '(')
    text = text.replace('- ', '-')
    return text

In [30]:
flattened = flatten(prel_result)
result = remove_space_before_stop(flattened)

In [31]:
def write_to_file(text):
    f = open("output.txt", "w")

    f.write(text)

    f.close()


In [32]:
write_to_file(result)

In [33]:
result

' It is at work everywhere, \n functioning smoothly _ times, \n _ other *multiplication* in fits and starts. \n It breathes, it heats, it eats. \n It shits and fucks. \n What a mistake to have ever said the i d. \n _ it is machines aEUR" real ones, not figurative *one*: *machine* driving _ *simple machine*, *political machine* being driven by _ *car*, with all the necessary couplings and connections. \n An organ-machine is plugged into an energy-source-*auto*: the one produces a flow that the _ inter-rupts. The breast is a *automobile* that *produce* milk, and the mouth i *motorcar* coupled to it. The *oral cavity* of the anorexic wavers between several functions: its possessor is uncertain as to whether it is _ eating-_, _ anal _, a talking-_, or a breathing _(asthma attacks). Hence we are _ handymen: each _ his little _. For every *electric organ*-_, _ *free energy*-_: _ the time, flows and interruptions. \n Judge Schreber * has sunbeams in _ ass. A solar anus. And rest assured that 