In [1]:
# read orginal books to strings
f = open("Dick Philip. Ubik.txt","r")
ubik = f.read()
f = open("Dick Philip. Valis.txt","r")
valis = f.read()
f = open("Dick Philip. A Maze of Death.txt","r")
maze = f.read()
f = open("Dick Philip. Radio Free Albemuth.txt","r")
radio = f.read()
f = open("Dick Philip. The Three Stigmata of Palmer Eldritch.txt","r")
palmer = f.read()

In [6]:
# Create corpus from books
# Concatuate in one list

# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def books_to_corpus(books):
    corpus = list()
    for i in books:
        # Tokenize the article: tokens
        tokens = word_tokenize(i)
        # Convert the tokens into lowercase: lower_tokens
        lower_tokens = [t.lower() for t in tokens]
        # Retain alphabetic words: alpha_only
        alpha_only = [t for t in lower_tokens if t.isalpha()]
        # Remove all stop words: no_stops
        no_stops = [t for t in alpha_only if t not in stopwords.words('english')]
        # Instantiate the WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        # Lemmatize all tokens into a new list: lemmatized
        lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]
        corpus.append(lemmatized) 
    return corpus

titles = ['ubik', 'valis', 'maze', 'radio', 'palmer']
corpus_dick = books_to_corpus([ubik, valis, maze, radio, palmer])

dictofdick = dict(zip(titles, corpus_dick))

In [8]:
from collections import Counter

def bow(text, top):
    # Create the bag-of-words: bow
    bow = Counter(text)
    # Print most common tokens
    return bow.most_common(top)

print('Dick Philip. Ubik 50 top words \n')
print(bow(dictofdick['ubik'], 50), '\n')
print('Dick Philip. Valis 50 top words \n')
print(bow(dictofdick['valis'], 50), '\n')
print('Dick Philip. A Maze of Death 50 top words \n')
print(bow(dictofdick['maze'], 50), '\n')
print('Dick Philip. Radio Free Albemuth 50 top words \n')
print(bow(dictofdick['radio'], 50), '\n')
print('Dick Philip. The Three Stigmata of Palmer Eldritch 50 top words \n')
print(bow(dictofdick['palmer'], 50), '\n')

Dick Philip. Ubik 50 top words 

[('said', 1184), ('joe', 717), ('runciter', 467), ('one', 239), ('al', 204), ('know', 191), ('back', 181), ('u', 179), ('get', 163), ('time', 161), ('could', 160), ('like', 160), ('pat', 157), ('would', 156), ('thought', 149), ('chip', 132), ('asked', 130), ('way', 110), ('ubik', 106), ('right', 104), ('door', 104), ('room', 103), ('go', 103), ('see', 102), ('think', 96), ('denny', 96), ('made', 91), ('hand', 90), ('tell', 90), ('want', 90), ('moratorium', 90), ('going', 89), ('maybe', 88), ('ca', 86), ('make', 85), ('say', 84), ('jory', 84), ('new', 81), ('voice', 81), ('come', 81), ('got', 80), ('take', 80), ('ella', 77), ('face', 76), ('look', 75), ('two', 74), ('something', 74), ('hollis', 69), ('year', 64), ('girl', 64)] 

Dick Philip. Valis 50 top words 

[('said', 1109), ('fat', 979), ('kevin', 365), ('time', 307), ('one', 302), ('god', 288), ('could', 251), ('u', 243), ('would', 242), ('sherri', 194), ('know', 187), ('like', 157), ('say', 155), 

In [9]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(list(dictofdick.values()))

# Create a MmCorpus: corpus
dick_corpus = [dictionary.doc2bow(article) for article in list(dictofdick.values())]

# Print the first 10 word ids with their frequency counts from the first document, Ubik
print(dick_corpus[0][:10])

[(0, 2), (1, 1), (2, 8), (3, 1), (4, 1), (5, 1), (6, 16), (7, 28), (8, 1), (9, 6)]


In [10]:
from collections import defaultdict
import itertools

# Save the fifth document, Ubik
ubik_gensim = dick_corpus[0]

def print_words_freq(book, corpus, dictionary, top):
    # Sort the doc for frequency: bow_doc
    bow_doc = sorted(book, key=lambda w: w[1], reverse=True)
    # Print the top 5 words of the document alongside the count
#     for word_id, word_count in bow_doc[:5]:
#         print(dictionary.get(word_id), word_count)
    # Create the defaultdict: total_word_count
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus):
        total_word_count[word_id] += word_count
    # Create a sorted list from the defaultdict: sorted_word_count
    sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 
    # Print the top 5 words across all documents alongside the count
    for word_id, word_count in sorted_word_count[:top]:
        print(dictionary.get(word_id), word_count)
        
print('Dick Philip. Ubik 50 top frequent words using Gensim without tfidf \n')
print_words_freq(ubik_gensim, dick_corpus, dictionary, 10)

Dick Philip. Ubik 50 top frequent words using Gensim without tfidf 

said 5527
one 1292
could 1037
would 1030
time 1006
know 1003
fat 992
like 861
u 840
back 803


In [11]:
# Import TfidfModel
from gensim.models.tfidfmodel import TfidfModel 

def print_tfidf_freq(book, corpus, dictionary, top = 10, order = True):

    # Create a new TfidfModel using the corpus: tfidf
    tfidf = TfidfModel(corpus)

    # Calculate the tfidf weights of doc: tfidf_weights
    tfidf_weights = tfidf[book]

    # # Print the first five weights
    # print(tfidf_weights[:5])

    # Sort the weights from highest to lowest: sorted_tfidf_weights
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=order)
    
   
    # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights[:top]:
        print(dictionary.get(term_id), round(weight,4))

print('Dick Philip. Ubik 15 top frequent words with tfidf \n')
print_tfidf_freq(dick_corpus[0], dick_corpus, dictionary, 15)
print('\nDick Philip. Valis 15 top frequent words with tfidf\n')
print_tfidf_freq(dick_corpus[1], dick_corpus, dictionary, 15)
print('\nDick Philip. A Maze of Death 15 top frequent words with tfidf\n')
print_tfidf_freq(dick_corpus[2], dick_corpus, dictionary, 15)
print('\nDick Philip. Radio Free Albemuth 15 top frequent words with tfidf\n')
print_tfidf_freq(dick_corpus[3], dick_corpus, dictionary, 15)
print('\nDick Philip. The Three Stigmata of Palmer Eldritch 15 top frequent words with tfidf\n')
print_tfidf_freq(dick_corpus[4], dick_corpus, dictionary, 15)

Dick Philip. Ubik 15 top frequent words with tfidf 

runciter 0.683
joe 0.597
denny 0.1404
moratorium 0.1316
jory 0.1228
hollis 0.1009
wendy 0.0892
ubik 0.0883
inertials 0.079
hammond 0.076
vogelsang 0.0673
ella 0.0641
ashwood 0.0629
chip 0.0613
conley 0.06

Dick Philip. Valis 15 top frequent words with tfidf

kevin 0.7057
sherri 0.3751
gloria 0.1953
lampton 0.1933
mini 0.1817
sophia 0.1566
vali 0.1409
horselover 0.1353
david 0.1167
linda 0.1145
zebra 0.1025
eric 0.0936
maurice 0.0909
fremount 0.0889
plasmate 0.0812

Dick Philip. A Maze of Death 15 top frequent words with tfidf

morley 0.7029
belsnor 0.3422
babble 0.2787
seth 0.2653
thugg 0.175
frazer 0.1672
maggie 0.1641
russell 0.1349
susie 0.1223
walsh 0.1223
tallchief 0.1177
squib 0.1084
wade 0.0929
noser 0.0774
betty 0.0681

Dick Philip. Radio Free Albemuth 15 top frequent words with tfidf

nicholas 0.6086
fremont 0.3354
rachel 0.3279
vivian 0.2904
aramchek 0.2603
sadassa 0.1981
vali 0.1539
phil 0.1411
kaplan 0.1126
progressive 0.

In [12]:
print('Dick Philip. Ubik 15 top infrequent words with tdidf \n')
print_tfidf_freq(dick_corpus[0], dick_corpus, dictionary, 15, False)
print('\nDick Philip. Valis 15 top infrequent words with tdidf\n')
print_tfidf_freq(dick_corpus[1], dick_corpus, dictionary, 15, False)
print('\nDick Philip. A Maze of Death 15 top infrequent words with tdidf\n')
print_tfidf_freq(dick_corpus[2], dick_corpus, dictionary, 15, False)
print('\nDick Philip. Radio Free Albemuth 15 top infrequent words with tdidf\n')
print_tfidf_freq(dick_corpus[3], dick_corpus, dictionary, 15, False)
print('\nDick Philip. The Three Stigmata of Palmer Eldritch 15 top infrequent words with tdidf\n')
print_tfidf_freq(dick_corpus[4], dick_corpus, dictionary, 15, False)

Dick Philip. Ubik 15 top infrequent words with tdidf 

abstract 0.0002
admittedly 0.0002
affair 0.0002
affect 0.0002
aimed 0.0002
alertly 0.0002
allow 0.0002
america 0.0002
amuse 0.0002
amused 0.0002
amusement 0.0002
angrily 0.0002
animosity 0.0002
apartment 0.0002
appeal 0.0002

Dick Philip. Valis 15 top infrequent words with tdidf

accustomed 0.0003
admittedly 0.0003
afford 0.0003
al 0.0003
alertly 0.0003
amuse 0.0003
amusement 0.0003
animosity 0.0003
appeal 0.0003
arrest 0.0003
aside 0.0003
assassin 0.0003
attorney 0.0003
bait 0.0003
beamed 0.0003

Dick Philip. A Maze of Death 15 top infrequent words with tdidf

abstract 0.0002
aching 0.0002
admire 0.0002
aid 0.0002
aimed 0.0002
alertly 0.0002
america 0.0002
amuse 0.0002
amused 0.0002
appalled 0.0002
artifact 0.0002
ascended 0.0002
augmented 0.0002
avoid 0.0002
awaken 0.0002

Dick Philip. Radio Free Albemuth 15 top infrequent words with tdidf

accustomed 0.0003
aching 0.0003
adequate 0.0003
admire 0.0003
afford 0.0003
al 0.0003
anim

In [57]:
from nltk import pos_tag

# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# Named entity recognition
def ner(book, part):

    # Tokenize the article into sentences: sentences
    sentences = sent_tokenize(book)

    # Tokenize each sentence into words: token_sentences
    token_sentences = [word_tokenize(sent) for sent in sentences]

    # Tag each tokenized sentence into parts of speech: pos_sentences
    pos_sentences = [pos_tag(sent) for sent in token_sentences] 

    # Create the named entity chunks: chunked_sentences
    chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=True)
    
    result = dict()
    # Test for stems of the tree with 'part' tags
    for sent in chunked_sentences:
        for chunk in sent:
            if hasattr(chunk, "label") and chunk.label() == part:
                result[str(chunk)] = result.get(str(chunk), 0) + 1
    return result            


In [91]:
ubik_ner_dict = ner(ubik, 'NE')
ubik_ner_list = [(w, ubik_ner_dict[w]) for w in sorted(ubik_ner_dict, key=ubik_ner_dict.get, reverse=True)]
print('Top 15 proper nouns from Ubik: \n')
ubik_ner_list[:15]

Top 15 proper nouns from Ubik: 



[('(NE Joe/NNP)', 617),
 ('(NE Runciter/NNP)', 205),
 ('(NE Al/NNP)', 115),
 ('(NE Pat/NNP)', 101),
 ('(NE Joe/NNP Chip/NNP)', 72),
 ('(NE Ubik/NNP)', 67),
 ('(NE Denny/NNP)', 52),
 ('(NE Ella/NNP)', 50),
 ('(NE Jory/NNP)', 45),
 ('(NE Hollis/NNP)', 40),
 ('(NE Glen/NNP Runciter/NNP)', 38),
 ('(NE New/NNP York/NNP)', 38),
 ('(NE Wendy/NNP)', 36),
 ('(NE Mr./NNP Runciter/NNP)', 35),
 ('(NE Don/NNP Denny/NNP)', 35)]

In [84]:
valis_ner_dict = ner(valis, 'NE')
valis_ner_list = [(w, valis_ner_dict[w]) for w in sorted(valis_ner_dict, key=valis_ner_dict.get, reverse=True)]
print('Top 15 proper nouns from Valis: \n')
valis_ner_list[:15]

Top 15 proper nouns from Valis: 



[('(NE Fat/NNP)', 816),
 ('(NE Kevin/NNP)', 343),
 ('(NE Sherri/NNP)', 173),
 ('(NE David/NNP)', 106),
 ('(NE Gloria/NNP)', 91),
 ('(NE Mini/NNP)', 75),
 ('(NE VALIS/NNP)', 69),
 ('(NE Sophia/NNP)', 63),
 ('(NE Linda/NNP)', 59),
 ('(NE God/NNP)', 56),
 ('(NE Horselover/NNP Fat/NNP)', 56),
 ('(NE Empire/NNP)', 55),
 ('(NE Savior/NNP)', 52),
 ('(NE Christ/NNP)', 47),
 ('(NE Zebra/NNP)', 40)]

In [85]:
radio_ner_dict = ner(radio, 'NE')
radio_ner_list = [(w, radio_ner_dict[w]) for w in sorted(radio_ner_dict, key=radio_ner_dict.get, reverse=True)]
print('Top 15 proper nouns from A Maze of Death: \n')
radio_ner_list[:15]

Top 15 proper nouns from A Maze of Death: 



[('(NE Nicholas/NNP)', 349),
 ('(NE Rachel/NNP)', 126),
 ('(NE Sadassa/NNP)', 97),
 ('(NE Berkeley/NNP)', 94),
 ('(NE Phil/NNP)', 84),
 ('(NE Aramchek/NNP)', 66),
 ('(NE Valis/NNP)', 66),
 ('(NE Ferris/NNP Fremont/NNP)', 57),
 ('(NE Fremont/NNP)', 51),
 ('(NE Vivian/NNP)', 44),
 ('(NE Johnny/NNP)', 37),
 ('(NE Party/NNP)', 31),
 ('(NE Orange/NNP County/NNP)', 31),
 ('(NE Leon/NNP)', 30),
 ('(NE FAP/NNP)', 29)]

In [88]:
maze_ner_dict = ner(maze, 'NE')
maze_ner_list = [(w, maze_ner_dict[w]) for w in sorted(maze_ner_dict, key=maze_ner_dict.get, reverse=True)]
print('Top 15 proper nouns from Radio Free Albemuth: \n')
maze_ner_list[:15]

Top 15 proper nouns from Radio Free Albemuth: 



[('(NE Seth/NNP Morley/NNP)', 201),
 ('(NE Russell/NNP)', 118),
 ('(NE Babble/NNP)', 117),
 ('(NE Belsnor/NNP)', 95),
 ('(NE Morley/NNP)', 74),
 ('(NE Frazer/NNP)', 63),
 ('(NE Maggie/NNP Walsh/NNP)', 55),
 ('(NE Wade/NNP Frazer/NNP)', 42),
 ('(NE Form/NNP Destroyer/NNP)', 37),
 ('(NE Babble/JJ)', 37),
 ('(NE Thugg/NNP)', 31),
 ('(NE Glen/NNP Belsnor/NNP)', 30),
 ('(NE Ignatz/NNP Thugg/NNP)', 29),
 ('(NE Susie/NNP)', 28),
 ('(NE Maggie/NNP)', 28)]

In [90]:
palmer_ner_dict = ner(palmer, 'NE')
palmer_ner_list = [(w, palmer_ner_dict[w]) for w in sorted(palmer_ner_dict, key=palmer_ner_dict.get, reverse=True)]
print('Top 15 proper nouns from The Three Stigmata of Palmer Eldritch: \n')
palmer_ner_list[:15]

Top 15 proper nouns from The Three Stigmata of Palmer Eldritch: 



[('(NE Barney/NNP)', 299),
 ('(NE Leo/NNP)', 166),
 ('(NE Eldritch/NNP)', 143),
 ('(NE Palmer/NNP Eldritch/NNP)', 131),
 ('(NE Anne/NNP)', 60),
 ('(NE Emily/NNP)', 58),
 ('(NE Mars/NNP)', 56),
 ('(NE Mayerson/NNP)', 48),
 ('(NE Barney/NNP Mayerson/NNP)', 47),
 ('(NE Leo/NNP Bulero/NNP)', 44),
 ('(NE Mr./NNP Mayerson/NNP)', 37),
 ('(NE Norm/NNP Schein/NNP)', 35),
 ('(NE Perky/NNP Pat/NNP)', 33),
 ('(NE Palmer/NNP)', 33),
 ('(NE Hnatt/NNP)', 32)]

In [123]:
# Common proper names between books
# Comment out to check different combinations

common_names = set(array(ubik_ner_list)[:,0]).intersection(
                                                            set(array(valis_ner_list)[:,0]),
                                                            set(array(radio_ner_list)[:,0]),
#                                                             set(array(maze_ner_list)[:,0]),
#                                                             set(array(palmer_ner_list)[:,0])
                                                          )
print(common_names)

{'(NE God/NNP)', '(NE Jesus/NNP)', '(NE None/NN)', '(NE Soviet/NNP Union/NNP)', '(NE Half/NN)', '(NE Nobody/NN)', '(NE Ferris/NNP)', '(NE Anyhow/NNP)', '(NE Okay/NNP)', '(NE German/JJ)', '(NE Christ/NN)', '(NE America/NNP)', '(NE United/NNP States/NNPS)', '(NE American/JJ)', '(NE THE/NNP)', '(NE Japan/NNP)', '(NE Germany/NNP)', '(NE Earth/NNP)', '(NE French/JJ)', '(NE San/NNP Francisco/NNP)', '(NE China/NNP)', '(NE New/NNP York/NNP)', '(NE Nothing/NN)', '(NE Christ/NNP)', '(NE Hebrew/NNP)', '(NE Someone/NN)', '(NE Please/NNP)', '(NE Philip/NNP)', '(NE No/DT)', '(NE Cleveland/NNP)', '(NE Latin/NNP)', '(NE Silence/NN)', '(NE Sorry/NNP)'}


In [171]:
lists_names = [ubik_ner_list, valis_ner_list, radio_ner_list, maze_ner_list, palmer_ner_list]
titles = ['ubik', 'valis', 'maze', 'radio', 'palmer']

score = list()
name = list()

for i, j in list(itertools.combinations(lists_names,2)):
    score.append((len(set(array(i)[:,0]).intersection(set(array(j)[:,0])))))       
    
for i, j in list(itertools.combinations(titles,2)):
    name.append(i + ' + ' + j)      

books_similiarity = zip(name,score)
print(sorted(tuple(books_similiarity), key=lambda tup: tup[1], reverse=True))

[('valis + maze', 148), ('ubik + palmer', 68), ('ubik + maze', 66), ('maze + palmer', 65), ('ubik + valis', 62), ('valis + palmer', 61), ('valis + radio', 56), ('radio + palmer', 51), ('ubik + radio', 49), ('maze + radio', 49)]
