In [190]:
import gensim.downloader as api
import pandas as pd
import os
import nltk

from gensim.matutils import softcossim
from gensim.test.utils import common_texts
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from nltk.corpus import stopwords
from textblob import TextBlob

In [4]:
# Load the pretrained 'glove-wiki-gigaword-300' embedding through gensim's
# downloader API; every similarity score below is computed with this model.
# NOTE(review): presumably downloaded on first use and cached afterwards -- confirm.
model = api.load('glove-wiki-gigaword-300')

In [29]:
# English stop words from NLTK; default_clean() drops any token found in this list.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed -- confirm.
stoplist = stopwords.words('english')

In [176]:
def default_clean(text, stop_words=None):
    '''
    Strip punctuation from text and return its lower-cased alphabetic,
    non-stop-word tokens.

    Args:
        text: raw string to clean.
        stop_words: optional iterable of words to drop. When omitted, the
            notebook-level `stoplist` is used.

    Returns:
        A list of tokens in their original order.
    '''
    # Every character here is deleted outright (not replaced by a space), so
    # "fox's" becomes "foxs". Characters are removed in a single translate()
    # pass, which makes the result independent of iteration order: the old
    # set-based loop contained both a backslash and a backslash+"n" pattern
    # and produced different output depending on which one ran first.
    bad_chars = "@+/'\"\\()?#,.[]%$&;!:*_=}{"
    if stop_words is None:
        stop_words = stoplist
    # A set gives constant-time membership checks for the filter below.
    stop_words = set(stop_words)

    text = text.translate(str.maketrans('', '', bad_chars))
    # split() with no argument breaks on any whitespace, newlines included.
    tokens = text.lower().split()
    return [w for w in tokens if w not in stop_words and w.isalpha()]

In [177]:
def get_doc_text(filename):
    ''' Read the file at filename and return its cleaned contents.

        The file's lines are joined with single spaces and handed to
        default_clean(); whatever default_clean() returns is returned. '''

    # The with-block closes the file on exit, so no explicit close is needed.
    with open(filename, 'r') as handle:
        lines = list(handle)
    return default_clean(' '.join(lines))

In [235]:
# Load the hero and heroine archetype descriptions. For each folder this
# produces two parallel lists: archetype names (file name without '.txt')
# and the cleaned token list returned by get_doc_text().
hero_folder = os.path.join(os.getcwd(), 'data', 'topic_modeling', 'archetypes', 'hero')
heroine_folder = os.path.join(os.getcwd(), 'data', 'topic_modeling', 'archetypes', 'heroine')


def _load_archetype_folder(folder):
    ''' Return (names, texts) for every regular, non-hidden file in folder. '''
    names, texts = [], []
    # Listing order is left exactly as os.listdir() reports it because later
    # cells address these lists by position.
    for entry in os.listdir(folder):
        path = os.path.join(folder, entry)
        # Hidden entries (e.g. .DS_Store) and sub-directories
        # (e.g. .ipynb_checkpoints) are not archetype descriptions.
        if entry.startswith('.') or not os.path.isfile(path):
            continue
        texts.append(get_doc_text(path))
        names.append(entry.replace('.txt',''))
    return names, texts


hero_names, hero_texts = _load_archetype_folder(hero_folder)
heroine_names, heroine_texts = _load_archetype_folder(heroine_folder)

In [236]:
# For every topic file, find the most similar hero and heroine archetype
# (model.n_similarity over the in-vocabulary tokens) and write one CSV row
# per topic.
topic_folder = os.path.join(os.getcwd(), 'data', 'topic_modeling', 'archetypes', 'topics')
topic_rows = []

# `model` comes from api.load(), so the vocabulary is read from it directly;
# going through the `.wv` alias only emits a DeprecationWarning.
vocab = model.vocab

# Out-of-vocabulary tokens are dropped from the archetype texts as well as
# from the topic keywords. This is done once here instead of once per topic.
hero_tokens = [[t for t in text if t in vocab] for text in hero_texts]
heroine_tokens = [[t for t in text if t in vocab] for text in heroine_texts]


def _best_archetype(topic_tokens, names, texts):
    ''' Return (name, score) of the archetype text most similar to topic_tokens.

        The score starts at 0 and is replaced only by a strictly greater
        similarity, so ties keep the earlier archetype and ('', 0) is
        returned when nothing scores above 0. '''
    best_name, best_score = '', 0
    for name, tokens in zip(names, texts):
        # Empty token lists are skipped rather than passed to n_similarity.
        if not topic_tokens or not tokens:
            continue
        score = model.n_similarity(topic_tokens, tokens)
        if score > best_score:
            best_name, best_score = name, score
    return best_name, best_score


for f in os.listdir(topic_folder): # one row per topic file
    file = os.path.join(topic_folder, f)
    # Hidden entries and sub-directories are not topic files.
    if f.startswith('.') or not os.path.isfile(file):
        continue
    topic_text = [t for t in get_doc_text(file) if t in vocab]

    top_hero, top_hero_score = _best_archetype(topic_text, hero_names, hero_tokens)
    top_heroine, top_heroine_score = _best_archetype(topic_text, heroine_names, heroine_tokens)

    row = [f.replace('.txt',''), top_hero, top_hero_score, top_heroine, top_heroine_score, ' '.join(topic_text)]
    topic_rows.append(row)

sim_df = pd.DataFrame(topic_rows, columns=['topic','top_hero','hero_score','top_heroine','heroine_score','topic_keys'])
sim_df.to_csv('data/topic_modeling/archetypes/hero-topic-similarity.csv', index=False)

  import sys


In [263]:
# Load the jungian archetype descriptions into two parallel lists: names
# (file name without '.txt') and cleaned token lists.
# NOTE: `common_texts` rebinds the name imported from gensim.test.utils at
# the top of the notebook; from here on it refers to these archetype texts.
common_texts = []
common_names = []
common_folder = os.path.join(os.getcwd(), 'data', 'topic_modeling', 'archetypes', 'jungian')

# Listing order is left exactly as os.listdir() reports it because a later
# cell addresses common_texts by position.
for f in os.listdir(common_folder): # add all archetype description texts
    file = os.path.join(common_folder, f)
    # Hidden entries (e.g. .DS_Store) and sub-directories
    # (e.g. .ipynb_checkpoints) are not archetype descriptions.
    if f.startswith('.') or not os.path.isfile(file):
        continue
    common_texts.append(get_doc_text(file))
    common_names.append(f.replace('.txt',''))

In [264]:
# For every topic file, find the most similar jungian archetype
# (model.n_similarity over the in-vocabulary tokens) and write one CSV row
# per topic.
topic_folder = os.path.join(os.getcwd(), 'data', 'topic_modeling', 'archetypes', 'topics')
topic_rows = []

# `model` comes from api.load(), so the vocabulary is read from it directly;
# going through the `.wv` alias only emits a DeprecationWarning.
vocab = model.vocab

# The archetype texts do not change between topics, so their vocabulary
# filter is applied once here instead of inside the per-topic loop.
common_tokens = [[t for t in text if t in vocab] for text in common_texts]

for f in os.listdir(topic_folder): # one row per topic file
    file = os.path.join(topic_folder, f)
    # Hidden entries and sub-directories are not topic files.
    if f.startswith('.') or not os.path.isfile(file):
        continue
    topic_text = [t for t in get_doc_text(file) if t in vocab]

    # The score starts at 0 and is replaced only by a strictly greater
    # similarity, so ties keep the earlier archetype.
    top_score = 0
    top_name = ''
    for (name, arch_text) in zip(common_names, common_tokens):
        # Empty token lists are skipped rather than passed to n_similarity.
        if not topic_text or not arch_text:
            continue
        score = model.n_similarity(topic_text, arch_text)
        if score > top_score:
            top_score = score
            top_name = name

    row = [f.replace('.txt',''), top_name, top_score, ' '.join(topic_text)]
    topic_rows.append(row)

sim_df = pd.DataFrame(topic_rows, columns=['topic','top_archetype','score','topic_keys'])
sim_df.to_csv('data/topic_modeling/archetypes/jungian-topic-similarity.csv', index=False)

  import sys
  if sys.path[0] == '':


In [271]:
# Cleaned tokens of topic16.txt, as returned by get_doc_text() (no
# embedding-vocabulary filter is applied here).
# NOTE(review): relies on `topic_folder` set by an earlier cell. The cells
# below use `topic5_words` / `topic20words`, which no visible cell defines.
topic16words = get_doc_text(topic_folder + '/topic16.txt')

In [281]:
# Position of the 'orphan' archetype in common_names; common_texts is filled
# in lockstep with it, so the same position addresses the archetype's tokens.
common_names.index('orphan')

9

In [282]:
# Similarity between the 'orphan' archetype and the topic-5 keywords.
# The archetype is looked up by name rather than by a hard-coded position,
# so the cell keeps working if the directory listing order changes.
orphan_idx = common_names.index('orphan')
# `model` comes from api.load(), so the vocabulary is read from it directly;
# going through the `.wv` alias only emits a DeprecationWarning.
arch_text = [t for t in common_texts[orphan_idx] if t in model.vocab]
# NOTE(review): `topic5_words` is not defined in any visible cell -- it must
# be (re)created before this cell can run on a fresh kernel.
model.n_similarity(arch_text,topic5_words)

  """Entry point for launching an IPython kernel.


0.7694518

In [238]:
# Similarity between the heroine archetype at position 5 and `topic5_words`.
# NOTE(review): `topic5_words` is not defined in any visible cell, and
# heroine_texts is not filtered against the model vocabulary here -- confirm
# that every token is known to the model.
model.n_similarity(heroine_texts[5],topic5_words)

0.762655

In [239]:
# Similarity between the heroine archetype at position 4 and `topic5_words`.
# NOTE(review): `topic5_words` is not defined in any visible cell, and
# heroine_texts is not filtered against the model vocabulary here -- confirm
# that every token is known to the model.
model.n_similarity(heroine_texts[4],topic5_words)

0.7458608

In [260]:
# Inspect the cleaned tokens of the heroine archetype at position 3.
heroine_texts[3]

['nurturer',
 'kind',
 'compassionate',
 'woman',
 'sacrifices',
 'much',
 'order',
 'help',
 'others',
 'particularly',
 'children',
 'feels',
 'responsible',
 'whole',
 'identity',
 'tied',
 'caring',
 'others',
 'controlling',
 'mother',
 'woman',
 'butts',
 'childrens',
 'lives',
 'need',
 'needed',
 'taken',
 'extremes',
 'might',
 'even',
 'kidnap',
 'children',
 'tried',
 'leave']

In [256]:
# Inspect the topic-20 keyword list.
# NOTE(review): `topic20words` is not defined in any visible cell.
topic20words

['men',
 'war',
 'sir',
 'soldiers',
 'army',
 'general',
 'man',
 'colonel',
 'day',
 'children',
 'back',
 'fire',
 'ext',
 'lieutenant',
 'people',
 'enemy',
 'fight',
 'int',
 'boat',
 'soldier']