In [27]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK English stop word corpus; the log output below shows this is a no-op once it is cached locally.
nltk.download('stopwords')
# English stop words as returned by NLTK. NOTE: the stopword-removal cell later rebinds this name to a set.
stop_words = stopwords.words('english')
# Tokenizer ('punkt') and WordNet data. Presumably for word_tokenize / WordNetLemmatizer,
# neither of which is called in the active (non-commented) cells of this notebook -- confirm before removing.
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re 
import pickle
import gensim, logging
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel, ldamodel
import psycopg2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marieskoczylas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marieskoczylas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/marieskoczylas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Connect to the local "rolemodel" PostgreSQL database and open the cursor
# that the later query and update cells reuse.
conn = psycopg2.connect(
    host="localhost",
    dbname="rolemodel",
    user="marieskoczylas",
    port=5432,
)
cur = conn.cursor()
conn  # bare expression so the notebook renders the connection status

<connection object at 0x1a252f2190; dsn: 'host=localhost dbname=rolemodel user=marieskoczylas port=5432', closed: 0>

In [29]:
# Load every (profile_id, profile) row into memory for the preprocessing cells.
profile_query = "SELECT profile_id, profile FROM profiles"
cur.execute(profile_query)
biglist = cur.fetchall()
print(biglist)



In [30]:
# Lowercase the profile text

# Lowercase each profile's text while keeping it paired with its id.
# A NULL `profile` column comes back from the database as None, which has no
# .lower(); treat it as empty text so a single blank row cannot abort the run.
# An empty string simply yields no tokens in the cells that follow.
biglistlower = [
    (profile_id, (profile_text or "").lower())
    for profile_id, profile_text in biglist
]

print(biglistlower)



In [31]:
# Tokenize (and remove punctuation and new lines [eventually do this part])

from nltk.tokenize import RegexpTokenizer
# Keep only runs of word characters; punctuation and whitespace (including
# newlines) never match the pattern, so they are dropped here.
re_tokenizer = RegexpTokenizer(r'\w+')

biglisttokens = [
    (entry[0], re_tokenizer.tokenize(entry[1]))
    for entry in biglistlower
]

print(biglisttokens)



In [32]:
# Stopword removal

# Rebind stop_words as a set (the imports cell bound the same name to NLTK's raw result)
# so the `not in stop_words` check inside remove_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stopwords(texts):
    """Normalise raw tokens with gensim and drop English stop words.

    Args:
        texts: iterable of raw tokens; each one is passed through ``str``
            before preprocessing.

    Returns:
        list[list[str]]: one inner list per input token that survives,
        holding the sub-tokens ``simple_preprocess`` produced for it.
        Tokens that preprocess to nothing, or to stop words only, are omitted.

    Relies on the module-level ``stop_words`` collection.
    """
    output = []
    for word in texts:
        # simple_preprocess can split one raw token into several sub-tokens
        # (for example around digits), so every sub-token is checked against
        # the stop word list rather than only the first one.
        kept = [token for token in simple_preprocess(str(word)) if token not in stop_words]
        if kept:
            output.append(kept)
    return output

# Run stop word removal over every profile, keeping each id paired with its tokens.
biglistclean = [
    (entry[0], remove_stopwords(entry[1]))
    for entry in biglisttokens
]

print(biglistclean)

#AND ADDING STOPWORDS (for later)
#stopwords = nltk.corpus.stopwords.words('english')
#newStopWords = ['stopWord1','stopWord2']
#stopwords.extend(newStopWords)



In [7]:
# In the future I could add lemmatization to reduce the different inflected forms of a word to a common base form.
# Lemmatization is a more sophisticated process than stemming: it can account for part of speech, meaning, and context within a document or neighboring sentences.
# The draft below still needs fixing, because lemmatize() expects a single string, not a list of strings.

#lemmatizer = WordNetLemmatizer()
#biglistlems = []

#for profiletuple in biglistclean:
#    biglistlems.append((profiletuple[0], lemmatizer.lemmatize(profiletuple[1])))

#print(biglistlems)

In [33]:
# Finally, vectorization: representing the text as a quantitative set of features for subsequent analysis.

# Input: cleanedwords, a list of (id, [[tokens], [tokens], ...]) tuples
# Output: a list of (id, [[(word id, word frequency)], ...], id2word) tuples
def vector_this(cleanedwords):
    """Turn each profile's token lists into a bag-of-words corpus.

    Args:
        cleanedwords: sequence of entries whose item 0 is the profile id and
            whose item 1 is a list of token lists (one "document" each).

    Returns:
        list of ``(profile_id, bow_documents, dictionary)`` tuples, where a
        separate ``corpora.Dictionary`` is built for every profile and
        ``bow_documents`` holds ``dictionary.doc2bow(...)`` of each token list.
    """
    def _vectorize(profile_id, documents):
        # The vocabulary is per profile, so word ids are not comparable across profiles.
        dictionary = corpora.Dictionary(documents)
        bow_documents = [dictionary.doc2bow(document) for document in documents]
        return (profile_id, bow_documents, dictionary)

    return [_vectorize(entry[0], entry[1]) for entry in cleanedwords]

# Vectorize every cleaned profile; each entry is (profile id, bag-of-words documents, Dictionary).
bigcorpus = vector_this(biglistclean)
print(bigcorpus)

[(998, [[(0, 1)], [(1, 1)], [(2, 1)], [(3, 1)], [(4, 1)], [(5, 1)], [(6, 1)], [(1, 1)], [(7, 1)], [(8, 1)], [(9, 1)], [(10, 1)], [(11, 1)], [(12, 1)], [(13, 1)], [(14, 1)], [(15, 1)], [(16, 1)], [(17, 1)], [(18, 1)], [(19, 1)], [(20, 1)], [(21, 1)], [(22, 1)], [(23, 1)], [(24, 1)], [(25, 1)], [(26, 1)], [(27, 1)], [(1, 1)], [(28, 1)], [(29, 1)], [(30, 1)], [(31, 1)], [(32, 1)], [(33, 1)], [(34, 1)], [(35, 1)], [(36, 1)], [(4, 1)], [(37, 1)], [(6, 1)], [(38, 1)], [(39, 1)], [(1, 1)], [(40, 1)], [(41, 1)], [(42, 1)], [(43, 1)], [(44, 1)], [(45, 1)], [(46, 1)], [(47, 1)], [(30, 1)], [(48, 1)], [(49, 1)], [(50, 1)], [(51, 1)], [(52, 1)], [(26, 1)], [(53, 1)], [(54, 1)], [(55, 1)], [(53, 1)], [(56, 1)], [(57, 1)], [(58, 1)], [(59, 1)], [(60, 1)], [(61, 1)], [(62, 1)], [(63, 1)], [(64, 1)], [(65, 1)], [(66, 1)], [(67, 1)], [(68, 1)], [(69, 1)], [(70, 1)], [(71, 1)], [(72, 1)], [(73, 1)], [(74, 1)], [(69, 1)], [(1, 1)], [(75, 1)], [(76, 1)], [(10, 1)], [(8, 1)], [(77, 1)], [(78, 1)], [(73, 1)

In [34]:
# LDA Model, run on one profile

# Fit LDA on the first profile only, before running it over every profile.
first_profile = bigcorpus[0]
lda_rolemodel = gensim.models.ldamodel.LdaModel(
    corpus=first_profile[1],    # bag-of-words documents of that profile
    id2word=first_profile[2],   # the Dictionary built for that profile
    num_topics=20,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    random_state=100,           # fixed seed so reruns give the same topics
    per_word_topics=True,
)

lda_rolemodel.print_topics()

[(0,
  '0.111*"ny" + 0.066*"institute" + 0.050*"gay" + 0.050*"picture" + 0.050*"external" + 0.050*"links" + 0.050*"transgender" + 0.049*"november" + 0.038*"collection" + 0.019*"publications"'),
 (1,
  '0.075*"minneapolis" + 0.061*"biographical" + 0.052*"national" + 0.045*"public" + 0.036*"aperture" + 0.033*"dallas" + 0.032*"germany" + 0.027*"texas" + 0.024*"permanent" + 0.022*"commire"'),
 (2,
  '0.101*"changing" + 0.066*"photographic" + 0.062*"edited" + 0.057*"jessica" + 0.057*"teicher" + 0.057*"queer" + 0.034*"view" + 0.029*"ref" + 0.027*"thames" + 0.024*"september"'),
 (3,
  '0.169*"photography" + 0.081*"history" + 0.072*"hank" + 0.045*"writing" + 0.040*"valens" + 0.031*"yochelson" + 0.028*"ct" + 0.022*"crown" + 0.020*"exile" + 0.020*"éditions"'),
 (4,
  '0.577*"york" + 0.031*"december" + 0.030*"smithsonian" + 0.023*"portraits" + 0.019*"bernice" + 0.012*"international" + 0.009*"photographe" + 0.008*"book" + 0.003*"quickly" + 0.003*"patterns"'),
 (5,
  '0.227*"women" + 0.072*"haaften

In [42]:
# Running this model on the entire body of text.

# Next, figure out sql query for insertion of rolemodeltopics into profiles "topics" column, that inserts topics into correct id row

def create_profile_topics(profile_corpus, num_topics=20, passes=10, random_state=100):
    """Fit an LDA model to a single profile and return its topics.

    Args:
        profile_corpus: ``(profile_id, bow_documents, id2word)`` as produced
            by ``vector_this``.
        num_topics: number of topics to fit; defaults to 20, the value that
            used to be hard-coded.
        passes: training passes over the corpus; defaults to 10.
        random_state: seed handed to gensim so reruns are repeatable.

    Returns:
        ``(profile_id, topics)`` where ``topics`` is the list returned by
        ``LdaModel.print_topics``, or ``""`` when the profile has no
        documents to model.
    """
    profile_id = profile_corpus[0]
    bow_documents = profile_corpus[1]

    # Nothing to train on (e.g. an empty profile): skip LDA entirely.
    if len(bow_documents) < 1:
        return (profile_id, "")

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=bow_documents,
        id2word=profile_corpus[2],
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,
        chunksize=100,
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )
    # print_topics shows at most 20 topics unless told otherwise, so pass the
    # requested count through to keep larger models from being truncated.
    return (profile_id, lda_model.print_topics(num_topics=num_topics))

# create_profile_topics handles one profile at a time, so it is called once per bigcorpus entry
# rather than being handed the whole list.
profile_topics = [create_profile_topics(profile_corpus) for profile_corpus in bigcorpus]




In [41]:
# Inspect the (profile_id, topics) pairs produced by create_profile_topics.
# NOTE(review): the saved output under this cell does not look like such a list and this cell's
# execution count precedes the cell that builds profile_topics -- it appears stale; re-run to refresh.
print(profile_topics)

998


In [56]:
# Write each profile's topics back to its own row.
# If any UPDATE fails, roll back before re-raising so the connection is not
# left in an aborted transaction (which would make later queries on `cur`
# fail too) and no partial batch is committed.
try:
    for profile_id, topics in profile_topics:
        cur.execute("update profiles set topics = %s where profile_id = %s",
                    (topics, profile_id))
except Exception:
    conn.rollback()
    raise
else:
    conn.commit()

#UPDATE Customers
#SET ContactName = 'Alfred Schmidt', City= 'Frankfurt'
#WHERE CustomerID = 1;

#wiki_wiki = wikipediaapi.Wikipedia('en')

#cat_women_inventors = wiki_wiki.page("Category:Women inventors")

#profiles_api_list = []

#all_profiles = cat_women_inventors.categorymembers

#print(len(all_profiles))

#for name, page in all_profiles.items():
#    info = {
#        "name": name,
#        "url": page.fullurl,
#        "text": page.text,
#    }
#    profiles_api_list.append(info)

#for profiledict in profiles_api_list:
#    cur.execute("INSERT INTO profiles (fullname, hyperlink, profile) VALUES (%s, %s, %s)", 
#               (profiledict["name"], profiledict["url"], profiledict["text"]))
    
#conn.commit()

In [64]:
# Read the topics column back to check what the UPDATE cell stored.
# NOTE(review): the saved output shows each value as a heavily escaped string form of the
# list of (topic id, terms) tuples; consider serialising explicitly before storing -- confirm the column type first.
cur.execute('select topics from profiles')
print(cur.fetchall())

[('{"(0,\\"0.197*\\"\\"raphael\\"\\" + 0.068*\\"\\"colonel\\"\\" + 0.033*\\"\\"ludwig\\"\\" + 0.030*\\"\\"family\\"\\" + 0.017*\\"\\"breeches\\"\\" + 0.017*\\"\\"wear\\"\\" + 0.017*\\"\\"blessing\\"\\" + 0.017*\\"\\"fur\\"\\" + 0.017*\\"\\"clothing\\"\\" + 0.017*\\"\\"historic\\"\\"\\")","(1,\\"0.071*\\"\\"general\\"\\" + 0.036*\\"\\"order\\"\\" + 0.036*\\"\\"dead\\"\\" + 0.036*\\"\\"commemorating\\"\\" + 0.036*\\"\\"valued\\"\\" + 0.036*\\"\\"dedicated\\"\\" + 0.036*\\"\\"oceanic\\"\\" + 0.036*\\"\\"ontario\\"\\" + 0.036*\\"\\"number\\"\\" + 0.029*\\"\\"th\\"\\"\\")","(2,\\"0.252*\\"\\"lady\\"\\" + 0.064*\\"\\"crew\\"\\" + 0.052*\\"\\"aviation\\"\\" + 0.043*\\"\\"brothers\\"\\" + 0.022*\\"\\"headed\\"\\" + 0.022*\\"\\"wahlstatt\\"\\" + 0.022*\\"\\"communications\\"\\" + 0.022*\\"\\"von\\"\\" + 0.022*\\"\\"plunged\\"\\" + 0.022*\\"\\"remaining\\"\\"\\")","(3,\\"0.025*\\"\\"similar\\"\\" + 0.025*\\"\\"consisted\\"\\" + 0.025*\\"\\"discarded\\"\\" + 0.025*\\"\\"occupants\\"\\" + 0.025*\\