# Import

In [1]:
from bs4 import BeautifulSoup

import re
import sys
import string
import json
import random

from datetime import datetime
from dateutil.parser import parse

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pdb
from pymongo import MongoClient
from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateMany, UpdateOne
from pprint import pprint

from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import OrderedDict
from nltk.stem.snowball import SnowballStemmer

from sklearn.decomposition import NMF


import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from gensim.test.utils import common_texts, common_corpus, common_dictionary


# Text Pre-Processing Functions

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
lemmatizer.lemmatize

<bound method WordNetLemmatizer.lemmatize of <WordNetLemmatizer>>

# Load Document Term Matrices

## Load Dataset
R/Polyamory, 2012 and 2019

In [4]:
suffix = 'r_poly_features_stemmed_cv'
filename = '../data/exports/doc_term/r_polyamory_2012and2019/doc_term_'+suffix+'.pkl'

with open(filename, 'rb') as file:
    doc_term = pickle.load(file)
    
with open('../data/exports/doc_term/r_polyamory_2012and2019/date_index.pkl', 'rb') as file:
    doc_dates = pickle.load(file)

In [5]:
# All concatenated / flatted threads of r_polyamory from 2012 and 2019
with open('../data/exports/by_thread/r_poly_2012_and_2019.pkl', 'rb') as file:
    r_poly_threads = pickle.load(file)

# Define Custom Stop Words

In [6]:
swds_reddit_and_web = ([
    'reddit', 'subreddit', 'posts','comments'
    'html', 'com','utm', 'www', 'http',
    'sub', # not sure if this one should be included...
    'medium',
    'deleted', 'delete', 'removed',
    'x200b',
])

swds_relationships = ([
    
])

swds_polyamory_terms = ([
    'polyamory', 'poly'
])

swds_artifacts = ([
    've','don'
])

def custom_stop_words(custom_stop_lists, stop_english=True):
    '''
    
    Takes a list of lists of custom stop words,
    and forms a custom stop words list.
    
    '''
    
    
    swds_custom = []
    
    if stop_english:
        swds_custom.extend(stopwords.words('english'))
    
    
    for wordlist in custom_stop_lists:
#         print(wordlist)
        if len(wordlist)>0:
            swds_custom.extend(wordlist)
    
    return swds_custom

In [46]:
custom_stop_words([swds_reddit_and_web], stop_english=False)

['reddit',
 'subreddit',
 'posts',
 'commentshtml',
 'com',
 'utm',
 'www',
 'http',
 'sub',
 'medium',
 'deleted',
 'delete',
 'removed',
 'x200b']

In [47]:
custom_stop_words([swds_reddit_and_web, swds_artifacts], stop_english=False)

['reddit',
 'subreddit',
 'posts',
 'commentshtml',
 'com',
 'utm',
 'www',
 'http',
 'sub',
 'medium',
 'deleted',
 'delete',
 'removed',
 'x200b',
 've',
 'don']

# NMF

## Load/Select Corpora

In [7]:
df = pd.DataFrame.from_dict(r_poly_threads)
df.columns

Index(['date', 'text_concat'], dtype='object')

In [15]:
df.head(1)

Unnamed: 0,date,text_concat
0,2019-11-05 04:13:35,I’ve seen a couple of posts stating that so...


In [8]:
# Define Corpora
corpora = df['text_concat']

## Fit Model, View Topics

In [10]:
# Define Vectorizer
swds = custom_stop_words([swds_reddit_and_web, swds_artifacts])
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=swds)

# Fit vectorizer to create document-term matrix
doc_term_matrix = tfidf_vect.fit_transform(corpora.values.astype('U'))

In [63]:
# Topic Modeling
nmf = NMF(n_components=10, random_state=42)

nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [12]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['one', 'romantic', 'partner', 'polyamorous', 'months', 'wants', 'opening', 'polyamory', 'needs', 'new', 'end', 'work', 'together', 'monogamous', 'long', 'years', 'want', 'relationships', 'open', 'relationship']


Top 10 words for topic #1:
['others', 'mean', 'everyone', 'someone', 'way', 'non', 'many', 'multiple', 'different', 'polyamorous', 'partners', 'monogamous', 'person', 'like', 'monogamy', 'think', 'one', 'relationships', 'polyamory', 'people']


Top 10 words for topic #2:
['great', 'one', 'first', 'bf', 'good', 'told', 'gf', 'cute', 'night', 'friends', 'together', 'went', 'friend', 'got', 'like', 'really', 'us', 'girlfriend', 'happy', 'boyfriend']


Top 10 words for topic #3:
['condom', 'use', 'physical', 'romantic', 'intimate', 'asexual', 'oral', 'libido', 'casual', 'desire', 'get', 'drive', 'tested', 'intimacy', 'sexually', 'risk', 'partners', 'condoms', 'sexual', 'sex']


Top 10 words for topic #4:
['nre', 'feel', 'like', 'much', 'make', 'days', '

In [28]:
len(tfidf_vect.get_feature_names())

37788

In [29]:
len(nmf.components_[0])

37788

In [64]:
topics_df = pd.DataFrame()
topic_dict = {}

topics_list = []

for i,component in enumerate(nmf.components_):
#     print(f'Top 10 words for topic #{i}:')
    top_words = [(tfidf_vect.get_feature_names()[i], round(component[i], 3)) for i in reversed(component.argsort()[-20:])]
#     print(top_words)
#     print('\n')
    topic_dict[i] = top_words
#     for (word, importance) in top_words:
#         topic_dict[i] += [(word, importance)]

    topics_list += [[i, word, importance] for (word, importance) in top_words]

In [66]:
topic_output = pd.DataFrame.from_records(topics_list, columns=['topic', 'word', 'component'])

In [67]:
suffix = 'r_relationships_2012and2019_10_topics.csv'

with open('../data/for_visualization/' + suffix, 'w') as file:
    file.write(topic_output.to_csv())

In [58]:
# Topic Modeling
nmf = NMF(n_components=15, random_state=42)

nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=15, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [59]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['us', 'romantic', 'new', 'would', 'term', 'end', 'person', 'work', 'needs', 'secondary', 'one', 'years', 'monogamous', 'long', 'together', 'primary', 'open', 'want', 'relationships', 'relationship']


Top 10 words for topic #1:
['lot', 'multiple', 'monogamous', 'everyone', 'non', 'way', 'polyamorous', 'someone', 'many', 'friends', 'different', 'partners', 'person', 'monogamy', 'polyamory', 'like', 'think', 'relationships', 'one', 'people']


Top 10 words for topic #2:
['great', 'well', 'told', 'night', 'know', 'gf', 'years', 'first', 'good', 'went', 'together', 'got', 'girlfriend', 'friend', 'like', 'really', 'us', 'friends', 'happy', 'boyfriend']


Top 10 words for topic #3:
['casual', 'men', 'unprotected', 'would', 'libido', 'condom', 'women', 'oral', 'desire', 'want', 'get', 'drive', 'tested', 'intimacy', 'sexually', 'risk', 'partners', 'condoms', 'sexual', 'sex']


Top 10 words for topic #4:
['needs', 'dates', 'make', 'days', 'feel', 'home', 'things', 'd

In [62]:
# Topic Modeling
nmf = NMF(n_components=10, random_state=42)

nmf.fit(doc_term_matrix)

for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

KeyboardInterrupt: 

In [61]:
# Topic Modeling
nmf = NMF(n_components=5, random_state=42)

nmf.fit(doc_term_matrix)

for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['get', 'time', 'talk', 'make', 'something', 'way', 'work', 'going', 'someone', 'things', 'really', 'think', 'know', 'need', 'would', 'feelings', 'feel', 'like', 'want', 'relationship']


Top 10 words for topic #1:
['https', 'like', 'non', 'find', 'someone', 'polyamorous', 'think', 'open', 'love', 'person', 'dating', 'monogamy', 'mono', 'one', 'monogamous', 'polyamory', 'relationships', 'relationship', 'people', 'poly']


Top 10 words for topic #2:
['good', 'first', 'night', 'friends', 'really', 'girlfriend', 'get', 'kids', 'years', 'like', 'one', 'family', 'happy', 'boyfriend', 'together', 'us', 'love', 'time', 'husband', 'wife']


Top 10 words for topic #3:
['man', 'wife', 'sexually', 'risk', 'also', 'someone', 'partners', 'condoms', 'one', 'woman', 'get', 'think', 'want', 'would', 'like', 'people', 'men', 'women', 'sexual', 'sex']


Top 10 words for topic #4:
['nesting', 'someone', 'want', 'feel', 'like', 'need', 'needs', 'person', 'secondary', 'relationsh

In [62]:
# Topic Modeling
nmf = NMF(n_components=25, random_state=42)

nmf.fit(doc_term_matrix)

for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['say', 'tell', 'try', 'change', 'feelings', 'give', 'think', 'go', 'someone', 'may', 'might', 'needs', 'ask', 'talk', 'make', 'know', 'would', 'wants', 'need', 'want']


Top 10 words for topic #1:
['romantic', 'everyone', 'way', 'others', 'life', 'someone', 'many', 'non', 'different', 'multiple', 'partners', 'polyamorous', 'monogamous', 'person', 'think', 'one', 'monogamy', 'relationships', 'polyamory', 'people']


Top 10 words for topic #2:
['sharing', 'wonderful', 'nice', 'glad', 'lovely', 'sweet', 'good', 'hope', 'guys', 'thanks', 'amazing', 'congrats', 'great', 'lol', 'awesome', 'adorable', 'beautiful', 'thank', 'cute', 'happy']


Top 10 words for topic #3:
['partner', 'partners', 'swinging', 'emotional', 'attracted', 'ace', 'enjoy', 'connection', 'physical', 'intimate', 'romantic', 'asexual', 'casual', 'libido', 'desire', 'drive', 'sexually', 'intimacy', 'sexual', 'sex']


Top 10 words for topic #4:
['new', 'home', 'long', 'every', 'spending', 'see', 'w

## Cluster Strength Optimization

In [None]:
# Graph Reconstruction Error
recon_errors = []

for n_components in range(2, 40):

    nmf_model = NMF(n_components=n_components)
    doc_topic = nmf_model.fit_transform(coo.toarray())
    recon_errors += [[n_components, nmf_model.reconstruction_err_]]

df = pd.DataFrame(recon_errors, columns = ['N', 'Reconstruction Error'])
df