In [None]:
import re
import string
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics import pairwise_distances
%matplotlib inline

In [None]:
# Load the pickled DataFrame of documents (only unpickle files from a trusted source).
with open('all_books_df.pkl', 'rb') as pickle_in:
    df = pkl.load(pickle_in)

In [None]:
# Document texts as a plain Python list, in DataFrame row order.
all_books = df['documents'].tolist()

In [None]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    """
    Prints every topic of a fitted decomposition model together with its
    highest-weighted words, and returns those words.

    Parameters:
    model (sklearn.decomposition): fitted model exposing ``components_``
                                   (e.g. TruncatedSVD or NMF)
    feature_names (sequence): vocabulary terms, indexed by feature column
                              (e.g. the vectorizer's feature names)
    num_top_words (int): number of words to display with each topic
    topic_names (list): optional custom topic names (str), indexed by topic;
                        a missing, empty, or out-of-range name falls back to
                        printing the topic number

    Returns:
    topic_words (list): one single-element list per topic, holding that topic's
                        top words joined into one comma-separated string
    """
    topic_words = []
    for ix, topic in enumerate(model.components_):
        # Guard the lookup so a name list shorter than the number of topics
        # degrades to numbered headings instead of raising IndexError.
        name = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if name:
            print(f"Topic: {name}")
        else:
            print("Topic ", ix)

        # Rank once (highest weight first) and reuse the result for both the
        # printed line and the returned value.
        top_words = ", ".join(feature_names[i]
                              for i in topic.argsort()[:-num_top_words - 1:-1])
        print(top_words)
        topic_words.append([top_words])
    return topic_words

In [None]:
# NLTK's English stop words followed by extra terms to exclude from the vocabulary.
custom_stopwords = [
    *stopwords.words('english'),
    'date', 'pron', 'google', 'understandingrelationships',
    'corey', 'wayne',
    'man', 'male', 'males', 'masculine',
    'feminine', 'female', 'females', 'woman', 'men', 'women',
    'love', 'girl', 'boy', 'wife', 'husband',
    'chapter', 'qxp', 'orange',
    'guy', 'boyfriend', 'girlfriend',
    'three', 'five', 'four',
    'wil', 'al', 'bounddate', 'texte',
    'half', 'island', 'wave', 'spouse',
]

In [None]:
# Vectorize documents using TF-IDF, dropping the custom stop words and any term
# that appears in more than 6% of the documents (max_df=0.06).
tfidf = TfidfVectorizer(stop_words=custom_stopwords, max_df=0.06)
# fit_transform learns the vocabulary and builds the matrix in one pass instead
# of tokenizing the whole corpus twice (fit, then transform).
X_tfidf = tfidf.fit_transform(all_books)

# Topic Modeling with Non-negative Matrix Factorization

In [None]:
##### load pre-trained model #####
# NOTE(review): absolute, machine-specific path; only unpickle trusted files.
with open('/Users/winstonma4/Metis/project4/all_model.pkl', 'rb') as model_in:
    nmf_model = pkl.load(model_in)

In [None]:
##### train new model #####
# NOTE(review): running this cell replaces any `nmf_model` loaded from disk. The
# custom topic names defined further down were written for the pre-trained model,
# so they must be re-checked against the new topics after retraining.
# A fixed random_state keeps the factorization reproducible between runs.
nmf_model = NMF(n_components=18, random_state=42)
nmf_model.fit(X_tfidf)

In [None]:
# Document-topic weights: one row per document, one column per NMF component.
topics_nmf = nmf_model.transform(X=X_tfidf)

In [None]:
# custom named topics from pre-trained model
# (position in the list = index of the model component the name describes)
topics = [
    'Long-Term Commitment',
    'Initial Contact',
    'Boundaries',
    'Behavioral Psychology',
    'Life Appreciation',
    'Attractive Behavior',
    'Phases of Dating',
    'Meeting People',
    'Online Dating',
    'Flirting and Escalation',
    'Communication Styles',
    'Heartbreak',
    'Dreaming Big',
    'Science of Seduction',
    'Primal Instincts',
    'Family',
    'Personality Types',
    'Being Alpha',
]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Print the 10 highest-weighted words of every topic under its custom name.
topic_words = display_topics(nmf_model, feature_names, 10, topics)

In [None]:
# Label every document with the name of its highest-weighted topic.
df['topic'] = list(map(topics.__getitem__, topics_nmf.argmax(axis=1)))

In [None]:
# Persist the fitted NMF model.
# NOTE(review): absolute, machine-specific path.
with open('/Users/winstonma4/Metis/project4/nmf_model.pkl', 'wb') as model_out:
    pkl.dump(nmf_model, model_out)

In [None]:
# Persist the documents DataFrame, now including the 'topic' column.
# NOTE(review): absolute, machine-specific path.
with open('/Users/winstonma4/Metis/project4/df.pkl', 'wb') as frame_out:
    pkl.dump(df, frame_out)

# Flask App Files
This section aims to create a DataFrame containing book titles and their respective vectors for the 18 topics. <i>books_df.pkl</i> is the output file which is used in the Flask app.

In [None]:
# One column per NMF component, named topic_0 ... topic_{k-1}. The count is taken
# from the matrix itself so it always matches the model's number of topics.
topics_df = pd.DataFrame(
    topics_nmf,
    columns=[f'topic_{i}' for i in range(topics_nmf.shape[1])],
)

In [None]:
# Replace the index with a fresh 0..n-1 range so rows line up positionally with
# the topic matrix. drop=True discards the old index directly, which also works
# when the index is named or a column called 'index' already exists.
df = df.reset_index(drop=True)

In [None]:
# Attach the per-document topic weights to the document metadata.
# Only the topic_N columns are taken from topics_df, so re-running this cell does
# not stack another copy of df's columns onto an already-combined frame.
topics_df = pd.concat([df, topics_df.filter(regex=r'^topic_\d+$')], axis=1)

In [None]:
# Sum the topic weights of all documents that belong to the same book.
book_vectors = topics_df.groupby('book')[[f'topic_{i}' for i in range(18)]].sum()

In [None]:
# Total topic weight per book (row sums), used to normalize each book's vector.
total_vector = book_vectors.values.sum(axis=1)

In [None]:
# Normalize each book's topic weights so every row sums to 1. Broadcasting the
# row totals as a column vector avoids materializing a repeated matrix and does
# not depend on a hard-coded number of topics.
prob_vectors = book_vectors.values / total_vector[:, np.newaxis]

In [None]:
# Wrap the normalized vectors in a DataFrame, reusing the row and column labels of
# book_vectors so the labels cannot drift from the data they describe.
book_vectors_prob = pd.DataFrame(prob_vectors,
                                 index=book_vectors.index,
                                 columns=book_vectors.columns)

In [None]:
# One row per (book, audience) pair with the minimum author_gender value in that group.
hue_by_gender_df = (
    df
    .groupby(['book', 'audience'])['author_gender']
    .min()
    .reset_index()
)

In [None]:
# Attach audience and author gender to each book's normalized topic vector.
books_df = pd.merge(book_vectors_prob, hue_by_gender_df, on='book', how='left')

In [None]:
# Write the per-book table to the Flask app's directory.
# NOTE(review): absolute, machine-specific path.
with open('/Users/winstonma4/Metis/project4/flask_app/books_df.pkl', 'wb') as books_out:
    pkl.dump(books_df, books_out)

# Topic Modeling with LDA
Included here as it was attempted, but results were surprisingly not as good.

In [None]:
from gensim import corpora, models, similarities, matutils

In [None]:
def get_lda_model(data, vectorizer, num_topics=10, passes=5, random_state=None):
    """
    Fits a gensim LDA model on a document-term matrix produced by a fitted
    sklearn vectorizer.

    Parameters:
    data (sparse matrix): document-term matrix with documents in rows
    vectorizer (sklearn vectorizer): fitted vectorizer whose ``vocabulary_``
                                     maps each term to its column index in data
    num_topics (int): number of topics to extract
    passes (int): number of passes over the corpus during training
    random_state (int or None): optional seed forwarded to LdaModel so results
                                can be reproduced; None keeps the previous
                                unseeded behavior

    Returns:
    lda (gensim.models.LdaModel): the trained model
    """
    # Sparse2Corpus reads documents from columns by default, so flip the
    # documents-in-rows matrix first.
    corpus = matutils.Sparse2Corpus(data.transpose())
    # gensim needs index -> term; sklearn stores term -> index.
    id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word,
                          passes=passes, random_state=random_state)
    return lda

In [None]:
# Fit LDA with the helper's default settings.
# NOTE(review): the TF-IDF matrix is passed here; LDA is usually fit on raw term
# counts, which may be worth trying given the weaker results - confirm.
model = get_lda_model(data=X_tfidf, vectorizer=tfidf)

In [None]:
# Display the topics of the fitted LDA model for manual inspection.
model.print_topics()

# TSNE Plot
Used for 2D visualizations of the 18 topics.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Keep only the 18 topic-weight columns as the input for t-SNE.
X_tsne = topics_df[[f'topic_{i}' for i in range(18)]]

In [None]:
# Project the topic vectors to 2-D using cosine distance. t-SNE is stochastic, so
# a fixed random_state keeps the layout (and the exported CSVs) reproducible.
X_embedded = TSNE(n_components=2, metric='cosine', random_state=42).fit_transform(X_tsne)

In [None]:
# Sanity check: expect (number of rows in X_tsne, 2), since n_components=2.
X_embedded.shape

### Colored by author gender

In [None]:
plt.figure(figsize=(22,16))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form works on older releases as well.
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1],
                hue=topics_df['author_gender'], palette='Paired')
# Place the legend just outside the right edge of the axes.
plt.legend(loc=(1.04, 0), fontsize=30, markerscale=5)

### Separated by all 18 topics to CSV

In [None]:
# 2-D t-SNE coordinates paired with each document's assigned topic name.
tsne_df = pd.DataFrame(dict(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    Topic=df['topic'],
))

In [None]:
# Export for plotting; the row index is written as the first, unnamed column.
# NOTE(review): absolute, machine-specific path.
tsne_df.to_csv(path_or_buf='/Users/winstonma4/Metis/project4/tsne.csv')

### Separated by author gender to CSV

In [None]:
# 2-D t-SNE coordinates paired with each document's author gender.
author_tsne_df = pd.DataFrame(dict(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    Author=df['author_gender'],
))

In [None]:
# Export for plotting; the row index is written as the first, unnamed column.
# NOTE(review): absolute, machine-specific path.
author_tsne_df.to_csv(path_or_buf='/Users/winstonma4/Metis/project4/author_tsne.csv')