## Topic Modeling and Visualization

In [1]:
#General Imports
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#The helper functions imported below, and parts of the final visualization code,
#are based on the following repo: https://github.com/juanshishido/okcupid
from utils.nonnegative_matrix_factorization import nmf_labels, nmf_inspect
from utils.splits import subset_df, group_pct
from utils.text_representation import feature_vectors
from utils.lexical_features import *

ModuleNotFoundError: No module named 'utils.spacy_tokenizer'

#### First, we generate the topics and assign some meaning to them

In [None]:
# Load the profile data from a CSV in the current working directory.
# Later cells read the `essay0` column and the demographic columns
# ('height_group', 'race_ethnicity', 'edu', 'fit') from this frame.
df = pd.read_csv('compressed_okcupid.csv')

In [None]:
# Turn the `essay0` text into feature matrices. This is the bulk of the work in
# this notebook and can take some time to run.
# `specs` is handed to feature_vectors as-is -- presumably forwarded to the
# underlying vectorizer; verify against utils.text_representation.
specs = dict(stop_words='english', ngram_range=(1, 3), min_df=0.005)
counts, tfidf, vocab = feature_vectors(df['essay0'], specs)

In [None]:
# Number of topics; format_df (defined further down) reuses K when it calls nmf_labels.
K = 25
# Inspect the factorization for this single K with n_words=50 -- presumably this
# prints the top 50 terms per topic so they can be labelled by hand (TODO confirm
# against utils.nonnegative_matrix_factorization).
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)

In [None]:
# Hand-assigned topic names, listed in topic-index order (position 0 names topic 0).
# Based on the categories as assessed by Juan Shishido, then modified by me.
labels = [
    'Reach Out!', 'Relocated', 'About Me', 'Hesitation', 'Casual',
    'The City', 'Novelty', 'Cool', 'Likes', 'Passions',
    'Easy Going', 'Region', 'Seeking', 'Thoughts', 'Fun',
    'New Here', 'Travel', 'Self-summary', 'Nots', 'Growing Up',
    'Carpe Diem', 'Good Company', 'Hobbies', 'Cultural Interests', 'Ambitious',
]

# Lookup from topic index to its label.
label_dict = dict(enumerate(labels))
print(label_dict)

#### Next, we find a way of calculating and visualizing these topic distributions across our 4 chosen demographic variables

In [None]:
def split_by_demog(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component of `model`.

    Parameters
    ----------
    model : object with a `components_` attribute
        Each row of `components_` is one group's weights and must support
        `.argsort()` (e.g. a fitted NMF model).

    feature_names : sequence of str
        Term names, indexed by the same positions as the entries of each
        `components_` row.

    n_top_words : int
        How many terms to print per group, highest weight first.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    for group_id, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[pos] for pos in best_first]
        print("Group %d:" % group_id)
        print(" | ".join(top_terms))
        print()
    print()

In [None]:
def get_label(group_num):
    """Return the topic label stored in `label_dict` for index `group_num`.

    Raises KeyError if `group_num` is not a key of `label_dict`.
    """
    label = label_dict[group_num]
    return label

def format_df(df, demog, tfidf):
    """Build a per-topic table of values for each level of one demographic.

    Parameters
    ----------
    df : pandas.DataFrame
        Profile data containing the column named by `demog`.
        NOTE: a 'group' column is written onto this frame in place.

    demog : str
        Name of the demographic column to split on.

    tfidf :
        Feature matrix handed to `nmf_labels` (with k=K) to obtain one topic
        index per row of `df`.

    Returns
    -------
    pandas.DataFrame
        One row per topic 'group', sorted ascending by 'max' (the largest value
        in that row across the demographic levels), with one column per level
        of `demog` plus 'group', 'max' and 'label'.
    """
    # Topic assignment for every row; this mutates the caller's frame.
    df['group'] = nmf_labels(tfidf, k=K)

    # Keep every level of the demographic, then get the per-level table and
    # drop its raw-count columns.
    levels = df[demog].unique()
    pct_long = group_pct(subset_df(df, demog, levels), demog)
    pct_long = pct_long.drop(['count_x', 'count_y'], axis=1)

    # Wide layout: one row per topic, one column per demographic level.
    wide = pct_long.pivot(index='group', columns=demog)
    wide['max_value'] = wide.max(axis=1)
    wide = wide.sort_values(by='max_value', ascending=True)

    # Flatten the two-level column index left by pivot. After dropping the top
    # level the 'max_value' column is named '', so it is renamed to 'max'.
    wide.columns = wide.columns.droplevel(0)
    result = (
        wide.reset_index()
            .rename_axis(None, axis=1)
            .rename(columns={'': 'max'})
    )

    # Attach the human-readable topic name.
    result['label'] = result['group'].apply(get_label)
    return result

In [None]:
# One formatted topic table per demographic variable.
# The right-hand side is wrapped in parentheses so the four-way assignment can
# span several lines; without them the statement ends at the first trailing
# comma and the indented continuation lines do not parse.
height_df, race_df, edu_df, fit_df = (
    format_df(df, 'height_group', tfidf),
    format_df(df, 'race_ethnicity', tfidf),
    format_df(df, 'edu', tfidf),
    format_df(df, 'fit', tfidf),
)

In [None]:
#Plot for Education Levels
ordered_df = edu_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Horizontal guide lines are drawn with hlines, from 0 out to each topic's 'max' value
# seaborn is imported only for the nicer look it gives the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['High School or less'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['More than High School'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Education Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic Most', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# Legend entries use the exact names of the columns plotted above, so each
# colour is described the same way in the data and in the legend.
red_patch = mpatches.Patch(color='red', label='More than High School')
blue_patch = mpatches.Patch(color='blue', label='High School or less')
plt.legend(handles=[red_patch, blue_patch], loc='center right', fontsize='xx-large', borderpad=2)
# NOTE(review): the output name 'opinions.png' does not mention education, unlike
# 'fit.png' / 'height.png' / 'race.png' -- left unchanged; confirm it is intended.
plt.savefig('opinions.png', bbox_inches='tight')


In [None]:
#Plot for Fitness Levels
ordered_df = fit_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Horizontal guide lines are drawn with hlines, from 0 out to each topic's 'max' value
# seaborn is imported only for the nicer look it gives the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['fit'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_fit'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Fitness Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic Most', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# Legend colours must match the markers above: blue is the 'fit' column and
# red is the 'not_fit' column (the labels were previously swapped).
blue_patch = mpatches.Patch(color='blue', label='Fit')
red_patch = mpatches.Patch(color='red', label='Not Fit')
plt.legend(handles=[blue_patch, red_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('fit.png', bbox_inches='tight')

In [None]:
#The Plot for Height
ordered_df = height_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Horizontal guide lines are drawn with hlines, from 0 out to each topic's 'max' value
# seaborn is imported only for the nicer look it gives the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['short'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_short'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Height Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic Most', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# Legend colours must match the markers above: blue is the 'short' column and
# red is the 'not_short' column (the labels were previously swapped).
blue_patch = mpatches.Patch(color='blue', label='Short')
red_patch = mpatches.Patch(color='red', label='Not Short')
plt.legend(handles=[blue_patch, red_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('height.png', bbox_inches='tight')

In [None]:
# The Plot for Races
ordered_df = race_df
my_range = range(1, 1 + len(ordered_df.index))
fig, ax = plt.subplots(figsize=(18, 15))
ax.title.set_position([.5, 1.05])

# seaborn is imported only for the nicer look it gives the plot
import seaborn as sns

# (column in ordered_df, marker colour), in the order the markers are drawn
race_colours = [
    ('White', 'blue'),
    ('Black', 'red'),
    ('Asian', 'green'),
    ('Latinx', 'cyan'),
    ('multiple', 'magenta'),
]

# Gray guide line per topic from 0 out to its 'max' value, then one marker per group
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
for column, colour in race_colours:
    plt.plot(ordered_df[column], my_range, "o", markersize=20, color=colour)
plt.rc('ytick', labelsize=28)
plt.rc('xtick', labelsize=28)

# Titles, axis names and topic names on the y axis
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Racial Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic Most', fontsize=32)
plt.ylabel('Topics Inferred from Essay', fontsize=32)

# Legend lists Black first, then White and the remaining groups
colour_of = dict(race_colours)
legend_order = ['Black', 'White', 'Asian', 'Latinx', 'multiple']
legend_handles = [mpatches.Patch(color=colour_of[name], label=name) for name in legend_order]
plt.legend(handles=legend_handles, loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('race.png', bbox_inches='tight')