### Topic Modeling of Video Descriptions

In [6]:
import os
import json
import pandas as pd
import numpy as np

In [7]:
# Root folder of the collected account JSON files.
# Set the CS315_DATA_DIR environment variable to point somewhere else instead of
# editing this cell; the default below is the original machine-specific location.
# Joining with "" guarantees a trailing separator, so code that appends
# sub-paths by plain string concatenation keeps working.
dir_path = os.path.join(
    os.environ.get(
        "CS315_DATA_DIR",
        "/users/sandyliu/CS315-proj3-group2/1-data_collection/",
    ),
    "",
)

#### Data Cleaning

In [8]:
def get_description(gender):
    """Load the lower-cased video descriptions for one gender group.

    Every file in the group's output folder under ``dir_path`` is parsed as
    JSON; each parsed file is iterated as a collection of videos, and one
    description string is collected per video.

    Parameters
    ----------
    gender : str
        "M" reads from ``output_male/``; "F" reads from ``output_female/``.

    Returns
    -------
    pandas.DataFrame
        A single ``description`` column with one row per video whose
        description could be read.

    Raises
    ------
    ValueError
        If ``gender`` is neither "M" nor "F".
    """
    if gender == "M":
        filepath = "output_male/"
    elif gender == "F":
        filepath = "output_female/"
    else:
        # Printing and carrying on would fail later with an UnboundLocalError
        # on `filepath`, so fail here with a clear message instead.
        raise ValueError(f"Not a valid input: {gender!r} (expected 'M' or 'F').")

    folder = os.path.join(dir_path, filepath)

    all_description = []
    # Sorted so the row order (and anything fitted on it) does not depend on
    # the arbitrary order in which the OS lists the directory.
    for file in sorted(os.listdir(folder)):
        # The context manager closes the file; explicit UTF-8 because the
        # descriptions contain emoji and other non-ASCII characters.
        with open(os.path.join(folder, file), 'r', encoding='utf-8') as f:
            account = json.load(f)

        for video in account:
            try:
                # The description is iterated piece by piece, lower-cased,
                # and concatenated into a single string.
                vid_desc = "".join(part.lower() for part in video["description"])
            except (KeyError, TypeError, AttributeError):
                # Best effort: skip videos whose description is missing or malformed.
                continue
            all_description.append(vid_desc)

            # NOTE: video["comments"] is not collected here.

    return pd.DataFrame(data=all_description, columns=["description"])

# Build one frame per group; the generator handles "M" first, then "F".
df_M, df_F = (get_description(group) for group in ("M", "F"))

# Each row of the printed Series is the description of one video.
print(df_F["description"])

# Possible future work: repeat the analysis on the video comments.

0      we all know who the grand puppet master is in ...
1      if the majority wants to talk about dark money...
2      the party of ‘law and order” 🤔lol, they litera...
3      no one should have the right to tell women wha...
4      i ran for congress because i was sick of polit...
                             ...                        
441                                       we vote within
442    as long as you’re in line by 8pm the polls mus...
443              be like yama and make your voice heard!
444    don’t mail your ballot, drop it off at a dropb...
445    be sure to fill out your entire ballot! if you...
Name: description, Length: 446, dtype: object


#### Converting the corpus into a document-term matrix (DTM)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def to_dtm(words):
    """Vectorize a corpus of documents into term counts.

    Parameters
    ----------
    words : iterable of str
        The documents to vectorize.

    Returns
    -------
    tuple
        ``(dtm, feature_names)``: the document-term matrix produced by
        ``CountVectorizer.fit_transform`` and the vocabulary from
        ``get_feature_names_out()``.
    """
    vectorizer_options = {
        "strip_accents": 'unicode',
        "stop_words": 'english',
        "lowercase": True,
        # Keep only tokens made purely of letters and at least 3 characters long.
        "token_pattern": r'\b[a-zA-Z]{3,}\b',
    }
    count_vectorizer = CountVectorizer(**vectorizer_options)

    # Learn the vocabulary and count the terms in one pass.
    term_counts = count_vectorizer.fit_transform(words)
    return term_counts, count_vectorizer.get_feature_names_out()

# Each name holds the (document-term matrix, feature names) pair returned by to_dtm.
dtm_F, dtm_M = (to_dtm(frame["description"]) for frame in (df_F, df_M))

In [147]:
def matrix2Doc(dtMatrix, features, index):
    """Return the terms that occur in one document of the document-term matrix.

    Row ``index`` of ``dtMatrix`` is densified, and every column holding a
    non-zero count is mapped to its entry in ``features``.
    """
    dense_row = dtMatrix.getrow(index).toarray()
    # nonzero() on the 1 x n_terms array yields (row indices, column indices);
    # only the column indices identify terms.
    _, term_columns = dense_row.nonzero()
    return [features[column] for column in term_columns]

def convert_to_word(df):
    """Turn every description in ``df`` into the list of terms it contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description`` column of documents.

    Returns
    -------
    list of list
        One list of terms per row of the document-term matrix, in row order.
    """
    # Vectorize once and unpack both results; calling to_dtm separately for
    # each tuple element would fit the vectorizer on the same corpus twice.
    dtm, feature_names = to_dtm(df["description"])
    return [matrix2Doc(dtm, feature_names, i) for i in range(dtm.shape[0])]

# Let long descriptions and term lists render up to 1000 characters per cell.
pd.set_option("display.max_colwidth", 1000)

# Attach the per-video term lists to both frames.
df_F["terms"], df_M["terms"] = convert_to_word(df_F), convert_to_word(df_M)

In [149]:
df_F # one row per video of the female politicians: the description plus its extracted terms

Unnamed: 0,description,terms
0,we all know who the grand puppet master is in this impeachment. the long arm (but small hands) of donald trump and his fingerprints are all over this hearing and gop shut down.,"[arm, donald, fingerprints, gop, grand, hands, hearing, impeachment, know, long, master, puppet, shut, small, trump]"
1,"if the majority wants to talk about dark money and activist courts, i am so here for it.so, let’s talk about how right wing organizations bankroll judicial decisions to undermine our fundamental rights and judicial system.","[activist, bankroll, courts, dark, decisions, fundamental, judicial, let, majority, money, organizations, right, rights, talk, undermine, wants, wing]"
2,"the party of ‘law and order” 🤔lol, they literally have a history of interfering in criminal investigations for political gain.today’s oversight hearing is purely a political stunt. period.","[criminal, gain, hearing, history, interfering, investigations, law, literally, lol, order, oversight, party, period, political, purely, stunt, today]"
3,"no one should have the right to tell women what we can and can't do with our bodies—including the supreme court.right now scotus is considering whether or not women should have access to life saving medications like mifepristone. the law must follow the science and protect our rights, not a political agenda.","[access, agenda, bodies, considering, court, follow, including, law, life, like, medications, mifepristone, political, protect, right, rights, saving, science, scotus, supreme, tell, women]"
4,"i ran for congress because i was sick of politicians telling communities they would help them, only to turn around and use them for political theater.we’ve known for years of the behavioral health and public safety crises ravaging our tribes and pueblos, and it’s time we actually listen to tribal leaders and let them lead.","[actually, behavioral, communities, congress, crises, health, help, known, lead, leaders, let, listen, political, politicians, public, pueblos, ran, ravaging, safety, sick, telling, theater, time, tribal, tribes, turn, use, years]"
...,...,...
429,we vote within,[vote]
430,as long as you’re in line by 8pm the polls must let you vote!,"[let, line, long, polls, vote]"
431,be like yama and make your voice heard!,"[heard, like, make, voice, yama]"
432,"don’t mail your ballot, drop it off at a dropbox! if you encounter any problem while voting call the mdp election protection hotline at 833-648-6837","[ballot, don, drop, dropbox, election, encounter, hotline, mail, mdp, problem, protection, voting]"


In [150]:
df_M # one row per video of the male politicians: the description plus its extracted terms

Unnamed: 0,description,terms
0,do you know what drives me nuts about what’s going on inwith thefunding?some of my colleagues are echoingstate,"[colleagues, drives, echoingstate, going, inwith, know, nuts, thefunding]"
1,totalof my heart ❤️👨‍👩‍👧‍👦,"[heart, totalof]"
2,icymi: i dive into the momentwalked into theand’s appalling response to the speech.check it out and check back weekly for new video content.,"[appalling, check, content, dive, icymi, momentwalked, new, response, speech, theand, video, weekly]"
3,re: joe biden’s stutter,"[biden, joe, stutter]"
4,"anmessage from my mom.mom was one of the first women to be a tv “newsreader” in the 70’s. it was a time when women weren’t welcomed in the newsroom, but she helped put a big crack in that glass ceiling.","[anmessage, big, ceiling, crack, glass, helped, mom, newsreader, newsroom, time, welcomed, weren, women]"
...,...,...
908,🚨teaser alert🚨: medicare advantage is one of the largest scams in the american healthcare industry. full video drops tomorrow! (sound on),"[advantage, alert, american, drops, healthcare, industry, largest, medicare, scams, sound, teaser, tomorrow, video]"
909,step 1: strengthen medicare. step 2: medicare for all.,"[medicare, step, strengthen]"
910,"the topic of medicare advantage is deeply personal to me. in her late 80s, my mom lived in an assisted living facility. years earlier, a friendly salesperson told her about all the money she could save on prescription drugs by switching to medicare advantage. she was enamored.","[advantage, assisted, deeply, drugs, earlier, enamored, facility, friendly, late, lived, living, medicare, mom, money, personal, prescription, salesperson, save, switching, told, topic, years]"
911,question: what would you call the ex-president? here’s my response.,"[president, question, response]"


#### Fit LDA model & see topics

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [151]:
def fit_lda(df, n_components=15, random_state=0):
    """Fit an LDA topic model on the descriptions in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description`` column of documents.
    n_components : int, default 15
        Number of topics. The default was picked arbitrarily.
    random_state : int, default 0
        Seed passed to the model so repeated fits give the same topics.

    Returns
    -------
    tuple
        ``(lda, doc_topic_dist)``: the fitted model and the result of
        transforming the same document-term matrix with it (one row per
        document, one column per topic).
    """
    dtm = to_dtm(df["description"])[0]
    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(dtm)
    doc_topic_dist = lda.transform(dtm)
    return lda, doc_topic_dist

def display_topics(model, features, no_top_words):
    """Print each topic of ``model`` followed by its highest-weighted terms.

    For every row of ``model.components_`` a "Topic <n>:" line is printed,
    then one line with the ``no_top_words`` top terms separated by spaces.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it ([::-1]) and keep the leading entries.
        strongest_first = weights.argsort()[::-1]
        top_terms = [features[term_idx] for term_idx in strongest_first[:no_top_words]]
        print(f"Topic {topic_number}:")
        print(" ".join(top_terms))

In [155]:
# Topics for male politicians: the fitted model comes from fit_lda, the vocabulary from dtm_M.
display_topics(model=fit_lda(df_M)[0], features=dtm_M[1], no_top_words=15)

Topic 0:
senator border texas ted cruz help right politics senate time need office join bipartisan doing
Topic 1:
ted cruz women abortion texans freedom texas reproductive congress right know house need care restore
Topic 2:
republican congress let black like today republicans house just lives way issue act history common
Topic 3:
working class country people stand got fight today republican government workers make build corporate want
Topic 4:
rep jackson jeff state speaker time trump government congress mind vote shutdown workers need change
Topic 5:
texas ted cruz election make like work health care state day let people women senator
Topic 6:
community working week today day act pay support american voting important elections stop san americans
Topic 7:
people mental access running years health thanks profits democracy care better day maga republicans making
Topic 8:
trump donald america raise gas prices openly like want weekend today antagonists conspiring october home
Topic 9:
vot

In [153]:
# Topics for female politicians: the fitted model comes from fit_lda, the vocabulary from dtm_F.
display_topics(model=fit_lda(df_F)[0], features=dtm_F[1], no_top_words=15)

Topic 0:
patty ballot november vote day murray choice election abortion senate republicans ban pro make let
Topic 1:
just need trump big voices plan city congress time house maple vote make election day
Topic 2:
loan people new student cancellation payments guide restarting want questions facts program payment investment billion
Topic 3:
houston republicans right want control workers country year people community pass city fight ready don
Topic 4:
right human asylum borrowers seeking fed debt freedom make student support court supreme access cancel
Topic 5:
fight protect right women day like rights bodies earth life veterans tennis setting elementary cards
Topic 6:
justice congress like join act country black spending million air king members viagra cheaper chasing
Topic 7:
families art need know republicans care health away house tatum working america today voted long
Topic 8:
time love fight congress shouldn black stop style parents educators community republicans gun work let
Topic 

#### Better representation of the document-topic matrix

In [131]:
def displayHeader(model, features, no_top_words):
    """Build one label per topic of the form "Topic <n>: term, term, ...".

    Returns a list with one string per row of ``model.components_``, each
    naming the topic's ``no_top_words`` highest-weighted terms.
    """
    def _label(topic_number, weights):
        # argsort is ascending; reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [features[term_idx] for term_idx in strongest_first]
        return f"Topic {topic_number}: " + ", ".join(top_terms)

    return [_label(number, weights) for number, weights in enumerate(model.components_)]

def represent_dtm(gender):
    """Build a labelled document-topic table for one gender group.

    Parameters
    ----------
    gender : str
        "F" uses ``df_F`` / ``dtm_F``; "M" uses ``df_M`` / ``dtm_M``.

    Returns
    -------
    pandas.DataFrame
        One row per document (indexed like the source frame) and one column
        per topic, labelled with the topic's top 5 terms and holding the
        document-topic values rounded to 3 decimals, plus a
        ``dominant_topic`` column with the position of the largest value.
        Counts per dominant topic can be obtained from the result with
        ``["dominant_topic"].value_counts()``.

    Raises
    ------
    ValueError
        If ``gender`` is neither "F" nor "M".
    """
    if gender == "F":
        df, dtm = df_F, dtm_F
    elif gender == "M":
        df, dtm = df_M, dtm_M
    else:
        # Printing and carrying on would fail later with an UnboundLocalError.
        raise ValueError(f"Not a valid gender: {gender!r} (expected 'F' or 'M')")

    # Fit once and reuse both results, rather than refitting the model
    # separately for the headers and for the distribution.
    lda, doc_topic_dist = fit_lda(df)

    topicnames = displayHeader(lda, dtm[1], 5) # column names
    docnames = df.index.tolist() # index names (use original names of documents)
    df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3), columns=topicnames, index=docnames)

    # Dominant topic = position of the largest (rounded) value in each row;
    # computed before the new column is added so only topic columns take part.
    df_document_topic['dominant_topic'] = np.argmax(df_document_topic.values, axis=1)

    return df_document_topic

# Document-topic table for the female politicians. The dominant topic of each
# video is its "dominant_topic" column, so no separate call is needed (a
# non-final expression in a cell is computed but never displayed).
represent_dtm("F")

Unnamed: 0,"Topic 0: patty, ballot, november, vote, day","Topic 1: just, need, trump, big, voices","Topic 2: loan, people, new, student, cancellation","Topic 3: houston, republicans, right, want, control","Topic 4: right, human, asylum, borrowers, seeking","Topic 5: fight, protect, right, women, day","Topic 6: justice, congress, like, join, act","Topic 7: families, art, need, know, republicans","Topic 8: time, love, fight, congress, shouldn","Topic 9: people, stop, right, fighting, win","Topic 10: anti, trump, centers, democracy, years","Topic 11: justice, congress, art, debt, just","Topic 12: rights, loan, year, people, voted","Topic 13: president, biden, thank, care, movement","Topic 14: rights, mayor, week, today, city",dominant_topic
0,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.942,0.004,0.004,0.004,0.004,0.004,0.004,0.004,7
1,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.953,0.003,0.003,0.003,0.003,0.003,9
2,0.004,0.004,0.004,0.951,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,3
3,0.003,0.003,0.003,0.003,0.003,0.963,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,5
4,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.968,0.002,0.002,0.002,0.002,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,0.533,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0
430,0.011,0.011,0.011,0.844,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,3
431,0.011,0.011,0.011,0.011,0.844,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,4
432,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.928,14


In [154]:
# Document-topic table for the male politicians, including the dominant_topic column.
represent_dtm("M")

Unnamed: 0,"Topic 0: senator, border, texas, ted, cruz","Topic 1: ted, cruz, women, abortion, texans","Topic 2: republican, congress, let, black, like","Topic 3: working, class, country, people, stand","Topic 4: rep, jackson, jeff, state, speaker","Topic 5: texas, ted, cruz, election, make","Topic 6: community, working, week, today, day","Topic 7: people, mental, access, running, years","Topic 8: trump, donald, america, raise, gas","Topic 9: voting, vote, congress, time, ban","Topic 10: people, need, let, new, work","Topic 11: day, year, vote, like, congress","Topic 12: want, make, matter, big, congress","Topic 13: state, jeff, jackson, sen, time","Topic 14: health, people, care, work, choose",dominant_topic
0,0.896,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0
1,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.689,0.022,0.022,0.022,0.022,0.022,0.022,8
2,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.933,0.005,0.005,12
3,0.017,0.017,0.017,0.767,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,3
4,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.942,0.004,0.004,0.004,0.004,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,0.005,0.005,0.005,0.005,0.005,0.005,0.933,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,6
909,0.011,0.844,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,1
910,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.963,0.003,0.003,0.003,0.003,0.003,0.003,0.003,7
911,0.017,0.767,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,1


### Optimizing the number of topics via grid search

In [132]:
from sklearn.model_selection import GridSearchCV

In [143]:
def grid_search(dtm, search_params=None, random_state=0):
    """Grid-search the number of LDA topics and report the best model.

    Parameters
    ----------
    dtm : tuple
        ``(document-term matrix, feature names)`` pair as returned by
        ``to_dtm``; only elements 0 and 1 are used.
    search_params : dict, optional
        Parameter grid handed to ``GridSearchCV``. Defaults to trying
        ``n_components`` in 1-5 and then 10-35 in steps of 5.
    random_state : int, default 0
        Seed for the LDA estimator so repeated searches are reproducible.

    Returns
    -------
    The best estimator found by the search. Its parameters, score, and
    perplexity on the full matrix are printed, followed by the top 40 terms
    of each of its topics.
    """
    matrix, features = dtm[0], dtm[1]

    if search_params is None:
        search_params = {'n_components': [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35]} # num of topics to test

    lda = LatentDirichletAllocation(random_state=random_state)
    # No `scoring` is given, so candidates are ranked by the estimator's own score method.
    grid = GridSearchCV(lda, param_grid=search_params)
    grid.fit(matrix)

    # Best Model
    best_lda_model = grid.best_estimator_
    print("Best Model's Params: ", grid.best_params_)
    print("Best Log Likelihood Score: ", grid.best_score_)
    print("Model Perplexity: ", best_lda_model.perplexity(matrix))

    display_topics(best_lda_model, features, 40)
    return best_lda_model


In [142]:
# Search over the number of topics for the female politicians' descriptions.
grid_search(dtm_F)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -8456.370686422495
Model Perplexity:  1863.2791702320678
Topic 0:
right people republicans congress make just student health fight day need new vote loan today time like rights help let houston families community president care women house week debt justice biden city trump election future country know black year communities


In [137]:
# Search over the number of topics for the male politicians' descriptions.
grid_search(dtm_M)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -17981.94734867081
Model Perplexity:  2169.048723378523


In [None]:
# Open question: the grid search currently picks a single topic (n_components=1) as the best model
# for both groups, but we want several topics that are each more focused.
# Perhaps every description is being grouped as generic "political" content? Is there a way to "restrict" the boundaries of a topic?