### Topic Modeling of Video Descriptions
Last modified: 4/30/24 \
Author: Caroline Jung \
Note: this is an archived method

In [23]:
import os
import json
import pandas as pd
import numpy as np

In [24]:
# Root folder that holds the "output_male/" and "output_female/" subfolders
# read by get_description; keep the trailing slash.
dir_path = "/users/carolinejung/CS315-proj3-group2/1-data_collection/" #CHANGE ME!

#### Data Cleaning

In [28]:
def get_description(gender, base_dir=None):
    """Load every video description of one gender group into a DataFrame.

    Each file in the group's folder is parsed as JSON and iterated as a
    collection of videos. The pieces of each video's "description" entry are
    lowercased and concatenated into a single string.

    Args:
        gender: "M" reads from "output_male/", "F" reads from "output_female/".
        base_dir: Folder that contains those subfolders. Defaults to the
            notebook-level ``dir_path``.

    Returns:
        A DataFrame with a single "description" column, one row per video.

    Raises:
        ValueError: If ``gender`` is neither "M" nor "F".
    """
    if gender == "M":
        subfolder = "output_male/"
    elif gender == "F":
        subfolder = "output_female/"
    else:
        # Fail fast: printing a message and carrying on would only crash a
        # line later on the undefined folder name.
        raise ValueError(f"Not a valid input: {gender!r} (expected 'M' or 'F').")

    if base_dir is None:
        base_dir = dir_path
    folder = os.path.join(base_dir, subfolder)

    all_description = []
    for file in os.listdir(folder):
        # Descriptions contain emoji and curly quotes, so do not rely on the
        # platform's default text encoding.
        with open(os.path.join(folder, file), 'r', encoding="utf-8") as f:
            account = json.load(f)

        for video in account:
            try:
                vid_desc = "".join(parts.lower() for parts in video["description"])
            except (KeyError, TypeError, AttributeError):
                # Best effort: skip videos without a usable description.
                continue
            all_description.append(vid_desc)

    return pd.DataFrame(data=all_description, columns=["description"]) # each row is a video description


In [26]:
# Build the description table from the "output_male/" folder and display it
df_M = get_description("M")
df_M

Unnamed: 0,description
0,do you know what drives me nuts about what’s going on inwith thefunding?some of my colleagues are echoingstate
1,totalof my heart ❤️👨‍👩‍👧‍👦
2,icymi: i dive into the momentwalked into theand’s appalling response to the speech.check it out and check back weekly for new video content.
3,re: joe biden’s stutter
4,"anmessage from my mom.mom was one of the first women to be a tv “newsreader” in the 70’s. it was a time when women weren’t welcomed in the newsroom, but she helped put a big crack in that glass ceiling."
...,...
951,🚨teaser alert🚨: medicare advantage is one of the largest scams in the american healthcare industry. full video drops tomorrow! (sound on)
952,step 1: strengthen medicare. step 2: medicare for all.
953,"the topic of medicare advantage is deeply personal to me. in her late 80s, my mom lived in an assisted living facility. years earlier, a friendly salesperson told her about all the money she could save on prescription drugs by switching to medicare advantage. she was enamored."
954,question: what would you call the ex-president? here’s my response.


In [27]:
# Build the description table from the "output_female/" folder and display it
df_F = get_description("F")
df_F

Unnamed: 0,description
0,we all know who the grand puppet master is in this impeachment. the long arm (but small hands) of donald trump and his fingerprints are all over this hearing and gop shut down.
1,"if the majority wants to talk about dark money and activist courts, i am so here for it.so, let’s talk about how right wing organizations bankroll judicial decisions to undermine our fundamental rights and judicial system."
2,"the party of ‘law and order” 🤔lol, they literally have a history of interfering in criminal investigations for political gain.today’s oversight hearing is purely a political stunt. period."
3,"no one should have the right to tell women what we can and can't do with our bodies—including the supreme court.right now scotus is considering whether or not women should have access to life saving medications like mifepristone. the law must follow the science and protect our rights, not a political agenda."
4,"i ran for congress because i was sick of politicians telling communities they would help them, only to turn around and use them for political theater.we’ve known for years of the behavioral health and public safety crises ravaging our tribes and pueblos, and it’s time we actually listen to tribal leaders and let them lead."
...,...
441,we vote within
442,as long as you’re in line by 8pm the polls must let you vote!
443,be like yama and make your voice heard!
444,"don’t mail your ballot, drop it off at a dropbox! if you encounter any problem while voting call the mdp election protection hotline at 833-648-6837"


#### Converting the corpus into a document-term matrix (DTM)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
def to_dtm(words):
    """Vectorize a collection of texts into a document-term count matrix.

    Returns a ``(dtm, feature_names)`` pair: the matrix produced by the
    vectorizer and the vocabulary terms reported by the vectorizer.
    """
    # Tokens must be purely alphabetic and at least 3 letters long
    vectorizer_options = {
        "strip_accents": 'unicode',
        "stop_words": 'english',
        "lowercase": True,
        "token_pattern": r'\b[a-zA-Z]{3,}\b',
    }
    count_vectorizer = CountVectorizer(**vectorizer_options)

    # Learn the vocabulary and count the terms of every text in one pass
    document_term_matrix = count_vectorizer.fit_transform(words)
    vocabulary = count_vectorizer.get_feature_names_out()
    return document_term_matrix, vocabulary

In [32]:
# NOTE: despite the names, each of these is the (dtm, feature_names) tuple
# returned by to_dtm -- index [0] is the matrix, index [1] the vocabulary.
dtm_F = to_dtm(df_F["description"])
dtm_M = to_dtm(df_M["description"])

In [33]:
def matrix2Doc(dtMatrix, features, index):
    """Return the terms whose count is non-zero in one row of the document-term matrix."""
    # Densify just the requested row and flatten it to one dimension
    counts = dtMatrix.getrow(index).toarray().ravel()
    present_columns = counts.nonzero()[0]
    return [features[col] for col in present_columns]

def convert_to_word(df):
    """List, for every row of df["description"], the vocabulary terms it contains.

    Args:
        df: DataFrame with a "description" column of raw text.

    Returns:
        A list with one entry per row of ``df``; each entry is the list of
        terms that have a non-zero count for that description.
    """
    # Vectorize once and unpack both results, rather than fitting the
    # vectorizer a second time just to read the feature names.
    dtm, feature_names = to_dtm(df["description"])
    return [matrix2Doc(dtm, feature_names, i) for i in range(dtm.shape[0])]

In [34]:
# Allow up to 1000 characters per cell when displaying DataFrames
pd.set_option("display.max_colwidth",1000)
# Attach each video's extracted vocabulary terms as a new "terms" column
df_F["terms"] = convert_to_word(df_F)
df_M["terms"] = convert_to_word(df_M)

In [35]:
df_F # video descriptions of female politicians, now with the "terms" column

Unnamed: 0,description,terms
0,we all know who the grand puppet master is in this impeachment. the long arm (but small hands) of donald trump and his fingerprints are all over this hearing and gop shut down.,"[arm, donald, fingerprints, gop, grand, hands, hearing, impeachment, know, long, master, puppet, shut, small, trump]"
1,"if the majority wants to talk about dark money and activist courts, i am so here for it.so, let’s talk about how right wing organizations bankroll judicial decisions to undermine our fundamental rights and judicial system.","[activist, bankroll, courts, dark, decisions, fundamental, judicial, let, majority, money, organizations, right, rights, talk, undermine, wants, wing]"
2,"the party of ‘law and order” 🤔lol, they literally have a history of interfering in criminal investigations for political gain.today’s oversight hearing is purely a political stunt. period.","[criminal, gain, hearing, history, interfering, investigations, law, literally, lol, order, oversight, party, period, political, purely, stunt, today]"
3,"no one should have the right to tell women what we can and can't do with our bodies—including the supreme court.right now scotus is considering whether or not women should have access to life saving medications like mifepristone. the law must follow the science and protect our rights, not a political agenda.","[access, agenda, bodies, considering, court, follow, including, law, life, like, medications, mifepristone, political, protect, right, rights, saving, science, scotus, supreme, tell, women]"
4,"i ran for congress because i was sick of politicians telling communities they would help them, only to turn around and use them for political theater.we’ve known for years of the behavioral health and public safety crises ravaging our tribes and pueblos, and it’s time we actually listen to tribal leaders and let them lead.","[actually, behavioral, communities, congress, crises, health, help, known, lead, leaders, let, listen, political, politicians, public, pueblos, ran, ravaging, safety, sick, telling, theater, time, tribal, tribes, turn, use, years]"
...,...,...
441,we vote within,[vote]
442,as long as you’re in line by 8pm the polls must let you vote!,"[let, line, long, polls, vote]"
443,be like yama and make your voice heard!,"[heard, like, make, voice, yama]"
444,"don’t mail your ballot, drop it off at a dropbox! if you encounter any problem while voting call the mdp election protection hotline at 833-648-6837","[ballot, don, drop, dropbox, election, encounter, hotline, mail, mdp, problem, protection, voting]"


In [36]:
df_M # video descriptions of male politicians, now with the "terms" column

Unnamed: 0,description,terms
0,do you know what drives me nuts about what’s going on inwith thefunding?some of my colleagues are echoingstate,"[colleagues, drives, echoingstate, going, inwith, know, nuts, thefunding]"
1,totalof my heart ❤️👨‍👩‍👧‍👦,"[heart, totalof]"
2,icymi: i dive into the momentwalked into theand’s appalling response to the speech.check it out and check back weekly for new video content.,"[appalling, check, content, dive, icymi, momentwalked, new, response, speech, theand, video, weekly]"
3,re: joe biden’s stutter,"[biden, joe, stutter]"
4,"anmessage from my mom.mom was one of the first women to be a tv “newsreader” in the 70’s. it was a time when women weren’t welcomed in the newsroom, but she helped put a big crack in that glass ceiling.","[anmessage, big, ceiling, crack, glass, helped, mom, newsreader, newsroom, time, welcomed, weren, women]"
...,...,...
951,🚨teaser alert🚨: medicare advantage is one of the largest scams in the american healthcare industry. full video drops tomorrow! (sound on),"[advantage, alert, american, drops, healthcare, industry, largest, medicare, scams, sound, teaser, tomorrow, video]"
952,step 1: strengthen medicare. step 2: medicare for all.,"[medicare, step, strengthen]"
953,"the topic of medicare advantage is deeply personal to me. in her late 80s, my mom lived in an assisted living facility. years earlier, a friendly salesperson told her about all the money she could save on prescription drugs by switching to medicare advantage. she was enamored.","[advantage, assisted, deeply, drugs, earlier, enamored, facility, friendly, late, lived, living, medicare, mom, money, personal, prescription, salesperson, save, switching, told, topic, years]"
954,question: what would you call the ex-president? here’s my response.,"[president, question, response]"


#### Fit LDA model & see topics

In [17]:
from sklearn.decomposition import LatentDirichletAllocation

In [39]:
def fit_lda(df, n_components=15, random_state=0):
    """Fit an LDA topic model on the texts in df["description"].

    Args:
        df: DataFrame with a "description" column of raw text.
        n_components: Number of topics to fit. The default of 15 was picked
            arbitrarily.
        random_state: Seed passed to the model so repeated fits are
            reproducible.

    Returns:
        A ``(lda, doc_topic_dist)`` pair: the fitted model and the
        document-topic matrix obtained by transforming the same data.
    """
    dtm, _ = to_dtm(df["description"])
    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    doc_topic_dist = lda.fit_transform(dtm)
    return lda, doc_topic_dist

def display_topics(model, features, no_top_words):
    """Print each topic of a fitted model followed by its highest-weighted words."""
    for topic_number, weights in enumerate(model.components_):
        print(f"Topic {topic_number}:")
        # argsort is ascending: reverse it, then keep the first no_top_words
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [features[i] for i in top_indices]
        print(" ".join(top_words))

In [40]:
# Fits a fresh LDA model on df_M; the feature names come from the earlier
# to_dtm result stored in dtm_M. Shows the top 15 words per topic.
display_topics(fit_lda(df_M)[0], dtm_M[1], 15) # topics for male politicians

Topic 0:
let like ted cruz texans senate campaign election time senator republican trump bring vote state
Topic 1:
help need doing border people work health american time crisis government valley care million right
Topic 2:
special workers congress medicare care climate gerrymandering freedom democracy years sure change just health people
Topic 3:
need know gas congress prices like housing affordable texas people working ted cruz new support
Topic 4:
state texas vote new people ready just texans work right job year working make fight
Topic 5:
abortion ted cruz women texas rights country state ban want vote know join freedom restore
Topic 6:
congress deserve year social security right country today need stories life republicans workers know senator
Topic 7:
republican congress working day district jobs country week year continue families elected election days women
Topic 8:
working going class just early today vote voting people started day voters community let things
Topic 9:
jackson j

In [41]:
# Fits a fresh LDA model on df_F; the feature names come from the earlier
# to_dtm result stored in dtm_F. Shows the top 15 words per topic.
display_topics(fit_lda(df_F)[0], dtm_F[1], 15) # topics for female politicians

Topic 0:
women rights right payments bodies restarting life workers lot fight questions like bills protect hundreds
Topic 1:
day people control america want like fed women houston help right mayor doesn congress share
Topic 2:
loan need families student voted save health dollars program new country sign voting millions folks
Topic 3:
house people access help state art make minnesota women questions student food war prices new
Topic 4:
justice just right debt people trump make rights community better week borrowers years student economy
Topic 5:
fight know anti let congress year ready new talk workers sign day woman stand right
Topic 6:
republicans people support let week fighting happy having year away kids school health supporting congress
Topic 7:
vote houston ballot democracy just november make republicans justice don time market members let service
Topic 8:
art win come congress children act freedom earth setting healthy amazon repubicans govt wind time
Topic 9:
loan student cancel

#### Better representation of the document-topic matrix

In [43]:
def displayHeader(model, features, no_top_words):
    """Build one label per topic, formatted as "Topic k: word, word, ..."."""
    def topic_label(topic_number, weights):
        # argsort is ascending: reverse it, then keep the first no_top_words
        ranked = weights.argsort()[::-1][:no_top_words]
        return f"Topic {topic_number}: " + ", ".join([features[i] for i in ranked])

    return [topic_label(number, weights)
            for number, weights in enumerate(model.components_)]

def represent_dtm(gender):
    """Build a labelled document-topic table for one gender group.

    Uses the notebook-level ``df_F``/``dtm_F`` or ``df_M``/``dtm_M`` objects,
    fits an LDA model on the descriptions, and labels every topic column with
    its top 5 words.

    Args:
        gender: "F" or "M".

    Returns:
        A DataFrame indexed like the source DataFrame, with one column per
        topic (weights rounded to 3 decimals) plus a "dominant_topic" column
        holding the index of the highest-weighted topic of each document.

    Raises:
        ValueError: If ``gender`` is neither "F" nor "M".
    """
    if gender == "F":
        df, dtm = df_F, dtm_F
    elif gender == "M":
        df, dtm = df_M, dtm_M
    else:
        # Fail fast instead of printing and then crashing on unbound names.
        raise ValueError(f"Not a valid gender: {gender!r} (expected 'F' or 'M').")

    # Fit once and reuse both the model and the document-topic distribution.
    lda, doc_topic_dist = fit_lda(df)

    topicnames = displayHeader(lda, dtm[1], 5) # column names
    docnames = df.index.tolist() # index names (use original names of documents)
    df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3), columns=topicnames, index=docnames)

    # Dominant topic per document, taken from the unrounded weights so that
    # rounding cannot turn a narrow lead into a tie.
    df_document_topic['dominant_topic'] = np.argmax(doc_topic_dist, axis=1)

    return df_document_topic

In [44]:
# Document-topic table for df_F; each topic column is labelled with its top 5 words
represent_dtm("F")

Unnamed: 0,"Topic 0: women, rights, right, payments, bodies","Topic 1: day, people, control, america, want","Topic 2: loan, need, families, student, voted","Topic 3: house, people, access, help, state","Topic 4: justice, just, right, debt, people","Topic 5: fight, know, anti, let, congress","Topic 6: republicans, people, support, let, week","Topic 7: vote, houston, ballot, democracy, just","Topic 8: art, win, come, congress, children","Topic 9: loan, student, cancellation, document, president","Topic 10: new, black, congress, work, fight","Topic 11: need, care, election, today, health","Topic 12: right, human, asylum, seeking, know","Topic 13: people, time, republicans, police, voices","Topic 14: patty, murray, right, community, future",dominant_topic
0,0.004,0.004,0.004,0.004,0.004,0.942,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,5
1,0.003,0.003,0.003,0.003,0.003,0.953,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,5
2,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.951,0.004,0.004,0.004,0.004,0.004,0.004,0.004,7
3,0.963,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0
4,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.968,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0.033,0.033,0.033,0.033,0.033,0.033,0.033,0.533,0.033,0.033,0.033,0.033,0.033,0.033,0.033,7
442,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.844,0.011,0.011,0.011,0.011,0.011,0.011,0.011,7
443,0.011,0.011,0.011,0.844,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,3
444,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.928,14


In [45]:
# Document-topic table for df_M; each topic column is labelled with its top 5 words
represent_dtm("M")

Unnamed: 0,"Topic 0: let, like, ted, cruz, texans","Topic 1: help, need, doing, border, people","Topic 2: special, workers, congress, medicare, care","Topic 3: need, know, gas, congress, prices","Topic 4: state, texas, vote, new, people","Topic 5: abortion, ted, cruz, women, texas","Topic 6: congress, deserve, year, social, security","Topic 7: republican, congress, working, day, district","Topic 8: working, going, class, just, early","Topic 9: jackson, jeff, rep, sen, state","Topic 10: texas, make, care, working, health","Topic 11: want, trump, republicans, house, people","Topic 12: great, day, today, congress, work","Topic 13: texas, like, need, school, people","Topic 14: people, republicans, care, fighting, need",dominant_topic
0,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.007,0.896,0.007,0.007,0.007,0.007,0.007,0.007,8
1,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.689,14
2,0.005,0.005,0.005,0.005,0.005,0.933,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,5
3,0.017,0.017,0.017,0.017,0.017,0.767,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,5
4,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.942,0.004,0.004,0.004,0.004,0.004,0.004,0.004,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.933,0.005,0.005,0.005,11
952,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.844,14
953,0.003,0.003,0.963,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003,2
954,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.767,0.017,0.017,0.017,0.017,0.017,0.017,0.017,7


### Optimizing the number of topics via grid search

In [49]:
from sklearn.model_selection import GridSearchCV

In [47]:
def grid_search(dtm):
    """Grid-search the number of LDA topics with cross-validation.

    Prints the best parameters, the best cross-validation score, the
    perplexity of the best model on the full matrix, and the top 40 words of
    each of its topics.

    Args:
        dtm: The ``(matrix, feature_names)`` tuple returned by to_dtm, not
            the bare matrix.

    Returns:
        The best estimator found by the search.
    """
    matrix, features = dtm[0], dtm[1]

    search_params = {'n_components': [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35]} # num of topics to test
    # Seeded like fit_lda so that reruns of the search are reproducible.
    lda = LatentDirichletAllocation(random_state=0)
    grid = GridSearchCV(lda, param_grid=search_params) # initialize a grid search with CV instance
    grid.fit(matrix)

    # Best Model
    best_lda_model = grid.best_estimator_
    print("Best Model's Params: ", grid.best_params_)
    print("Best Log Likelihood Score: ", grid.best_score_)
    print("Model Perplexity: ", best_lda_model.perplexity(matrix))

    display_topics(best_lda_model, features, 40)
    return best_lda_model

In [50]:
# dtm_F is the (matrix, feature_names) tuple from to_dtm, which grid_search indexes with [0] and [1]
grid_search(dtm_F)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -8708.01886947514
Model Perplexity:  1875.923959667552
Topic 0:
right people congress republicans make day fight just health student need vote new like today rights loan time help women families let houston community president care know week house election debt justice biden year city future trump country black want


In [137]:
# dtm_M is the (matrix, feature_names) tuple from to_dtm, which grid_search indexes with [0] and [1]
grid_search(dtm_M)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -17981.94734867081
Model Perplexity:  2169.048723378523


The grid search selects n_components=1 as the best number of topics for both corpora. We therefore conclude that we do not have enough data for a meaningful topic analysis, and we archive this method.