### Topic Modeling of Video Descriptions

In [8]:
import os
import json
import pandas as pd
import numpy as np

In [11]:
# Root folder of the collected data; it must contain "output_male/" and "output_female/".
# Set the CS315_DATA_DIR environment variable to point somewhere else instead of
# editing this cell; the default below is the original author's local checkout.
# os.path.join(..., "") guarantees a trailing separator, because later code builds
# file paths by plain string concatenation (dir_path + filepath + file).
dir_path = os.path.join(
    os.environ.get("CS315_DATA_DIR", "/users/carolinejung/CS315-proj3-group2/1-data_collection/"),
    "",
)

#### Data Cleaning

In [12]:
def get_description(gender):
    """Load every video description for one group of politicians into a DataFrame.

    Each file in the group's output folder (under ``dir_path``) is parsed as JSON
    and iterated as a collection of video records. The pieces of a record's
    "description" are lower-cased and concatenated into a single string.

    Parameters
    ----------
    gender : str
        "M" reads from "output_male/", "F" reads from "output_female/".

    Returns
    -------
    pandas.DataFrame
        One column, "description", with one row per video that had a usable
        description.

    Raises
    ------
    ValueError
        If ``gender`` is neither "M" nor "F".
    """
    folders = {"M": "output_male/", "F": "output_female/"}
    if gender not in folders:
        # Fail fast with a clear message instead of printing and then crashing
        # later on an unbound local variable.
        raise ValueError(f"Not a valid input: {gender!r} (expected 'M' or 'F').")
    filepath = folders[gender]

    all_description = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and everything derived from it) stable between runs and machines.
    for file in sorted(os.listdir(dir_path + filepath)):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(dir_path + filepath + file, 'r', encoding='utf-8') as f:
            account = json.load(f)

        for video in account:
            try:
                vid_desc = "".join(parts.lower() for parts in video["description"])
            except (KeyError, TypeError, AttributeError):
                # Best effort, as before: skip records with a missing or
                # malformed description, but let unexpected errors surface.
                continue
            all_description.append(vid_desc)

    return pd.DataFrame(data=all_description, columns=["description"])

# Load the video descriptions of each group; every row is one video.
df_M, df_F = (get_description(group) for group in ("M", "F"))

print(df_F["description"]) #each row is a video

# possible future work: try it with comments?

0      we all know who the grand puppet master is in ...
1      if the majority wants to talk about dark money...
2      the party of ‚Äòlaw and order‚Äù ü§îlol, they litera...
3      no one should have the right to tell women wha...
4      i ran for congress because i was sick of polit...
                             ...                        
441                                       we vote within
442    as long as you‚Äôre in line by 8pm the polls mus...
443              be like yama and make your voice heard!
444    don‚Äôt mail your ballot, drop it off at a dropb...
445    be sure to fill out your entire ballot! if you...
Name: description, Length: 446, dtype: object


#### Converting the corpus into a document-term matrix (DTM)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
def to_dtm(words):
    """Vectorize a collection of texts into a document-term count matrix.

    Returns a pair ``(dtm, feature_names)``: the matrix produced by
    ``CountVectorizer.fit_transform`` and the vocabulary from
    ``get_feature_names_out``.
    """
    vectorizer_options = {
        "strip_accents": 'unicode',
        "stop_words": 'english',
        "lowercase": True,
        # keep only tokens made of letters that are at least 3 characters long
        "token_pattern": r'\b[a-zA-Z]{3,}\b',
    }
    count_vectorizer = CountVectorizer(**vectorizer_options)

    # Learn the vocabulary and build the document-term matrix in one pass.
    doc_term_matrix = count_vectorizer.fit_transform(words)
    return doc_term_matrix, count_vectorizer.get_feature_names_out()

# NOTE: despite the names, each of these is the (matrix, feature_names) tuple
# returned by to_dtm -- index [0] is the matrix, [1] the vocabulary.
dtm_F, dtm_M = (to_dtm(frame["description"]) for frame in (df_F, df_M))

In [15]:
def matrix2Doc(dtMatrix, features, index):
    """Return the terms that occur in one document (row ``index``) of the matrix.

    ``features`` maps column positions to terms; the result lists the terms of
    all columns with a non-zero count, in column order.
    """
    dense_row = dtMatrix.getrow(index).toarray()  # 2-D array with a single row
    _, term_columns = dense_row.nonzero()         # column positions of non-zero counts
    return [features[column] for column in term_columns]

def convert_to_word(df):
    """Return, for every row of ``df["description"]``, the list of terms kept by the vectorizer.

    The descriptions are vectorized once with ``to_dtm``; each row of the
    resulting matrix is then turned back into its terms with ``matrix2Doc``.
    The result has one list per document, in row order.
    """
    # Vectorize a single time and unpack both results (calling to_dtm twice
    # would fit an identical vectorizer twice for no benefit).
    dtm, feature_names = to_dtm(df["description"])
    return [matrix2Doc(dtm, feature_names, i) for i in range(dtm.shape[0])]

# Let pandas show up to 1000 characters per cell so long descriptions and term lists are not truncated.
pd.set_option("display.max_colwidth",1000)
# Add a "terms" column holding each video's extracted terms (this modifies df_F / df_M in place).
df_F["terms"] = convert_to_word(df_F)
df_M["terms"] = convert_to_word(df_M)

In [16]:
df_F # video descriptions of female politicians: one row per video, with the raw description and its extracted terms

Unnamed: 0,description,terms
0,we all know who the grand puppet master is in this impeachment. the long arm (but small hands) of donald trump and his fingerprints are all over this hearing and gop shut down.,"[arm, donald, fingerprints, gop, grand, hands, hearing, impeachment, know, long, master, puppet, shut, small, trump]"
1,"if the majority wants to talk about dark money and activist courts, i am so here for it.so, let‚Äôs talk about how right wing organizations bankroll judicial decisions to undermine our fundamental rights and judicial system.","[activist, bankroll, courts, dark, decisions, fundamental, judicial, let, majority, money, organizations, right, rights, talk, undermine, wants, wing]"
2,"the party of ‚Äòlaw and order‚Äù ü§îlol, they literally have a history of interfering in criminal investigations for political gain.today‚Äôs oversight hearing is purely a political stunt. period.","[criminal, gain, hearing, history, interfering, investigations, law, literally, lol, order, oversight, party, period, political, purely, stunt, today]"
3,"no one should have the right to tell women what we can and can't do with our bodies‚Äîincluding the supreme court.right now scotus is considering whether or not women should have access to life saving medications like mifepristone. the law must follow the science and protect our rights, not a political agenda.","[access, agenda, bodies, considering, court, follow, including, law, life, like, medications, mifepristone, political, protect, right, rights, saving, science, scotus, supreme, tell, women]"
4,"i ran for congress because i was sick of politicians telling communities they would help them, only to turn around and use them for political theater.we‚Äôve known for years of the behavioral health and public safety crises ravaging our tribes and pueblos, and it‚Äôs time we actually listen to tribal leaders and let them lead.","[actually, behavioral, communities, congress, crises, health, help, known, lead, leaders, let, listen, political, politicians, public, pueblos, ran, ravaging, safety, sick, telling, theater, time, tribal, tribes, turn, use, years]"
...,...,...
441,we vote within,[vote]
442,as long as you‚Äôre in line by 8pm the polls must let you vote!,"[let, line, long, polls, vote]"
443,be like yama and make your voice heard!,"[heard, like, make, voice, yama]"
444,"don‚Äôt mail your ballot, drop it off at a dropbox! if you encounter any problem while voting call the mdp election protection hotline at 833-648-6837","[ballot, don, drop, dropbox, election, encounter, hotline, mail, mdp, problem, protection, voting]"


In [None]:
df_M # video descriptions of male politicians: one row per video, with the raw description and its extracted terms

Unnamed: 0,description,terms
0,do you know what drives me nuts about what‚Äôs going on inwith thefunding?some of my colleagues are echoingstate,"[colleagues, drives, echoingstate, going, inwith, know, nuts, thefunding]"
1,totalof my heart ‚ù§Ô∏èüë®‚Äçüë©‚Äçüëß‚Äçüë¶,"[heart, totalof]"
2,icymi: i dive into the momentwalked into theand‚Äôs appalling response to the speech.check it out and check back weekly for new video content.,"[appalling, check, content, dive, icymi, momentwalked, new, response, speech, theand, video, weekly]"
3,re: joe biden‚Äôs stutter,"[biden, joe, stutter]"
4,"anmessage from my mom.mom was one of the first women to be a tv ‚Äúnewsreader‚Äù in the 70‚Äôs. it was a time when women weren‚Äôt welcomed in the newsroom, but she helped put a big crack in that glass ceiling.","[anmessage, big, ceiling, crack, glass, helped, mom, newsreader, newsroom, time, welcomed, weren, women]"
...,...,...
908,üö®teaser alertüö®: medicare advantage is one of the largest scams in the american healthcare industry. full video drops tomorrow! (sound on),"[advantage, alert, american, drops, healthcare, industry, largest, medicare, scams, sound, teaser, tomorrow, video]"
909,step 1: strengthen medicare. step 2: medicare for all.,"[medicare, step, strengthen]"
910,"the topic of medicare advantage is deeply personal to me. in her late 80s, my mom lived in an assisted living facility. years earlier, a friendly salesperson told her about all the money she could save on prescription drugs by switching to medicare advantage. she was enamored.","[advantage, assisted, deeply, drugs, earlier, enamored, facility, friendly, late, lived, living, medicare, mom, money, personal, prescription, salesperson, save, switching, told, topic, years]"
911,question: what would you call the ex-president? here‚Äôs my response.,"[president, question, response]"


#### Fit LDA model & see topics

In [17]:
from sklearn.decomposition import LatentDirichletAllocation

In [18]:
def fit_lda(df):
    """Fit a 10-topic LDA model on the descriptions in ``df``.

    Returns a pair ``(model, doc_topic_dist)``: the fitted
    LatentDirichletAllocation instance and the result of transforming the same
    document-term matrix with it.
    """
    doc_term_matrix = to_dtm(df["description"])[0]
    # The number of topics (n_components) was picked arbitrarily; random_state pins the run.
    topic_model = LatentDirichletAllocation(n_components=10, random_state=0)
    doc_topic_dist = topic_model.fit(doc_term_matrix).transform(doc_term_matrix)
    return topic_model, doc_topic_dist

def display_topics(model, features, no_top_words):
    """Print, for every topic in ``model.components_``, its highest-weighted words.

    Each topic produces two lines: a "Topic <i>:" header and the top
    ``no_top_words`` entries of ``features``, space-separated, strongest first.
    """
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so take the last `no_top_words` positions in reverse.
        strongest_first = topic.argsort()[:-no_top_words-1:-1]
        top_words = [features[position] for position in strongest_first]
        print(" ".join(top_words))

In [19]:
# fit_lda re-vectorizes df_M internally; the feature names come from the earlier dtm_M tuple (index 1).
display_topics(fit_lda(df_M)[0], dtm_M[1], 10) # topics for male politicians

Topic 0:
let like texans cruz ted senate election vote state ready
Topic 1:
people health care need work profits great making right today
Topic 2:
congress time people day student fighting need american just biden
Topic 3:
ted cruz texas abortion women texans time know like ban
Topic 4:
state congress texas vote workers fight make ballot year stand
Topic 5:
people make want making country money congress medicare working care
Topic 6:
today republicans people congress day year senator time house need
Topic 7:
country day working just republican district congress republicans care year
Topic 8:
working class today people community families make valley country lives
Topic 9:
jackson jeff rep sen mental health vote crisis access just


In [20]:
# fit_lda re-vectorizes df_F internally; the feature names come from the earlier dtm_F tuple (index 1).
display_topics(fit_lda(df_F)[0], dtm_F[1], 10) # topics for female politicians

Topic 0:
rights away life school families payments restarting lot having questions
Topic 1:
day control right fed like help children republicans doesn care
Topic 2:
families loan year voted student congress ballot new house mayor
Topic 3:
right women fight working human house families access asylum vote
Topic 4:
right health justice make change congress community just years debt
Topic 5:
fight congress know new year anti trump loan cancellation student
Topic 6:
people police time let today loan student right need debt
Topic 7:
houston just city future job make thank november people freedom
Topic 8:
workers art country win community time care republican union history
Topic 9:
patty republicans president time ban make people want state know


#### Better representation of the document-topic matrix

In [21]:
def displayHeader(model, features, no_top_words):
    """Build one label per topic, e.g. "Topic 0: word1, word2, ...".

    Unlike display_topics, nothing is printed: the labels (one per row of
    ``model.components_``, top ``no_top_words`` words each, strongest first)
    are returned as a list.
    """
    def _label(topic_idx, topic):
        # argsort is ascending, so the reversed tail holds the heaviest words.
        top_words = [features[i] for i in topic.argsort()[:-no_top_words-1:-1]]
        return f"Topic {topic_idx}: " + ", ".join(top_words)

    return [_label(topic_idx, topic) for topic_idx, topic in enumerate(model.components_)]

def represent_dtm(gender):
    """Build the document-topic table for one group of politicians.

    Parameters
    ----------
    gender : str
        "F" selects df_F / dtm_F, "M" selects df_M / dtm_M.

    Returns
    -------
    pandas.DataFrame
        One row per document (indexed like the source frame) and one column per
        topic, labelled with the topic's top five words and holding the topic
        weights rounded to three decimals, plus a "dominant_topic" column with
        the position of the largest weight in each row.

    Raises
    ------
    ValueError
        If ``gender`` is neither "F" nor "M".
    """
    if gender == "F":
        df, dtm = df_F, dtm_F
    elif gender == "M":
        df, dtm = df_M, dtm_M
    else:
        # Previously this only printed and then failed on the unbound locals below.
        raise ValueError(f"Not a valid gender: {gender!r} (expected 'F' or 'M')")

    # Fit once and reuse both the model and the document-topic weights; fitting
    # separately for the headers and for the values doubled the work.
    lda, doc_topic_dist = fit_lda(df)

    topicnames = displayHeader(lda, dtm[1], 5)  # column names
    docnames = df.index.tolist()                # index names (use original names of documents)
    df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3), columns=topicnames, index=docnames)

    # Dominant topic = column position of the largest (rounded) weight in each row.
    df_document_topic['dominant_topic'] = np.argmax(df_document_topic.values, axis=1)

    return df_document_topic

# Only the last expression of a cell is displayed, so a separate
# represent_dtm("F")["dominant_topic"] line above it just refit the model and
# discarded the result. The "dominant_topic" column is part of the table below.
represent_dtm("F")

Unnamed: 0,"Topic 0: rights, away, life, school, families","Topic 1: day, control, right, fed, like","Topic 2: families, loan, year, voted, student","Topic 3: right, women, fight, working, human","Topic 4: right, health, justice, make, change","Topic 5: fight, congress, know, new, year","Topic 6: people, police, time, let, today","Topic 7: houston, just, city, future, job","Topic 8: workers, art, country, win, community","Topic 9: patty, republicans, president, time, ban",dominant_topic
0,0.006,0.006,0.006,0.006,0.006,0.944,0.006,0.006,0.006,0.006,5
1,0.005,0.005,0.005,0.005,0.005,0.005,0.955,0.005,0.005,0.005,6
2,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.953,0.005,0.005,7
3,0.004,0.004,0.004,0.964,0.004,0.004,0.004,0.004,0.004,0.004,3
4,0.003,0.003,0.003,0.003,0.003,0.003,0.969,0.003,0.003,0.003,6
...,...,...,...,...,...,...,...,...,...,...,...
441,0.050,0.050,0.050,0.550,0.050,0.050,0.050,0.050,0.050,0.050,3
442,0.017,0.017,0.017,0.850,0.017,0.017,0.017,0.017,0.017,0.017,3
443,0.850,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0.017,0
444,0.008,0.008,0.008,0.008,0.008,0.008,0.008,0.931,0.008,0.008,7


In [22]:
# Document-topic table for male politicians (topic weights per video plus the dominant topic).
represent_dtm("M")

Unnamed: 0,"Topic 0: let, like, texans, cruz, ted","Topic 1: people, health, care, need, work","Topic 2: congress, time, people, day, student","Topic 3: ted, cruz, texas, abortion, women","Topic 4: state, congress, texas, vote, workers","Topic 5: people, make, want, making, country","Topic 6: today, republicans, people, congress, day","Topic 7: country, day, working, just, republican","Topic 8: working, class, today, people, community","Topic 9: jackson, jeff, rep, sen, mental",dominant_topic
0,0.011,0.900,0.011,0.011,0.011,0.011,0.011,0.011,0.011,0.011,1
1,0.033,0.033,0.033,0.033,0.033,0.700,0.033,0.033,0.033,0.033,5
2,0.007,0.007,0.007,0.007,0.007,0.936,0.007,0.007,0.007,0.007,5
3,0.025,0.025,0.025,0.025,0.025,0.025,0.775,0.025,0.025,0.025,6
4,0.006,0.006,0.006,0.006,0.006,0.006,0.944,0.006,0.006,0.006,6
...,...,...,...,...,...,...,...,...,...,...,...
951,0.007,0.007,0.007,0.007,0.007,0.936,0.007,0.007,0.007,0.007,5
952,0.017,0.017,0.017,0.017,0.017,0.850,0.017,0.017,0.017,0.017,5
953,0.004,0.004,0.964,0.004,0.004,0.004,0.004,0.004,0.004,0.004,2
954,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.775,0.025,0.025,7


### Optimizing the number of topics via grid search

In [132]:
from sklearn.model_selection import GridSearchCV

In [143]:
def grid_search(dtm, random_state=0):
    """Grid-search the number of LDA topics and report the best model found.

    Parameters
    ----------
    dtm : tuple
        ``(document-term matrix, feature names)`` pair, as returned by to_dtm.
    random_state : int, optional
        Seed for LatentDirichletAllocation so that repeated searches are
        comparable. Defaults to 0, the same seed fit_lda uses.

    Returns
    -------
    The ``best_estimator_`` selected by GridSearchCV.

    Notes
    -----
    No ``scoring`` is passed, so GridSearchCV ranks candidates with the
    estimator's own ``score`` method (for LDA, an approximate log-likelihood).
    The printed perplexity is computed on the same matrix used for fitting.
    """
    matrix, feature_names = dtm[0], dtm[1]

    search_params = {'n_components': [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35]} # num of topics to test
    # Seed the model: an unseeded LDA gives a different search result on every run.
    lda = LatentDirichletAllocation(random_state=random_state)
    grid = GridSearchCV(lda, param_grid=search_params) # initialize a grid search with CV instance
    grid.fit(matrix)

    # Best Model
    best_lda_model = grid.best_estimator_
    print("Best Model's Params: ", grid.best_params_)
    print("Best Log Likelihood Score: ", grid.best_score_)
    print("Model Perplexity: ", best_lda_model.perplexity(matrix))

    display_topics(best_lda_model, feature_names, 40)
    return best_lda_model


In [142]:
# Search over topic counts for the female politicians' descriptions; dtm_F is the (matrix, feature_names) tuple.
grid_search(dtm_F)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -8456.370686422495
Model Perplexity:  1863.2791702320678
Topic 0:
right people republicans congress make just student health fight day need new vote loan today time like rights help let houston families community president care women house week debt justice biden city trump election future country know black year communities


In [137]:
# Search over topic counts for the male politicians' descriptions; dtm_M is the (matrix, feature_names) tuple.
grid_search(dtm_M)

Best Model's Params:  {'n_components': 1}
Best Log Likelihood Score:  -17981.94734867081
Model Perplexity:  2169.048723378523


In [None]:
# Question: the grid search currently selects a single topic as the best model, but we want several, more focused topics.
# Perhaps all descriptions are being grouped together as generic "political" content? Is there a way to "restrict" the boundaries of one topic?