## 07-topic-model-review

**Purpose**: Evaluate topic models by reviewing samples of docs per topic.

In [1]:
import os
import pandas as pd

In [2]:
from usrightmedia.shared.topics_utils import *

In [3]:
from usrightmedia.shared.loggers import get_logger
# Notebook-wide logger; the filename matches this notebook's title.
LOGGER = get_logger(
    filename='07-topic-model-review',
    logger_type='main',
)

### 1. Load topic models' summary and documents

In [4]:
def load(model, models_dir=None):
    """Load the topic summary and the per-document top-topic table for one model.

    Two pickles are read from ``<models_dir>/<model>/``:

    * ``<model>_top_topic_summary.pkl``
    * ``<model>_top_topic_with_ids.pkl``

    Args:
        model: Model name; used both as the sub-directory name and as the
            filename prefix of the two pickles.
        models_dir: Root directory holding one sub-directory per model.
            When omitted (``None``), the ``MODELS_DIR`` name already present
            in the notebook namespace is used, so existing ``load(model)``
            calls behave exactly as before.

    Returns:
        A ``(df_smry, df_docs)`` tuple: the unpickled summary object followed
        by the unpickled per-document object.
    """
    # Resolve the default lazily so the function can be defined (and called
    # with an explicit directory) even when MODELS_DIR is not in scope.
    if models_dir is None:
        models_dir = MODELS_DIR
    model_dir = os.path.join(models_dir, model)

    # NOTE: unpickling can execute arbitrary code; only point this at
    # pickles produced by this project's own pipeline.
    df_smry = pd.read_pickle(os.path.join(model_dir, f"{model}_top_topic_summary.pkl"))
    df_docs = pd.read_pickle(os.path.join(model_dir, f"{model}_top_topic_with_ids.pkl"))

    # NOTE(review): a "fix percentages" step was stubbed here but never
    # implemented; both objects are returned exactly as stored on disk.
    return df_smry, df_docs

In [5]:
# One (summary, documents) pair per LDA model: 10, 25 and 40 topics.
dfs_10, dfd_10 = load(model='lda_corpus_tfidf_docs_texts_topics_10')
dfs_25, dfd_25 = load(model='lda_corpus_tfidf_docs_texts_topics_25')
dfs_40, dfd_40 = load(model='lda_corpus_tfidf_docs_texts_topics_40')

In [6]:
# Lift the column-width cap so the full topic_tokens strings are readable,
# then render the three summaries in order (10, 25, 40 topics).
with pd.option_context('display.max_colwidth', None):
    display(dfs_10, dfs_25, dfs_40)

Unnamed: 0,top_topic,topic_tokens,count,pct_total
0,0.0,"police, people, man, white, officer, black, attack, military, country, gun",307991,42.325933
1,1.0,"coronavirus, virus, health, game, pandemic, vaccine, state, case, mask, team",143890,19.774209
2,2.0,"investigation, russian, election, email, campaign, information, court, president, report, impeachment",97986,13.465812
3,3.0,"voter, campaign, candidate, election, poll, democratic, president, republican, presidential, vote",108461,14.905348
4,4.0,"market, company, bank, trade, oil, economy, file, price, stock, investor",53026,7.287144
5,5.0,"post, slideshow, post_post, royal, dress, wedding, photo, birthday, native, look",1093,0.150206
6,6.0,"immigration, border, illegal, immigrant, wall, migrant, alien, illegal_immigrant, illegal_alien, bill",12777,1.75589
7,7.0,"cigarette, e_cigarette, tobacco, e, smoking, smoker, vaping, marijuana, nicotine, product",245,0.033669
8,8.0,"storm, hurricane, wind, fire, rain, live, flooding, wildfire, weather, water",2192,0.301238
9,9.0,"cartel, mexican, drug_cartel, drug, gunman, cartel_gunman, drug_trafficking, mexican_cartel, faction, chronicle",4,0.00055


Unnamed: 0,top_topic,topic_tokens,count,pct_total
0,0.0,"rush, people, white, country, american, right, thing, way, black, world",126264,17.351941
1,1.0,"star, post, fan, film, family, time, photo, movie, good, match",105710,14.527289
2,2.0,"investigation, email, russian, campaign, president, information, impeachment, election, report, intelligence",89871,12.350601
3,3.0,"student, school, sexual, woman, allegation, prison, child, sex, court, assault",42861,5.89021
4,4.0,"coronavirus, mask, virus, outbreak, pandemic, file_photo, case, state, disease, health",51901,7.13254
5,5.0,"medium, white, tweet, black, speech, president, people, racist, video, conservative",69844,9.598373
6,6.0,"bill, border, wall, migrant, legislation, funding, immigration, insurance, budget, plan",31489,4.327403
7,7.0,"military, attack, force, terrorist, troop, government, syrian, war, israeli, minister",37392,5.138628
8,8.0,"voter, poll, candidate, election, campaign, democratic, presidential, vote, republican, primary",46016,6.323789
9,9.0,"vaccine, virus, drug, coronavirus, patient, study, health, disease, death, doctor",12966,1.781864


Unnamed: 0,top_topic,topic_tokens,count,pct_total
0,0.0,"white, people, rush, black, thing, right, woman, american, medium, way",151975,20.885297
1,1.0,"game, team, player, fan, season, star, time, home, family, good",128025,17.593948
2,2.0,"company, agency, federal, state, administration, energy, government, technology, system, coal",45414,6.241059
3,3.0,"sexual, prison, court, woman, police, allegation, charge, prosecutor, case, assault",48538,6.670377
4,4.0,"market, company, bank, economy, rate, coronavirus, stock, price, file, file_photo",59091,8.120632
5,5.0,"campaign, candidate, democratic, presidential, election, republican, nominee, voter, primary, president",62932,8.648485
6,6.0,"investigation, impeachment, email, russian, president, campaign, counsel, information, report, election",53789,7.392
7,7.0,"military, attack, force, terrorist, troop, war, iranian, syrian, government, security",42484,5.838401
8,8.0,"user, company, medium, social_medium, platform, app, social, account, content, ad",20344,2.795792
9,9.0,"bill, tax, legislation, budget, spending, plan, healthcare, insurance, reform, taxis",17153,2.357266


### 2. Review documents per topic

In [7]:
# Per-document table for the 10-topic model: each row carries the document's
# top topic, that topic's percentage, the topic tokens, the doc tokens and doc_id.
dfd_10

Unnamed: 0,top_topic,top_topic_pct,topic_tokens,doc_tokens,doc_id
0,0.0,91.110001,"police, people, man, white, officer, black, at...","[federal, government, study, reparation, desce...",AmericanRenaissance_1128638341
1,0.0,88.419998,"police, people, man, white, officer, black, at...","[portrait, break, county, headquarters, vandal...",Breitbart_621129461
2,1.0,69.800003,"coronavirus, virus, health, game, pandemic, va...","[death, sharply, american, community, multinat...",Breitbart_1483020896
3,1.0,58.980000,"coronavirus, virus, health, game, pandemic, va...","[ride, giant, courier, service, lawsuit, brake...",Breitbart_1483567174
4,2.0,63.770000,"investigation, russian, election, email, campa...","[dark, money, network, life, nearly, taxpayer,...",AmericanRenaissance_1812166693
...,...,...,...,...,...
727660,0.0,56.650002,"police, people, man, white, officer, black, at...","[host, border, official, appalling, federal, l...",WashingtonExaminer_999923116
727661,0.0,94.099998,"police, people, man, white, officer, black, at...","[evening, struggle, civil, right, history, tim...",WashingtonExaminer_999923435
727662,2.0,56.059998,"investigation, russian, election, email, campa...","[cohost, recently, commander, chief, pornograp...",WashingtonExaminer_999951831
727663,2.0,90.060005,"investigation, russian, election, email, campa...","[federal, judge, person, special, counsel, inv...",WashingtonExaminer_999952161
