In [1]:
import ast
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm

In [2]:
# Folder holding the per-year topic exports. Keeping it in one place means
# relocating the data is a one-line change instead of four.
DATA_DIR = Path("D:\\Github_Repos\\dsa4264\\data")

# One CSV per year; each is loaded into its own frame.
df_2020 = pd.read_csv(DATA_DIR / "topics_2020.csv")
df_2021 = pd.read_csv(DATA_DIR / "topics_2021.csv")
df_2022 = pd.read_csv(DATA_DIR / "topics_2022.csv")
df_2023 = pd.read_csv(DATA_DIR / "topics_2023.csv")

In [3]:
# Preview the first rows of the 2020 frame to check which columns were loaded.
df_2020.head()

Unnamed: 0,text,yearmonth,title,index,average_toxicity_score,Topic,Count,Name,Representation,Representative_Docs
0,SUTD entry requirements is more or less the sa...,2020-01,sit suss or sutd,0,0.043735,29,213,29_ns_nsf_nsmen_nsfs,"['ns', 'nsf', 'nsmen', 'nsfs', 'nsrelated', 'n...","['So...what is ns....', 'ns', 'Ns']"
1,"With jam like this, it's barely worth it.",2020-01,how congested the ecp is on new years day,1,0.034306,132,64,132_pineapples_pineapple_pineappless_pineapple...,"['pineapples', 'pineapple', 'pineappless', 'pi...","['pineapple tarts are love, pineapple tarts ar..."
2,"I outside now, I guess reaching there around 0...",2020-01,whats the earliest timing i can visit the guan,2,0.077415,343,27,343_slow_slower_speed_faster,"['slow', 'slower', 'speed', 'faster', '24mph',...",['So slow reaction time???.cfm not paying atte...
3,The female or male cry?,2020-01,rsingapore random discussion and small questions,3,0.029896,303,30,303_cry_crying_cried_cries,"['cry', 'crying', 'cried', 'cries', 'tears', '...","[""You should be more worried if she doesn't cr..."
4,bring an umbrella! it gets really hot during t...,2020-01,what is there to prepare for a trip to st johns,4,0.042127,-1,35897,-1_being_really_than_about,"['being', 'really', 'than', 'about', 'im', 'wi...","[""It is good that you care about your mum but ..."


In [4]:
def _parse_topic_keywords(name):
    """Return the keyword list encoded in a topic ``Name`` value, or None if it is not a string."""
    if not isinstance(name, str):
        return None
    if name.startswith('[') and name.endswith(']'):
        # literal_eval only accepts Python literals, so text read from the CSV
        # can never be executed as code (the previous eval() could be).
        return ast.literal_eval(name)
    # Names look like "29_ns_nsf_nsmen_nsfs": the first "_"-separated piece is dropped.
    return name.split('_')[1:]


def _parse_representative_docs(value):
    """Return ``Representative_Docs`` as a list: parse strings, pass lists through, else []."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value if isinstance(value, list) else []


def extract_topic_features(dfs_dict):
    """
    Flatten the per-year frames into one frame of per-row topic records.

    Parameters
    ----------
    dfs_dict : dict
        Maps a year label to a DataFrame. Each row is read for the columns
        'Name', 'Representative_Docs', 'yearmonth', 'Topic' and
        'average_toxicity_score'; 'Count' is optional and defaults to 0.

    Returns
    -------
    pandas.DataFrame
        One record per input row that yields at least one non-blank keyword,
        with columns: year, yearmonth, topic_id, keywords (space-joined),
        representative_docs (list), count, toxicity_score.

    Notes
    -----
    * Rows whose 'Name' is not a string are skipped silently.
    * Rows that raise while being parsed are reported with a printed message
      and skipped, so one malformed cell does not abort the whole run.
    * No aggregation is performed here: 'toxicity_score' is copied from each
      row's 'average_toxicity_score' unchanged.
    """
    topic_data = []

    for year, df in dfs_dict.items():
        print(f"Processing {year}...")
        for _, row in df.iterrows():
            try:
                keywords = _parse_topic_keywords(row['Name'])
                if keywords is None:
                    continue

                rep_docs = _parse_representative_docs(row['Representative_Docs'])

                # Drop empty / whitespace-only keywords before joining.
                keyword_str = ' '.join([str(k) for k in keywords if k and str(k).strip()])

                if keyword_str:
                    topic_data.append({
                        'year': year,
                        'yearmonth': row['yearmonth'],
                        'topic_id': row['Topic'],
                        'keywords': keyword_str,
                        'representative_docs': rep_docs,
                        'count': row.get('Count', 0),
                        'toxicity_score': row['average_toxicity_score']
                    })
            except Exception as e:
                # Best effort by design: report the row and keep going.
                print(f"Error processing row in {year}: {e}")
                continue

    return pd.DataFrame(topic_data)

In [5]:
# Create dataset: year label -> frame loaded for that year
dfs_dict = dict(zip(
    ('2020', '2021', '2022', '2023'),
    (df_2020, df_2021, df_2022, df_2023),
))

# Print initial stats (row count and number of distinct Topic values per year)
print("\nInitial Dataset Statistics:")
for year, df in dfs_dict.items():
    print(f"{year}: {len(df)} rows, {df['Topic'].nunique()} unique topics")

# Flatten every yearly frame into a single table of topic records
topics_df = extract_topic_features(dfs_dict)
print(f"\nExtracted {len(topics_df)} topic entries")


Initial Dataset Statistics:
2020: 1302526 rows, 1126 unique topics
2021: 1361253 rows, 1329 unique topics
2022: 1042831 rows, 1074 unique topics
2023: 797719 rows, 1002 unique topics
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...

Extracted 4501485 topic entries


In [6]:
# Preview the first extracted topic records.
topics_df.head()

Unnamed: 0,year,yearmonth,topic_id,keywords,representative_docs,count,toxicity_score
0,2020,2020-01,29,ns nsf nsmen nsfs,"[So...what is ns...., ns, Ns]",213,0.043735
1,2020,2020-01,132,pineapples pineapple pineappless pineappleglazed,"[pineapple tarts are love, pineapple tarts are...",64,0.034306
2,2020,2020-01,343,slow slower speed faster,[So slow reaction time???.cfm not paying atten...,27,0.077415
3,2020,2020-01,303,cry crying cried cries,[You should be more worried if she doesn't cry...,30,0.029896
4,2020,2020-01,-1,being really than about,[It is good that you care about your mum but p...,35897,0.042127


In [7]:
def clean_topic_keywords(keywords_str):
    """Drop filler words and very short tokens from a space-separated keyword string.

    A token is kept when it is longer than two characters and its lower-cased
    form is not in the filler list below. Kept tokens retain their original
    casing and order and are re-joined with single spaces.
    """
    filler = {'than', 'being', 'how', 'will', 'what', 'why', 'when', 'who', 'where', 'about', 'really', 'very', 'just'}

    def is_informative(token):
        return len(token) > 2 and token.lower() not in filler

    return ' '.join(filter(is_informative, keywords_str.split()))

def get_meaningful_topics(topics_df, min_count=100, top_n_per_year=500):
    """
    Select the largest distinct topics per year.

    Steps: clean each row's 'keywords' with ``clean_topic_keywords``, keep rows
    whose cleaned keywords are non-blank and whose 'count' is at least
    ``min_count``, drop repeated (year, clean_keywords) pairs keeping the first
    occurrence, then keep the ``top_n_per_year`` rows with the largest 'count'
    within each year.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        Needs the columns 'keywords', 'count' and 'year'. It is not modified.
    min_count : int
        Minimum 'count' for a row to be kept.
    top_n_per_year : int
        Maximum number of rows kept per year.

    Returns
    -------
    pandas.DataFrame
        The selected rows plus a 'clean_keywords' column, ordered by year and
        then by descending 'count', with a fresh 0..n-1 index.
    """
    print("Processing meaningful topics...")

    # First, clean keywords. The result is held separately so the caller's
    # frame does not silently gain a column.
    clean = topics_df['keywords'].apply(clean_topic_keywords)

    # Remove topics with empty or non-meaningful keywords
    keep = (clean.str.strip() != '') & (topics_df['count'] >= min_count)
    filtered = topics_df[keep].assign(clean_keywords=clean[keep].to_numpy())
    print(f"After initial filtering: {len(filtered)} topics")

    # Remove duplicates based on year and clean_keywords
    filtered = filtered.drop_duplicates(subset=['year', 'clean_keywords'])
    print(f"After removing duplicates: {len(filtered)} topics")

    # Take top N most representative topics per year. Iterating the groups
    # (instead of groupby.apply) keeps the 'year' column regardless of how the
    # installed pandas treats grouping columns inside apply.
    per_year = [
        group.nlargest(top_n_per_year, 'count')
        for _, group in filtered.groupby('year')
    ]
    significant = (pd.concat(per_year) if per_year else filtered).reset_index(drop=True)

    print(f"Final number of significant topics: {len(significant)}")

    # Print some statistics
    print("\nTopics per year:")
    print(significant.groupby('year').size())

    return significant


In [8]:
def preprocess_topics(topics_df):
    """
    Normalise 'clean_keywords' and drop topics left with fewer than two words.

    Each keyword string is lower-cased, split on whitespace, de-duplicated,
    stripped of the generic terms listed below and re-joined in sorted order,
    so two topics with the same words in a different order compare equal.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        Needs a 'clean_keywords' column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows whose normalised keywords contain at least two words, with
        the normalised 'clean_keywords'. The original index labels are kept.
    """
    # More comprehensive stopwords
    generic_terms = {
        'singapore', 'singaporean', 'singaporeans', 'because', 'than', 'being', 'about',
        'really', 'very', 'just', 'like', 'get', 'got', 'much', 'many', 'more', 'also',
        'the', 'this', 'that', 'these', 'those', 'there', 'here', 'what', 'when', 'where',
        'who', 'why', 'how', 'which', 'way', 'not', 'can', 'will', 'would', 'should','her','she',
        'they', 'thank', 'thanks', 'thankyou', 'please', 'plz', 'help', 'need', 'want', 'wish',
        'yes','yess','nope'
    }

    def clean_keywords(text):
        # Normalize keywords
        words = set(text.lower().split())  # Convert to set to remove duplicates
        words = words - generic_terms  # Remove generic terms
        return ' '.join(sorted(words))  # Sort to make order irrelevant

    # Build a new frame rather than assigning into the argument, so re-running
    # the cell (or reusing the input elsewhere) sees unchanged data.
    normalised = topics_df.assign(
        clean_keywords=topics_df['clean_keywords'].apply(clean_keywords)
    )

    # Remove topics with too few meaningful words
    has_enough_words = normalised['clean_keywords'].str.split().str.len() >= 2
    return normalised[has_enough_words]

In [9]:
# Stage 1: per-year top topics; stage 2: keyword normalisation and short-topic removal.
significant_topics = get_meaningful_topics(topics_df, min_count=100, top_n_per_year=200)
meaningful_topics = preprocess_topics(significant_topics)

print("\nSample of meaningful topics:")
sample_columns = ['year', 'topic_id', 'clean_keywords', 'count']
largest_topics = meaningful_topics[sample_columns].sort_values('count', ascending=False).head(20)
print(largest_topics)

Processing meaningful topics...
After initial filtering: 3474389 topics
After removing duplicates: 3330 topics
Final number of significant topics: 800

Topics per year:
year
2020    200
2021    200
2022    200
2023    200
dtype: int64

Sample of meaningful topics:
     year  topic_id                        clean_keywords  count
0    2020        -1                        opposition pap  84568
200  2021        -1                            covid work  74628
202  2021        -1                       covid time work  64119
203  2021        -1                         chinese covid  61667
204  2021        -1                      covid vaccinated  60027
1    2020        -1                    government workers  58513
207  2021        -1  covid vaccinated vaccination vaccine  56142
3    2020        -1                              job work  53876
4    2020        -1                          job pay work  53387
9    2020         0                             dont work  44767
403  2022        -1 

In [10]:
def find_semantic_clusters(topics_df, min_similarity=0.6):
    """
    Group topics whose keyword strings are similar.

    The 'clean_keywords' strings are TF-IDF vectorised (unigrams and bigrams),
    each topic is linked to its nearest neighbours by cosine similarity, and
    communities are detected on the resulting graph with networkx's greedy
    modularity algorithm.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        Needs the columns 'clean_keywords', 'year' and 'count'.
    min_similarity : float
        An edge is added only when cosine similarity is strictly greater than
        this value.

    Returns
    -------
    tuple
        ``(communities, G, feature_names)``: the detected communities as sets
        of node ids, the graph, and the TF-IDF vocabulary. Node ids are row
        positions in ``topics_df`` (0..n-1), not index labels.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),  # Include bigrams
        min_df=2,  # Ignore terms that appear in fewer than 2 topics
        max_df=0.9  # Ignore terms that appear in more than 90% of topics
    )

    vectors = vectorizer.fit_transform(topics_df['clean_keywords'])

    # Query up to 10 neighbours per topic; the query point itself is normally
    # among them, leaving at most 9 real neighbours.
    n_topics = len(topics_df)
    n_query = min(10, n_topics)
    nn = NearestNeighbors(n_neighbors=n_query,
                          metric='cosine',
                          algorithm='brute')
    nn.fit(vectors)
    distances, indices = nn.kneighbors(vectors)

    G = nx.Graph()

    # One node per topic row, keyed by row position.
    node_attrs = topics_df[['year', 'clean_keywords', 'count']].itertuples(index=False, name=None)
    for i, (year, keywords, count) in enumerate(node_attrs):
        G.add_node(i, year=year, keywords=keywords, count=count)

    for i in range(n_topics):
        # When several topics share an identical vector they all sit at
        # distance 0, so the query point is not guaranteed to be returned
        # first. Filter it out by id instead of blindly skipping position 0,
        # which could drop a real neighbour and add a self-loop.
        others = [
            (int(j), float(dist))
            for j, dist in zip(indices[i], distances[i])
            if j != i
        ][:n_query - 1]

        for j, dist in others:
            similarity = 1 - dist
            if similarity > min_similarity:
                G.add_edge(i, j, weight=similarity)

    communities = nx.community.greedy_modularity_communities(G)

    return communities, G, vectorizer.get_feature_names_out()

In [11]:
# Cluster the filtered topics. min_similarity=0.5 is passed explicitly here,
# overriding the 0.6 default declared by find_semantic_clusters.
communities, G, feature_names = find_semantic_clusters(meaningful_topics, min_similarity=0.5)

In [19]:
def analyze_semantic_clusters(topics_df, communities):
    """
    Summarise every community of two or more topics.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        Needs the columns 'clean_keywords', 'yearmonth', 'year', 'count' and
        'toxicity_score'. Community members are used as row *positions*
        (``iloc``) into this frame.
    communities : iterable of collections of int
        Groups of row positions; groups with fewer than two members are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per kept community with the columns cluster_id, size,
        total_posts, unique_keywords, topic_diversity, temporal_evolution,
        toxicity_evolution, avg_toxicity, sample_topics, topic_indices.

    Notes
    -----
    The nested ``temporal_evolution`` / ``toxicity_evolution`` dicts hold plain
    Python ``int`` / ``float`` values (not NumPy scalars), so their string form
    stays a plain literal when the frame is written to CSV.
    """
    cluster_analysis = []

    for cluster_id, community in enumerate(communities):
        if len(community) < 2:
            continue

        member_positions = [int(pos) for pos in community]
        cluster_topics = topics_df.iloc[member_positions]

        # Distinct words across all member topics.
        keywords = ' '.join(cluster_topics['clean_keywords'])
        unique_keywords = set(keywords.split())

        # .tolist() yields native Python values, so year keys are plain str/int.
        years = cluster_topics['year'].unique().tolist()

        # Summed 'count' per yearmonth, as a plain dict of native ints.
        monthly_sum = cluster_topics.groupby('yearmonth')['count'].sum()
        monthly_counts = dict(zip(monthly_sum.index.tolist(), monthly_sum.tolist()))

        # Every calendar month of every year present gets an entry; months
        # with no matching 'yearmonth' value report 0.
        temporal_evolution = {
            year: {
                f"{year}-{month:02d}": {
                    'post_count': monthly_counts.get(f"{year}-{month:02d}", 0)
                }
                for month in range(1, 13)
            }
            for year in years
        }

        # Per-month mean of 'toxicity_score' and number of rows behind that mean.
        toxicity_monthly = cluster_topics.groupby('yearmonth')['toxicity_score'].agg(['mean', 'count']).round(4)
        toxicity_rows = list(zip(
            toxicity_monthly.index.tolist(),
            toxicity_monthly.index.astype(str),
            toxicity_monthly['mean'].tolist(),
            toxicity_monthly['count'].tolist(),
        ))
        # NOTE(review): 'post_count' here is the number of topic rows that
        # contributed to the mean, whereas 'post_count' in temporal_evolution
        # is the summed 'count' column. The key is kept for compatibility with
        # already-saved outputs; confirm which meaning downstream code expects.
        toxicity_evolution = {
            year: {
                month: {
                    'avg_toxicity': mean_score,
                    'post_count': n_rows
                }
                for month, month_label, mean_score, n_rows in toxicity_rows
                # str(year) lets numeric year labels work as a prefix too.
                if month_label.startswith(str(year))
            }
            for year in years
        }

        # Distinct keywords per member topic.
        topic_diversity = len(unique_keywords) / len(community)

        cluster_analysis.append({
            'cluster_id': cluster_id,
            'size': len(community),
            'total_posts': cluster_topics['count'].sum(),
            'unique_keywords': list(unique_keywords),
            'topic_diversity': topic_diversity,
            'temporal_evolution': temporal_evolution,
            'toxicity_evolution': toxicity_evolution,
            'avg_toxicity': cluster_topics['toxicity_score'].mean(),
            'sample_topics': cluster_topics.nlargest(3, 'count')['clean_keywords'].tolist(),
            'topic_indices': member_positions
        })

    return pd.DataFrame(cluster_analysis)

In [20]:
# Summarise each detected community (groups with fewer than two topics are skipped).
cluster_analysis = analyze_semantic_clusters(meaningful_topics, communities)

In [21]:
# Preview the first cluster summaries.
cluster_analysis.head()

Unnamed: 0,cluster_id,size,total_posts,unique_keywords,topic_diversity,temporal_evolution,toxicity_evolution,avg_toxicity,sample_topics,topic_indices
0,0,22,141317,"[viruses, covids, quarantines, coronavirus, si...",1.227273,"{'2021': {'2021-01': {'post_count': 0}, '2021-...","{'2021': {'2021-05': {'avg_toxicity': 0.0441, ...",0.050925,"[covid vaccinated, covid vaccinated vaccinatio...","[192, 321, 66, 195, 514, 388, 637, 653, 529, 8..."
1,1,20,14679,"[hdb, asset, sale, sell, rents, mortgage, bloc...",1.35,"{'2021': {'2021-01': {'post_count': 384}, '202...","{'2021': {'2021-01': {'avg_toxicity': 0.0461, ...",0.086916,"[bto btos hdb housing, btos flats hdb valuatio...","[194, 707, 196, 519, 712, 649, 703, 278, 603, ..."
2,2,17,9207,"[sgs, sgsecure, sgreans, sgd, sgraw, sgean, sg...",0.764706,"{'2022': {'2022-01': {'post_count': 0}, '2022-...","{'2022': {'2022-07': {'avg_toxicity': 0.0595, ...",0.049124,"[rsg sgraw sgs, sgeans sgraw sgs, sgraw sgs sg...","[481, 389, 425, 201, 75, 409, 587, 655, 559, 6..."
3,3,15,8096,"[bikers, cyclists, cyclist, motorcyclists, ped...",1.133333,"{'2023': {'2023-01': {'post_count': 291}, '202...","{'2023': {'2023-01': {'avg_toxicity': 0.0416, ...",0.056845,"[driving pedestrian pedestrians traffic, drivi...","[577, 70, 264, 9, 235, 14, 591, 497, 689, 52, ..."
4,4,12,7217,"[masked, mask, maskless, masking, maskoff, mas...",0.75,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-08': {'avg_toxicity': 0.0425, ...",0.086982,"[mask masking maskoff masks, mask maskless mas...","[128, 385, 223, 132, 267, 109, 302, 210, 406, ..."


In [22]:
# Ten clusters with the highest total_posts, largest first.
cluster_analysis.sort_values('total_posts', ascending=False).head(10)

Unnamed: 0,cluster_id,size,total_posts,unique_keywords,topic_diversity,temporal_evolution,toxicity_evolution,avg_toxicity,sample_topics,topic_indices
56,56,3,183014,"[covid, time, work]",1.0,"{'2021': {'2021-01': {'post_count': 0}, '2021-...","{'2021': {'2021-09': {'avg_toxicity': 0.0539, ...",0.04237,"[covid work, covid time work, covid work]","[184, 369, 183]"
42,42,4,150279,"[employment, jobs, pay, work, job]",1.25,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-02': {'avg_toxicity': 0.0287, ...",0.039404,"[job work, job pay work, job work]","[2, 3, 5, 22]"
0,0,22,141317,"[viruses, covids, quarantines, coronavirus, si...",1.227273,"{'2021': {'2021-01': {'post_count': 0}, '2021-...","{'2021': {'2021-05': {'avg_toxicity': 0.0441, ...",0.050925,"[covid vaccinated, covid vaccinated vaccinatio...","[192, 321, 66, 195, 514, 388, 637, 653, 529, 8..."
79,79,2,82375,"[people, dont, work]",1.5,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-02': {'avg_toxicity': 0.0393, ...",0.033578,"[dont work, dont people]","[4, 6]"
9,9,10,65735,"[mandarin, zhongguo, translation, korean, cant...",2.1,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-06': {'avg_toxicity': 0.2484, ...",0.055862,"[chinese covid, cantonese chinese dialects man...","[41, 681, 44, 237, 304, 82, 53, 185, 442, 668]"
54,54,3,59198,"[ndp, governments, govt, ndps, workers, gov, c...",2.666667,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-04': {'avg_toxicity': 0.0356, ...",0.039268,"[government workers, gov government government...","[1, 282, 334]"
1,1,20,14679,"[hdb, asset, sale, sell, rents, mortgage, bloc...",1.35,"{'2021': {'2021-01': {'post_count': 384}, '202...","{'2021': {'2021-01': {'avg_toxicity': 0.0461, ...",0.086916,"[bto btos hdb housing, btos flats hdb valuatio...","[194, 707, 196, 519, 712, 649, 703, 278, 603, ..."
2,2,17,9207,"[sgs, sgsecure, sgreans, sgd, sgraw, sgean, sg...",0.764706,"{'2022': {'2022-01': {'post_count': 0}, '2022-...","{'2022': {'2022-07': {'avg_toxicity': 0.0595, ...",0.049124,"[rsg sgraw sgs, sgeans sgraw sgs, sgraw sgs sg...","[481, 389, 425, 201, 75, 409, 587, 655, 559, 6..."
3,3,15,8096,"[bikers, cyclists, cyclist, motorcyclists, ped...",1.133333,"{'2023': {'2023-01': {'post_count': 291}, '202...","{'2023': {'2023-01': {'avg_toxicity': 0.0416, ...",0.056845,"[driving pedestrian pedestrians traffic, drivi...","[577, 70, 264, 9, 235, 14, 591, 497, 689, 52, ..."
4,4,12,7217,"[masked, mask, maskless, masking, maskoff, mas...",0.75,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-08': {'avg_toxicity': 0.0425, ...",0.086982,"[mask masking maskoff masks, mask maskless mas...","[128, 385, 223, 132, 267, 109, 302, 210, 406, ..."


In [23]:
# Twenty clusters with the highest avg_toxicity, most toxic first.
cluster_analysis.sort_values('avg_toxicity', ascending=False).head(20)

Unnamed: 0,cluster_id,size,total_posts,unique_keywords,topic_diversity,temporal_evolution,toxicity_evolution,avg_toxicity,sample_topics,topic_indices
99,99,2,1118,"[cops, policeman, arrest, police, reports]",2.5,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-12': {'avg_toxicity': 0.0339, ...",0.216664,"[arrest cops police policeman, arrest cops pol...","[26, 285]"
142,142,2,797,"[footballing, football, footballers, footballer]",2.0,"{'2023': {'2023-01': {'post_count': 273}, '202...","{'2023': {'2023-01': {'avg_toxicity': 0.2918, ...",0.175611,"[football footballer footballers footballing, ...","[705, 234]"
141,141,2,1176,"[marijuana, cannabis, drugs, weed]",2.0,"{'2023': {'2023-01': {'post_count': 0}, '2023-...","{'2023': {'2023-05': {'avg_toxicity': 0.2422, ...",0.173767,"[cannabis drugs marijuana weed, cannabis drugs...","[563, 686]"
59,59,3,2486,"[crimea, ukraines, ukraine, nato]",1.333333,"{'2022': {'2022-01': {'post_count': 0}, '2022-...","{'2022': {'2022-02': {'avg_toxicity': 0.031, '...",0.149231,"[crimea nato ukraine ukraines, crimea nato ukr...","[416, 428, 374]"
134,134,2,1541,"[role, nathan, tharman, tharmans, him, tharma]",3.0,"{'2023': {'2023-01': {'post_count': 0}, '2023-...","{'2023': {'2023-06': {'avg_toxicity': 0.0382, ...",0.141764,"[him role tharman tharmans, nathan tharma thar...","[651, 558]"
125,125,2,1173,"[funny, jokes, laughed, humor, comedic, joke]",3.0,"{'2021': {'2021-01': {'post_count': 0}, '2021-...","{'2021': {'2021-05': {'avg_toxicity': 0.041, '...",0.129979,"[comedic humor joke jokes, funny joke jokes la...","[306, 566]"
31,31,6,3953,"[homosexual, repeal, homosexuality, transgende...",2.0,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-06': {'avg_toxicity': 0.0356, ...",0.123047,"[homosexuality lgbt lgbtq repeal, homophobic h...","[96, 530, 371, 404, 40, 734]"
130,130,2,989,"[filming, cctv, cctvs, camera, consent, camera...",3.5,"{'2023': {'2023-01': {'post_count': 0}, '2023-...","{'2023': {'2023-08': {'avg_toxicity': 0.0618, ...",0.120415,"[camera cameras cctv cctvs, camera consent fil...","[616, 590]"
28,28,6,3338,"[wage, payouts, pays, income, unemployment, sa...",1.666667,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-02': {'avg_toxicity': 0.0497, ...",0.11157,"[income unemployment wage wages, payouts pays ...","[125, 646, 71, 8, 300, 461]"
61,61,3,1708,"[secular, christianity, religion, religious, r...",1.666667,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-10': {'avg_toxicity': 0.203, '...",0.107257,"[religion religions religious secular, christi...","[73, 386, 457]"


In [24]:
# Text report for the ten clusters with the most posts
top_by_posts = cluster_analysis.sort_values('total_posts', ascending=False).head(10)
separator = "-" * 50

print("\nMeaningful Topic Clusters:")
for _, cluster in top_by_posts.iterrows():
    keyword_list = ', '.join(sorted(cluster['unique_keywords']))
    print(f"\nCluster {cluster['cluster_id']}")
    print(f"Size: {cluster['size']} topics")
    print(f"Total posts: {cluster['total_posts']:,}")
    print(f"Topic diversity: {cluster['topic_diversity']:.2f}")
    print("Unique keywords:", keyword_list)
    print("Sample topics:", cluster['sample_topics'])
    print("Temporal evolution:", cluster['temporal_evolution'])
    print(separator)


Meaningful Topic Clusters:

Cluster 56
Size: 3 topics
Total posts: 183,014
Topic diversity: 1.00
Unique keywords: covid, time, work
Sample topics: ['covid work', 'covid time work', 'covid work']
Temporal evolution: {'2021': {'2021-01': {'post_count': 0}, '2021-02': {'post_count': 0}, '2021-03': {'post_count': 0}, '2021-04': {'post_count': 0}, '2021-05': {'post_count': 0}, '2021-06': {'post_count': 0}, '2021-07': {'post_count': 0}, '2021-08': {'post_count': 0}, '2021-09': {'post_count': 64119}, '2021-10': {'post_count': 74628}, '2021-11': {'post_count': 0}, '2021-12': {'post_count': 0}}, '2022': {'2022-01': {'post_count': 0}, '2022-02': {'post_count': 44267}, '2022-03': {'post_count': 0}, '2022-04': {'post_count': 0}, '2022-05': {'post_count': 0}, '2022-06': {'post_count': 0}, '2022-07': {'post_count': 0}, '2022-08': {'post_count': 0}, '2022-09': {'post_count': 0}, '2022-10': {'post_count': 0}, '2022-11': {'post_count': 0}, '2022-12': {'post_count': 0}}}
-------------------------------

In [25]:
# save data
# The path is built with Path so the separators are consistent (the previous
# literal mixed "\\" and "/"). List and dict columns are written to the CSV as
# their string form.
clusters_csv = Path("D:\\Github_Repos\\dsa4264\\data") / "topic_clusters.csv"
cluster_analysis.to_csv(clusters_csv, index=False)

In [27]:
# Reload the saved cluster summary. The path is built with Path so the
# separators are consistent (the previous literal mixed "\\" and "/").
# List and dict columns come back as plain strings, as cluster.head() shows.
clusters_csv = Path("D:\\Github_Repos\\dsa4264\\data") / "topic_clusters.csv"
cluster = pd.read_csv(clusters_csv)

In [28]:
# Check the round trip: preview the reloaded frame.
cluster.head()

Unnamed: 0,cluster_id,size,total_posts,unique_keywords,topic_diversity,temporal_evolution,toxicity_evolution,avg_toxicity,sample_topics,topic_indices
0,0,22,141317,"['viruses', 'covids', 'quarantines', 'coronavi...",1.227273,"{'2021': {'2021-01': {'post_count': 0}, '2021-...","{'2021': {'2021-05': {'avg_toxicity': 0.0441, ...",0.050925,"['covid vaccinated', 'covid vaccinated vaccina...","[192, 321, 66, 195, 514, 388, 637, 653, 529, 8..."
1,1,20,14679,"['hdb', 'asset', 'sale', 'sell', 'rents', 'mor...",1.35,"{'2021': {'2021-01': {'post_count': 384}, '202...","{'2021': {'2021-01': {'avg_toxicity': 0.0461, ...",0.086916,"['bto btos hdb housing', 'btos flats hdb valua...","[194, 707, 196, 519, 712, 649, 703, 278, 603, ..."
2,2,17,9207,"['sgs', 'sgsecure', 'sgreans', 'sgd', 'sgraw',...",0.764706,"{'2022': {'2022-01': {'post_count': 0}, '2022-...","{'2022': {'2022-07': {'avg_toxicity': 0.0595, ...",0.049124,"['rsg sgraw sgs', 'sgeans sgraw sgs', 'sgraw s...","[481, 389, 425, 201, 75, 409, 587, 655, 559, 6..."
3,3,15,8096,"['bikers', 'cyclists', 'cyclist', 'motorcyclis...",1.133333,"{'2023': {'2023-01': {'post_count': 291}, '202...","{'2023': {'2023-01': {'avg_toxicity': 0.0416, ...",0.056845,"['driving pedestrian pedestrians traffic', 'dr...","[577, 70, 264, 9, 235, 14, 591, 497, 689, 52, ..."
4,4,12,7217,"['masked', 'mask', 'maskless', 'masking', 'mas...",0.75,"{'2020': {'2020-01': {'post_count': 0}, '2020-...","{'2020': {'2020-08': {'avg_toxicity': 0.0425, ...",0.086982,"['mask masking maskoff masks', 'mask maskless ...","[128, 385, 223, 132, 267, 109, 302, 210, 406, ..."
