In [None]:
import re  # 746 used to be doc count
from collections import deque

import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from generic_db import GenericDBOperations

In [2]:
# Notebook-wide pandas display setting: turn off `display.expand_frame_repr`.
pd.options.display.expand_frame_repr = False
# Optional: uncomment to stop pandas from truncating long cell text.
# pd.options.display.max_colwidth = None

In [3]:
# Project-local database helper; the query cells below call its
# execute_query(query=..., fetch_all=True) to run raw SQL.
generic_db = GenericDBOperations()

In [4]:
# Load every row of the `comments` table into a DataFrame.
# The SQL-side null/empty/[deleted] filter is intentionally left commented out,
# so those rows are still present in `comments_all`.
# NOTE(review): assumes execute_query returns rows as sequences in SELECT order,
# so `comments_cols` must list the columns in the same order as the query.
comments_cols = ['comment_id', 'body', 'parent_id', 'submission_id', 'score']
q = '''
    select comment_id, body, parent_id, submission_id, score
    from comments
    -- where body is not null and body != '' and body != '[deleted]'
    '''
comments_all = pd.DataFrame(
    data=generic_db.execute_query(query=q, fetch_all=True),
    columns=comments_cols,
)
comments_all

Unnamed: 0,comment_id,body,parent_id,submission_id,score
0,t1_jt9ubb9,Air conditioning now? Im not sacrificing a sin...,t3_158fnlf,t3_158fnlf,99
1,t1_jta03kq,"Without air conditioner, it would be worse. Ac...",t3_158fnlf,t3_158fnlf,22
2,t1_jt9z2i7,>A man uses his mobile phone as he sits amid o...,t3_158fnlf,t3_158fnlf,40
3,t1_jt9o0uh,Shouldnt the duck curve of solar work fairly f...,t3_158fnlf,t3_158fnlf,15
4,t1_jta7c5o,Air conditioning was literally voted the numbe...,t3_158fnlf,t3_158fnlf,10
...,...,...,...,...,...
31121,t1_jgo7rx4,"Thanks, missed that.",t1_jgo7orm,t3_12pzg2b,1
31122,t1_jh405ve,Did you ever get your Creedence tapes back?,t1_jgpgbvx,t3_12pzg2b,2
31123,t1_jgqkkrz,+1 for Howl.\n\nTheyre doing some of best thin...,t1_jgp097g,t3_12pzg2b,5
31124,t1_jgpaefn,Howl rules.,t1_jgp097g,t3_12pzg2b,4


In [5]:
# Load every submission together with the display name of its subreddit.
# The LEFT JOIN keeps submissions whose subreddit_id has no match in `subreddits`
# (their `subreddit_name` is then null).
# NOTE(review): assumes execute_query returns rows as sequences in SELECT order,
# so `submissions_cols` must list the columns in the same order as the query.
submissions_cols = ['submission_id', 'title', 'selftext', 'subreddit_name', 'score']
q = '''
    select submissions.submission_id, submissions.title, submissions.selftext, subreddits.display_name, submissions.score
    from submissions
    left join subreddits
    on submissions.subreddit_id = subreddits.subreddit_id
    -- where body is not null and body != '' and body != '[deleted]'
    '''
submissions_all = pd.DataFrame(
    data=generic_db.execute_query(query=q, fetch_all=True),
    columns=submissions_cols,
)
submissions_all

Unnamed: 0,submission_id,title,selftext,subreddit_name,score
0,t3_158fnlf,Why air conditioners can be a problematic solu...,,canada,0
1,t3_14iqqjb,Grasshoppers threaten to devour Alberta crops ...,,canada,186
2,t3_13h9pua,Canada: extreme ‘heat dome’ temperatures set t...,,canada,92
3,t3_15ln2au,Ontario proposing new heat stress regulation t...,,canada,108
4,t3_15tp7b9,B.C. officials warn of extreme fire behaviour ...,,canada,21
...,...,...,...,...,...
603,t3_11bldoe,Insane new band out of Victoria - The Bankes B...,It’s on all streaming platforms! Here’s there ...,VictoriaBC,0
604,t3_15s6r0n,Best beaches for the hot weather turn,If youre like us and you dont have AC in your ...,VictoriaBC,2
605,t3_13u41ao,"Choosing to believe the weather network, I’m a...",,VictoriaBC,0
606,t3_14xodmu,Every Monday at six Classic cars and Hot Rods ...,,VictoriaBC,8


In [6]:
class Node:
    """One post in a discussion tree: the submission itself or a single comment.

    Public fields:
        id:       identifier of the post.
        text:     text of the post.
        sub:      label of the community the post belongs to (the notebook
                  passes the subreddit name).
        score:    score of the post.
        children: list of child ``Node`` objects, empty on construction.
    """

    def __init__(self, id, text, sub, score):
        self.id, self.text = id, text
        self.sub, self.score = sub, score
        self.children = []

    def __repr__(self):
        # Compact label: the `sub` field plus the first 50 characters of the text.
        return f'{self.sub}: {self.text[:50]}'

    # str() and repr() share a single implementation.
    __str__ = __repr__

class Tree:
    """Discussion tree rooted at one submission, with lookup of nodes by id.

    Attributes:
        root:  the ``Node`` created for the submission.
        nodes: dict mapping node id -> ``Node`` for every node attached so far,
               used to find a comment's parent without walking the tree.
    """

    def __init__(self, root_id, text, sub, score):
        self.root = Node(root_id, text, sub, score)
        self.nodes = {root_id: self.root}

    def add_comment(self, id, parent_id, text, sub, score):
        """Attach a new node under the node whose id is ``parent_id``.

        Returns:
            True if the node was attached; False if ``parent_id`` is not in the
            tree, in which case the comment is dropped (as before) but the
            caller can now detect it.
        """
        parent_node = self.nodes.get(parent_id)
        # Explicit None check: a found parent must never be skipped because of
        # how the node object happens to evaluate in a boolean context.
        if parent_node is None:
            return False
        new_node = Node(id, text, sub, score)
        parent_node.children.append(new_node)
        self.nodes[id] = new_node
        return True

    def bfs_traversal(self):
        """Return all nodes in breadth-first order, starting with the root.

        Siblings keep their insertion order.
        """
        results = []
        # deque.popleft() is O(1); list.pop(0) shifts the whole list on every pop.
        queue = deque([self.root])
        while queue:
            current_node = queue.popleft()
            results.append(current_node)
            queue.extend(current_node.children)
        return results

    def dfs_traversal(self, node=None):
        """Print node ids in depth-first pre-order, starting at ``node``.

        Args:
            node: start node; defaults to the tree root.
        """
        # Explicit stack instead of recursion, so a very deep reply chain
        # cannot hit Python's recursion limit.
        stack = [self.root if node is None else node]
        while stack:
            current_node = stack.pop()
            print(current_node.id)
            # Push children reversed so the first child is visited first.
            stack.extend(reversed(current_node.children))

In [7]:
def clean_text(text):
    """Flatten a Reddit title/body into a single line of plain text.

    Steps, in order: newlines become spaces, markdown links ``[label](url)``
    are reduced to their label, bare http(s) URLs are removed, the literal
    placeholder ``[deleted]`` is blanked, and finally every run of whitespace
    is collapsed to one space. The result is not stripped, so a single
    leading or trailing space can remain.

    Args:
        text: the string to clean (must be a ``str``; callers wrap with str()).

    Returns:
        The cleaned string.
    """
    # Remove newline breaks (also lets the link pattern below span former lines).
    text = text.replace('\n', ' ')
    # Replace markdown style links [label](http://url) with just the label.
    text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)
    # Remove URLs. Note that `$-_` inside the character class is a range, so the
    # match also swallows characters such as ')' and ']' that follow a URL.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Blank out the "[deleted]" placeholder. This must happen before the
    # whitespace collapse, otherwise it reintroduces runs of spaces.
    text = text.replace("[deleted]", " ")
    # Collapse every run of whitespace into a single space (kept as the last step).
    text = re.sub(r'\s+', ' ', text)
    return text

In [8]:
sample = ''' Im Tara Carman, an investigative journalist for CBC News and for eight weeks over the summer, I led a project to see how hot it got in Canadian homes without AC or cooling. AMA! In our Urban Heat investigation, we found people living at temperatures experts consider dangerous for some people, sometimes for days or weeks at a time.

There isnt much data in Canada about how hot it gets indoors. But we know indoor heat can be deadly — almost all the 619 people who died during BCs summer 2021 heat dome died inside. So we set out to collect that data, putting 50 temperature and humidity sensors in the homes of people in five cities (Vancouver, Winnipeg, Toronto, Windsor and Montreal) with little or no central cooling. They took measurements every 10 minutes. 

We found some people got no relief at night, with temperatures peaking on average at 7 pm and taking several hours to cool down. For some, the health effects, both physical and mental, were significant. And one of our participants, an 88-year-old man in B.C., did not survive the summer.

Proof: [https://x.com/CBCNews/status/1706706637715546472?s=20](https://x.com/CBCNews/status/1706706637715546472?s=20) 

Read our full investigation here: [https://www.cbc.ca/newsinteractives/features/no-escape-from-the-heat](https://www.cbc.ca/newsinteractives/features/no-escape-from-the-heat)

Listen to the What on Earth podcast (Sept. 10) here: [https://www.cbc.ca/listen/live-radio/1-429-what-on-earth/clip/16008365-cbc-tracked-heat-dozens-homes.-heres-learned](https://www.cbc.ca/listen/live-radio/1-429-what-on-earth/clip/16008365-cbc-tracked-heat-dozens-homes.-heres-learned) This year I wanted to install a heat pump to get some cooling in my house. When I checked the Federal govt rebate site, it told me that because I wanted to do the work in 0-3 months (credit card in hand, lets do this!), I probably wouldnt qualify as the process takes about a year to get approved! Has there been any investigation as to why this process takes so long? If climate change really is an emergency, why is the government grant system so painfully slow? Can you speak to any mitigation strategies some of your participants might have used (cooling fans, evaporation fans, etc.) and how effective they might have been? How hot did it get in peoples homes without AC or cooling? I am sorry. What was the purpose of this investigation? Is there something in the building code that needs to be changed? Why does this obvious fact require an investigative report?'''
sample

' Im Tara Carman, an investigative journalist for CBC News and for eight weeks over the summer, I led a project to see how hot it got in Canadian homes without AC or cooling. AMA! In our Urban Heat investigation, we found people living at temperatures experts consider dangerous for some people, sometimes for days or weeks at a time.\n\nThere isnt much data in Canada about how hot it gets indoors. But we know indoor heat can be deadly — almost all the 619 people who died during BCs summer 2021 heat dome died inside. So we set out to collect that data, putting 50 temperature and humidity sensors in the homes of people in five cities (Vancouver, Winnipeg, Toronto, Windsor and Montreal) with little or no central cooling. They took measurements every 10 minutes. \n\nWe found some people got no relief at night, with temperatures peaking on average at 7 pm and taking several hours to cool down. For some, the health effects, both physical and mental, were significant. And one of our participan

In [9]:
clean_text(sample)

' Im Tara Carman, an investigative journalist for CBC News and for eight weeks over the summer, I led a project to see how hot it got in Canadian homes without AC or cooling. AMA! In our Urban Heat investigation, we found people living at temperatures experts consider dangerous for some people, sometimes for days or weeks at a time. There isnt much data in Canada about how hot it gets indoors. But we know indoor heat can be deadly — almost all the 619 people who died during BCs summer 2021 heat dome died inside. So we set out to collect that data, putting 50 temperature and humidity sensors in the homes of people in five cities (Vancouver, Winnipeg, Toronto, Windsor and Montreal) with little or no central cooling. They took measurements every 10 minutes. We found some people got no relief at night, with temperatures peaking on average at 7 pm and taking several hours to cool down. For some, the health effects, both physical and mental, were significant. And one of our participants, an 

In [10]:
def _text_or_empty(value):
    """Return ``value`` as ``str``, mapping a missing value (None/NaN) to ''.

    A plain ``str(value)`` would turn a missing title/selftext/body into the
    literal word 'None' or 'nan', which would then leak into the documents.
    """
    return '' if pd.isna(value) else str(value)


# Bot filter, computed once for all submissions (it does not depend on the
# submission). `na=False` treats a null body as "not a bot"; without it a single
# null body makes the `~` negation fail. `regex=False` matches the text literally.
is_bot_comment = comments_all['body'].str.contains('am a bot', na=False, regex=False)
human_comments = comments_all[~is_bot_comment]

# Build one Tree per submission: root = title + selftext, children = comments.
# NOTE(review): only comments whose parent_id equals the submission id are
# selected, so replies to other comments (parent_id like 't1_...') never enter
# the tree — confirm that restricting to top-level comments is intended.
roots = []
for submission in submissions_all.itertuples(index=False):
    subreddit_name = str(submission.subreddit_name)
    # The 'Quebec' subreddit is excluded from the analysis.
    if subreddit_name == 'Quebec':
        continue

    submission_id = str(submission.submission_id)
    title = clean_text(_text_or_empty(submission.title))
    selftext = clean_text(_text_or_empty(submission.selftext))
    tree = Tree(submission_id, f'{title} {selftext}', subreddit_name, int(submission.score))

    children = human_comments[human_comments['parent_id'] == submission_id]
    for comment in children.itertuples(index=False):
        tree.add_comment(
            str(comment.comment_id),
            str(comment.parent_id),
            clean_text(_text_or_empty(comment.body)),
            subreddit_name,
            int(comment.score),
        )
    roots.append(tree)

In [11]:
len(roots)

607

In [12]:
# Word budget (whitespace-separated tokens) for one packed document.
# NOTE(review): presumably chosen to fit the embedding model's input window —
# confirm; words are not the same unit as model tokens.
MAX_CHUNK_WORDS = 384


def chunk_tree(tree, max_words=MAX_CHUNK_WORDS):
    """Pack the posts of one tree (breadth-first order) into documents.

    Consecutive posts are concatenated while the running word count stays
    within ``max_words``. A single post of ``max_words`` words or more is never
    merged: any pending buffer is emitted first, then the post on its own.

    Args:
        tree: object exposing ``bfs_traversal()`` that yields posts with
            ``text`` and ``score`` attributes.
        max_words: word budget per packed document.

    Returns:
        List of ``(text, total_score, n_posts)`` tuples, in emission order.
    """
    chunks = []
    buffer_text, buffer_words, buffer_score, buffer_posts = '', 0, 0, 0

    for post in tree.bfs_traversal():
        n_words = len(post.text.split())

        if n_words >= max_words:
            # Oversized post: flush whatever is pending, then emit it alone.
            if buffer_text:
                chunks.append((buffer_text, buffer_score, buffer_posts))
            chunks.append((post.text, post.score, 1))
            buffer_text, buffer_words, buffer_score, buffer_posts = '', 0, 0, 0
        elif buffer_words + n_words <= max_words:
            # Still fits. Parts are joined with a space, so the running word
            # count equals len(buffer_text.split()) without re-splitting.
            buffer_text += f' {post.text}'
            buffer_words += n_words
            buffer_score += post.score
            buffer_posts += 1
        else:
            # Would overflow: emit the buffer and start a new one with this post.
            chunks.append((buffer_text, buffer_score, buffer_posts))
            buffer_text, buffer_words = post.text, n_words
            buffer_score, buffer_posts = post.score, 1

    if buffer_text:
        chunks.append((buffer_text, buffer_score, buffer_posts))
    return chunks


# Parallel lists, one entry per packed document:
#   documents  - the text handed to the topic model
#   subs       - `sub` of the tree root the document came from
#   scores     - SUM of the scores of the posts in the document
#   doc_counts - number of posts packed into the document
documents = []
subs = []
scores = []
doc_counts = []
for tree in roots:
    for chunk_text, chunk_score, chunk_posts in chunk_tree(tree):
        documents.append(chunk_text)
        subs.append(tree.root.sub)
        scores.append(chunk_score)
        doc_counts.append(chunk_posts)

In [13]:
len(documents)

1352

In [14]:
len(subs)

1352

In [15]:
len(scores)

1352

In [16]:
len(doc_counts)

1352

In [17]:
np.mean([len(doc.split()) for doc in documents])

270.5687869822485

In [18]:
# Topic model: English stop words are removed in the bag-of-words step, and the
# c-TF-IDF step is asked to down-weight frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(
    top_n_words=20,
    nr_topics="auto",
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
)

# NOTE(review): no random seed is fixed anywhere in this notebook, so the topics
# found here are presumably not reproducible run-to-run — confirm and pin a seed
# on the dimensionality-reduction model if stable topics are required.
topic_model.fit_transform(documents)

# One column per topic id; each cell is a (word, weight) pair.
top_words = pd.DataFrame.from_dict(topic_model.get_topics())
# One row per input document, with its assigned topic and related fields.
doc_info = topic_model.get_document_info(documents)

In [19]:
# Persist the per-topic word table next to the notebook (plain string: the
# f-prefix had no placeholders to format).
top_words.to_csv('./top_words.csv')

In [20]:
# Attach per-document metadata to the topic-model output. `scores` holds the
# summed score of the posts packed into each document, so dividing by the
# number of posts gives the average score per post.
doc_info['sub'] = subs
doc_info['avg_score'] = (
    pd.Series(scores, index=doc_info.index)
    / pd.Series(doc_counts, index=doc_info.index)
)
doc_info['doc_count'] = doc_counts

In [21]:
doc_info.to_csv('doc_info.csv')

In [22]:
top_words

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,"(like, 0.2244618445948822)","(pump, 0.29888594275114344)","(weather, 0.29851154591099843)","(conditioners, 0.37641895766773337)","(tax, 0.5051447270982603)","(walmart, 0.568552856344159)","(landlord, 0.5645606467029223)","(ac, 0.48084886786173386)","(cats, 0.5262612845194458)","(wear, 0.5368611158901346)","(grid, 0.5115634807362508)","(turn, 0.4734580735118417)","(cool, 0.4438488367412764)","(love, 0.6924908676072336)","(schools, 0.6830187990182194)"
1,"(people, 0.2233145350480538)","(furnace, 0.27465276709725495)","(climate, 0.2956741557265649)","(people, 0.34605375269931465)","(carbon, 0.47990689422627036)","(prices, 0.5581658607865235)","(landlords, 0.515414232200008)","(windows, 0.40842850191630464)","(cat, 0.5095229074212726)","(wearing, 0.509817442331441)","(power, 0.46576651635071464)","(turned, 0.46857312491593617)","(pillow, 0.42499410213694583)","(30, 0.44360095855670506)","(manitoba, 0.527561435532408)"
2,"(building, 0.21071814131141728)","(pumps, 0.26221647748174487)","(rain, 0.2777201087653187)","(health, 0.33872524126904185)","(moe, 0.4583220830909235)","(food, 0.5019103419465891)","(rent, 0.46055853788035866)","(curtains, 0.38295681453021246)","(dog, 0.4731065866683148)","(hoodie, 0.49774532461132065)","(nuclear, 0.4016580878207932)","(valve, 0.43615342061035794)","(ice, 0.41704122734475224)","(hate, 0.4398318666735408)","(strike, 0.48494629876362166)"
3,"(window, 0.20799086877483378)","(gas, 0.2595168519477259)","(records, 0.27547439105035015)","(government, 0.3060134028323394)","(saskatchewan, 0.4027379807080619)","(grocery, 0.46913972649264263)","(tenants, 0.4446815910228209)","(window, 0.36776041477642135)","(pets, 0.4305704012669154)","(pants, 0.4838575160381516)","(shortages, 0.394120758562022)","(condo, 0.42696688389214965)","(pools, 0.41380873383556377)","(like, 0.4305646781459276)","(school, 0.47222976832410474)"
4,"(day, 0.20701489770608641)","(heat, 0.2500821274336333)","(wave, 0.2753833979810328)","(review, 0.3053842128406811)","(sask, 0.35230104403762386)","(costco, 0.4647962588414424)","(building, 0.39597229338682716)","(fans, 0.3451894234463792)","(fan, 0.4076635736777594)","(wool, 0.4643144740814345)","(pickering, 0.39312631276493004)","(baseboard, 0.38686378241094077)","(healthlink, 0.40163543717507194)","(humidity, 0.4232197270916339)","(cent, 0.46659111015585925)"
5,"(just, 0.20661975080952702)","(house, 0.23932221082453764)","(change, 0.2663326382225414)","(air, 0.30236088467521693)","(money, 0.3476431557709406)","(loblaws, 0.4479487937855466)","(tenant, 0.3594135595751847)","(sleep, 0.33936991424075397)","(ice, 0.3921300749384967)","(shoes, 0.44280881045409176)","(cars, 0.38260122627347626)","(apartment, 0.3797529125655526)","(water, 0.39727992139732354)","(outside, 0.4211734543234611)","(grade, 0.44158528444886674)"
6,"(hot, 0.206033377952424)","(oil, 0.23435292329008173)","(summer, 0.26568934351875423)","(seniors, 0.29917873609694456)","(feds, 0.3240418092260733)","(price, 0.42065909389197825)","(rental, 0.3262635172567014)","(portable, 0.3291252014593779)","(fans, 0.3871971721901643)","(merino, 0.4365311692388395)","(vehicles, 0.3591703593706092)","(floor, 0.3770681726614364)","(neck, 0.3777026199992237)","(weather, 0.398522113045095)","(union, 0.44060110503076094)"
7,"(ac, 0.20391643530198136)","(heating, 0.2306620891569059)","(fires, 0.2623838437680304)","(extreme, 0.2963389645670708)","(scott, 0.3169573053557435)","(inflation, 0.41146275929587856)","(buildings, 0.3092388182963341)","(hot, 0.32583916789966105)","(towel, 0.38145005050806746)","(skin, 0.4205202157740035)","(electric, 0.3537130125950813)","(plumber, 0.37214070903123647)","(stay, 0.37038080990482597)","(winter, 0.37662031982931204)","(workers, 0.4337783182485035)"
8,"(water, 0.20171202934819246)","(installed, 0.2295158226231568)","(temperatures, 0.2507726155797798)","(bc, 0.2934036493522193)","(levy, 0.3147129245408617)","(canadian, 0.389951102935269)","(rtb, 0.29956576553311315)","(cool, 0.3204500094899135)","(cubes, 0.3674609272731612)","(jeans, 0.4146045692201776)","(capacity, 0.34405822372465966)","(facing, 0.3702819094939562)","(warning, 0.3612383690783727)","(enjoy, 0.3696946580276295)","(bargaining, 0.42064027399466575)"
9,"(dont, 0.2015462546454274)","(cost, 0.22676569053088252)","(record, 0.24539894985618266)","(conditioning, 0.29035103043763094)","(province, 0.31174207458600767)","(profits, 0.38381667892452337)","(evict, 0.2946305356826764)","(bedroom, 0.31559260218046237)","(cool, 0.3646947813663399)","(hoodies, 0.3983391853702829)","(generating, 0.33495972433927534)","(warm, 0.3603081418994409)","(cooler, 0.3569347369837784)","(enjoying, 0.36844002856488495)","(education, 0.4140610678416651)"


In [23]:
doc_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,sub,avg_score,doc_count
0,Why air conditioners can be a problematic sol...,2,2_conditioners_people_health_government,"[conditioners, people, health, government, rev...",[Highlights: * The province was supposed to co...,conditioners - people - health - government - ...,1.000000,False,canada,29.285714,7
1,Bit of a alarmist clickbait headline. I agree ...,-1,-1_like_people_building_window,"[like, people, building, window, day, just, ho...","[In the meantime, turn on the oven at 450F and...",like - people - building - window - day - just...,0.000000,False,canada,10.000000,2
2,"I just lived thru a week of 45°c. No, Im not t...",2,2_conditioners_people_health_government,"[conditioners, people, health, government, rev...",[Highlights: * The province was supposed to co...,conditioners - people - health - government - ...,0.889142,False,canada,7.222222,9
3,Serious question for anyone who cares to answe...,2,2_conditioners_people_health_government,"[conditioners, people, health, government, rev...",[Highlights: * The province was supposed to co...,conditioners - people - health - government - ...,0.745981,False,canada,1.000000,3
4,Grasshoppers threaten to devour Alberta crops...,-1,-1_like_people_building_window,"[like, people, building, window, day, just, ho...","[In the meantime, turn on the oven at 450F and...",like - people - building - window - day - just...,0.000000,False,canada,26.444444,18
...,...,...,...,...,...,...,...,...,...,...,...
1347,"Choosing to believe the weather network, I’m ...",1,1_weather_climate_rain_records,"[weather, climate, rain, records, wave, change...",[However its been the worst fire season ever f...,weather - climate - rain - records - wave - ch...,1.000000,False,VictoriaBC,4.857143,7
1348,Every Monday at six Classic cars and Hot Rods...,1,1_weather_climate_rain_records,"[weather, climate, rain, records, wave, change...",[However its been the worst fire season ever f...,weather - climate - rain - records - wave - ch...,1.000000,False,VictoriaBC,4.500000,2
1349,Beer for long hot summer days Do any of the l...,-1,-1_like_people_building_window,"[like, people, building, window, day, just, ho...","[In the meantime, turn on the oven at 450F and...",like - people - building - window - day - just...,0.000000,False,VictoriaBC,16.545455,11
1350,Am I the only one who thinks we should have mo...,-1,-1_like_people_building_window,"[like, people, building, window, day, just, ho...","[In the meantime, turn on the oven at 450F and...",like - people - building - window - day - just...,0.000000,False,VictoriaBC,1.333333,18


In [24]:
doc_info['sub'].unique()

array(['canada', 'saskatoon', 'CanadaPolitics', 'onguardforthee',
       'Canada_sub', 'newfoundland', 'alberta', 'NovaScotia', 'ontario',
       'saskatchewan', 'PEI', 'britishcolumbia', 'newbrunswickcanada',
       'Manitoba', 'NWT', 'Yukon', 'Whitehorse', 'StJohnsNL', 'Calgary',
       'halifax', 'Winnipeg', 'Edmonton', 'toronto', 'ottawa', 'montreal',
       'regina', 'fredericton', 'moncton', 'vancouver', 'VictoriaBC'],
      dtype=object)

In [25]:
len(doc_info['sub'].unique())

30