# Community Keyword Extractor

In [1]:
!pip install keybert



In [2]:
!python3 -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/sallycaoyu/Downloads/anaconda3/lib/python3.7/site-packages/en_core_web_sm
-->
/Users/sallycaoyu/Downloads/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [8]:
import pickle
import re
import numpy as np
import spacy
import keybert
from collections import defaultdict, Counter
from keybert import KeyBERT

In [4]:
with open('community_nodes.pkl', 'rb') as file:
    data = pickle.load(file)

In [5]:
nlp = spacy.load('en')
raw_data = defaultdict(str)
for node in data:
    raw_data[node['name']] = nlp(node['description']) # tokenize into words

In [42]:
def preprocess_doc(doc):
    '''remove stopwords, punctuations, whitespaces, all non-alphanumeric chars'''
    proc_doc = []
    for token in doc:
        if not token.is_stop and not token.is_punct and token.pos_ != 'SPACE':
            processed_token = token.lower_
            
            # remove reddit prefix of subreddit
            processed_token = processed_token.replace('/r/', ' ').replace('/r',' ').replace('r/', ' ')
            
            # remove reddit prefix of username
            processed_token = processed_token.replace('/u/', ' ').replace('/u',' ').replace('u/', ' ')
            
            # remove urls
            processed_token = re.sub(r'http\S+', ' ', processed_token)
            
            # remove all non-alphanumeric chars
            processed_token = re.sub(r'[^0-9a-zA-Z]+', ' ', processed_token)
            
            if processed_token != '' and processed_token != 'r':
                proc_doc.append(processed_token)
                
    return ' '.join(proc_doc)

In [43]:
preprocessed_data = defaultdict(str)
for name, description in raw_data.items():
    preprocessed_data[name] = preprocess_doc(description)

In [44]:
preprocessed_data

defaultdict(str,
            {'Bitcoin': 'bitcoin currency internet distributed worldwide decentralized digital money unlike traditional currencies dollars bitcoins issued managed central authority whatsoever government company bank charge bitcoin resistant wild inflation corrupt banks bitcoin bank  new bitcoin check use coins  en  bitcoin org  explore bitcoin wiki  wiki main page invest recklessly  bitcoin comments 7gi55s dont invest recklessly  getting started  en getting started faq wiki  wiki faq resources  bitcoin information html common myths  wiki myths buy bitcoins worldwide  bitcoin medium exchange  wiki bitcoin as a medium of exchange earn money mining bitcoin  bitcoin comments 18r5qc will i earn money by mining an answer to all  bitcoin investment  wiki bitcoin as an investment storing bitcoins  wiki storing bitcoins kept gardens die pacifism  lw c1 wellkept gardens die by pacifism  cypherpunk manifesto  cypherpunk manifesto html community guidelines use url shortening servi

In [55]:
kw_model = KeyBERT()
for community, desc in preprocessed_data.items():
    keywords = kw_model.extract_keywords(desc, keyphrase_ngram_range=(1, 1), stop_words='english',
                                         use_maxsum=True, nr_candidates=10, top_n=5)
    print('keywords for r/{}:'.format(community), keywords)

keywords for r/music: [('edditmusic', 0.3351), ('musician', 0.339), ('classicalmusic', 0.3449), ('songs', 0.4023), ('musicsubreddits', 0.4375)]
keywords for r/drama: []
keywords for r/relationship_advice: [('requests', 0.2508), ('moral', 0.2637), ('askwomenover30', 0.2784), ('asshole', 0.2785), ('counseling', 0.3176)]
keywords for r/tifu: [('sfwfhu', 0.3055), ('pranks', 0.3106), ('ban', 0.3383), ('tifupdate', 0.3469), ('nsfw', 0.37)]
keywords for r/Bitcoin: [('btc', 0.4101), ('bitcoindiscussion', 0.4358), ('bitcoinmining', 0.4473), ('bitcoinbeginners', 0.4741), ('cryptocurrencies', 0.4895)]
keywords for r/musicals: [('performing', 0.2968), ('actresses', 0.2973), ('germanmusicals', 0.3727), ('soundtracks', 0.4132), ('theatre', 0.4805)]
keywords for r/mbti: [('introvert', 0.2499), ('mbti', 0.2516), ('jtypes2', 0.2526), ('mypersonality', 0.2597), ('humanmetrics', 0.3014)]
keywords for r/wallstreetbets: [('daily', 0.2137), ('aiwygk', 0.2233), ('contentguide', 0.2271), ('wsb', 0.307), ('wal

In [56]:
for community, desc in preprocessed_data.items():
    keywords = kw_model.extract_keywords(desc, keyphrase_ngram_range=(1, 1), stop_words='english',top_n=5)
    print('keywords for r/{}:'.format(community), keywords)

keywords for r/music: [('musicsubreddits', 0.4375), ('music', 0.4087), ('songs', 0.4023), ('redditmusic', 0.3846), ('musicians', 0.3561)]
keywords for r/drama: []
keywords for r/relationship_advice: [('counseling', 0.3176), ('abusive', 0.2931), ('asshole', 0.2785), ('askwomenover30', 0.2784), ('advice', 0.2707)]
keywords for r/tifu: [('vulgar', 0.4473), ('tifucirclejerk', 0.3765), ('nsfw', 0.37), ('porn', 0.3653), ('tifu', 0.3645)]
keywords for r/Bitcoin: [('bitcoin', 0.5899), ('bitcoins', 0.5593), ('cryptocurrencies', 0.4895), ('cryptocurrency', 0.4871), ('bitcoinmarkets', 0.4864)]
keywords for r/musicals: [('musicals', 0.6471), ('musical', 0.5475), ('theatre', 0.4805), ('theater', 0.4797), ('soundtracks', 0.4132)]
keywords for r/mbti: [('types', 0.4328), ('mbtitypeme', 0.4004), ('personality', 0.3792), ('type', 0.3665), ('humanmetrics', 0.3014)]
keywords for r/wallstreetbets: [('twitter', 0.5265), ('wallstreetbets', 0.4572), ('discord', 0.3124), ('wsb', 0.307), ('fmsnby', 0.2492)]
ke