In [15]:
import pandas as pd
# TF-IDF turns each text into a numeric vector of floating-point term weights
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Read the subreddit metadata CSV into a DataFrame, then preview its first five rows.
subreddit_df = pd.read_csv(filepath_or_buffer="subreddits.csv")
subreddit_df.head(n=5)

Unnamed: 0,name,type,title,description,subscribers,nsfw,quarantined,color,img_banner,img_icon,created_at,updated_at
0,000000000000o0000000,banned,000000000000O0000000,000000000000O0000000,,,,,,,2014-12-22 18:00:49+00,2022-09-18 21:24:12.370531+00
1,0015,public,0015,,1.0,f,f,,,,2022-06-01 12:05:00+00,2022-09-18 21:24:18.547475+00
2,007readalong,banned,007 Book Club reading one novel a month.,An online book club for our favorite double 0 ...,,,,,,,2016-12-18 03:51:39+00,2022-09-18 21:24:23.657976+00
3,00kat00,banned,Welcome to my own subreddit.,Hi everyone I'm 00kat00. I'm 19 I'm 5'6 112lbs...,,,,,,,2019-03-17 20:00:31+00,2022-09-18 21:24:29.852949+00
4,00scartoons,public,Your favorite cartoons from 2000 to 2009,A subreddit for cartoons that came out between...,513.0,f,f,,,,2012-10-30 03:32:29+00,2022-09-16 13:04:49.339553+00


In [31]:
# Vectorize the subreddit *titles* with TF-IDF (one sparse row per subreddit).
# Missing titles are replaced with empty strings first: casting NaN straight
# to text would produce the literal token "nan", which would make every
# untitled subreddit look similar to every other untitled subreddit.
tfidf = TfidfVectorizer()
title_matrix = tfidf.fit_transform(subreddit_df["title"].fillna("").astype(str))

# target subreddits to find similarities to
target_subreddit = ["cats", "dogs", "birds"]

# subreddits whose name contains any of these keywords are left out of the results
exclude_keywords = ["nsfw", "pussy", "gay"]

# Build the exclusion mask once; it does not depend on the target.
# Keywords are matched as literal, case-insensitive substrings (regex=False),
# rows with a missing name are treated as "no match" (na=False), and an empty
# keyword list excludes nothing (a joined regex of "" would match every name).
lower_names = subreddit_df["name"].str.lower()
excluded_mask = pd.Series(False, index=subreddit_df.index)
for keyword in exclude_keywords:
    excluded_mask = excluded_mask | lower_names.str.contains(
        keyword.lower(), regex=False, na=False
    )

# loop through the target list and print the top 10 most similar subreddits for each
for target in target_subreddit:
    query = subreddit_df[subreddit_df["name"] == target]

    # the target subreddit is not in the data: report it and move on
    if query.empty:
        print(f"\nr/{target} is not found.")
        continue

    # Assumes subreddit_df still carries the default 0..n-1 index from
    # read_csv, so the index label is also the row position in title_matrix.
    query_index = query.index[0]

    # cosine similarity between the target's title vector and every title vector
    cos_similarity = cosine_similarity(title_matrix[query_index], title_matrix).flatten()

    # NOTE: this overwrites the "similarity" column on subreddit_df for every
    # target, so after the loop it holds the scores of the last target found.
    subreddit_df["similarity"] = cos_similarity

    # drop subreddits with excluded keywords, then the target itself
    filtered_df = subreddit_df[~excluded_mask]
    top_10 = (
        filtered_df[filtered_df["name"] != target]
        .sort_values(by="similarity", ascending=False)
        .head(10)
    )

    print(f"\nTop 10 Subreddits Similar to r/{target} Using Cosine Similarity: ")
    # No nested double quotes inside the f-string, so this also parses on
    # Python versions older than 3.12.
    for name, score in zip(top_10["name"], top_10["similarity"]):
        print(f"r/{name} - {score:.5}")


Top 10 Subreddits Similar to r/cats Using Cosine Similarity: 
r/catsoncats - 0.93673
r/catsbeingcats - 0.8818
r/catshuggingcats - 0.77662
r/redditforcats - 0.73699
r/thingsforcats - 0.71229
r/blackcats - 0.70566
r/contentcats - 0.69171
r/catmemes - 0.68942
r/nameneko - 0.68723
r/allthingscats - 0.67637

Top 10 Subreddits Similar to r/dogs Using Cosine Similarity: 
r/checklad - 0.79985
r/dogsncharts - 0.79985
r/wetbear - 0.79985
r/dogtalk - 0.79985
r/birddogs - 0.50142
r/dogsondogs - 0.49375
r/bigdogs_littlecats - 0.45561
r/dogpictures - 0.44635
r/dogpics - 0.44635
r/picturesofservicedogs - 0.44635

Top 10 Subreddits Similar to r/birds Using Cosine Similarity: 
r/birdart - 0.5063
r/ornithologyuk - 0.50316
r/allthebirds - 0.47219
r/writings - 0.43784
r/pokimanepics - 0.43744
r/flowersandbirds - 0.42797
r/sciart - 0.3958
r/birdswithjobs - 0.39568
r/wehrmacht - 0.39441
r/nosh - 0.38599
