# Sci-Fi IRL #1: Technology Terminology Velocity

### A Data Storytelling Project by Tobias Reaper

### ---- Datalogue 009 ----

---
---

### Imports and Configuration

In [1]:
# Three Musketeers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# For using the API
import requests

In [28]:
# More advanced visualizations with Bokeh
from bokeh.plotting import figure, output_file, output_notebook, show
from bokeh.layouts import column
from bokeh.models.glyphs import Patches
from bokeh.models.formatters import NumeralTickFormatter

In [3]:
# Import color library
import colorcet as cc

In [4]:
# Take 17 colors from colorcet's `bkr` colormap, sampling every 15th entry
palette = [cc.bkr[step] for step in range(0, 17 * 15, 15)]

In [5]:
# Raise the pandas display limits so wide / long frames are shown in full
pd.options.display.max_columns = 100
pd.options.display.max_rows = 500

---

### Functions

In [6]:
def pushshift_api_request(query, subreddit, frequency="month", aggs="created_utc"):
    """
    Returns the JSON response of a PushShift API aggregate comment search as a Python dictionary.

    Note: if you're reading this note, that means that this function is still only written
    with the intention of automating a specific set of actions for a specific project.

    ---- Arguments ----
    query: (str) keyword to search. May contain spaces or reserved characters;
        it is percent-encoded before being sent.
    subreddit: (str) subreddit name
    frequency: (str) set the size of the time buckets.
    aggs: (str) aggregate function name. Default is "created_utc".
    (For more information, read the PushShift API Documentation.)
    -------------------

    ---- Returns ----
    (dict) the decoded JSON body of the response.
    -------------------

    ---- Raises ----
    requests.HTTPError: if the response status code is anything other than 200.
    -------------------
    """

    # Hand the parameters to requests instead of formatting them into the URL,
    # so that keywords such as "big data" or "R&D" are encoded correctly
    endpoint = "https://api.pushshift.io/reddit/search/comment/"
    params = {
        "q": query,
        "subreddit": subreddit,
        "aggs": aggs,
        "frequency": frequency,
        "size": 100,
    }

    # Send the request and save the response into the response object
    response = requests.get(endpoint, params=params)

    # Check the response; stop execution if failed.
    # An explicit raise is used because `assert` is stripped when Python runs with -O
    if response.status_code != 200:
        raise requests.HTTPError(
            f"PushShift request failed with status {response.status_code}: {response.url}",
            response=response,
        )

    # Parse the JSON into a Python dictionary
    # and return it for further processing
    return response.json()

In [7]:
def create_df(data, keyword, frequency="month"):
    """
    Returns cleaned Pandas DataFrame of keyword frequency over time, given correctly-formatted Python dictionary.
    Renames the count column to keyword; converts the bucket start to datetime.

    Note: if you're reading this note, that means that this function is still only written
    with the intention of automating a specific set of actions for a specific project.

    ---- Arguments ----
    data: (dict) Python dictionary converted from JSON API response.
        The buckets are read from data["aggs"]["created_utc"].
    keyword: (str) the keyword that was queried; becomes the name of the count column.
    frequency: (str) size of time buckets, which is also the name of the resulting
        datetime column. Defaults to "month".
    -------------------

    ---- Returns ----
    (pd.DataFrame) one row per time bucket. If the response holds no buckets,
    an empty frame with the same two column names is returned.
    -------------------
    """

    # Convert the python object into a pandas dataframe
    df = pd.DataFrame(data["aggs"]["created_utc"])

    # No buckets came back: there is no "key" column to convert,
    # so return an empty frame that still has the expected columns
    if df.empty:
        return pd.DataFrame({
            frequency: pd.Series(dtype="datetime64[ns]"),
            keyword: pd.Series(dtype="int64"),
        })

    # Convert "key" (epoch seconds) into a datetime column
    df["key"] = pd.to_datetime(df["key"], unit="s", origin="unix")

    # Rename "key" to reflect the fact that it is the beginning of the time bucket
    df = df.rename(columns={"key": frequency, "doc_count": keyword})

    # Return the DataFrame
    return df

In [8]:
def comments_df(data):
    """
    Builds a Pandas DataFrame from the comment records of a PushShift API response.

    Note: if you're reading this note, that means that this function is still only written
    with the intention of automating a specific set of actions for a specific project.

    ---- Arguments ----
    data: (dict) Python dictionary converted from JSON API response;
        the records are read from its "data" key.
    -------------------
    """

    # Pull out the comment records and hand them straight to pandas
    comment_records = data["data"]
    return pd.DataFrame(comment_records)

In [9]:
def df_to_csv(data, filename):
    """
    Basically just a wrapper around the Pandas `.to_csv()` method,
    created to standardize the inputs and outputs.

    ---- Arguments ----
    data: (pd.DataFrame) Pandas DataFrame to be saved as a csv.
    filename: (str) name or path of the file to be saved. Missing parent
        directories are created. File-like objects are passed through untouched.
    -------------------
    """
    import os
    from pathlib import Path

    # Create the target directory if needed so the write does not fail on a fresh checkout
    if isinstance(filename, (str, os.PathLike)):
        Path(filename).parent.mkdir(parents=True, exist_ok=True)

    # Saves the DataFrame to csv, without the index column
    data.to_csv(path_or_buf=filename, index=False)

    # And that's it, folks!

In [14]:
def reddit_data_setter(keywords, subreddits, csv=False, frequency="month", aggs="created_utc"):
    """
    Creates two DataFrames that hold combined data of all combinations of keywords / subreddits.

    Note: if you're reading this note, that means that this function is still only written
    with the intention of automating a specific set of actions for a specific project.

    ---- Arguments ----
    keywords: (list) keyword(s) to search.
    subreddits: (list) name of subreddit(s) to include.
    csv: (bool) if True, save the resulting dataframes as csv files under "data/",
        named after the first keyword.
    frequency: (str) set the size of the time buckets; also the name of the
        datetime column the count frames are joined on.
    aggs: (str) aggregate function name. Default is "created_utc".
        `create_df` reads the "created_utc" buckets, so the count frame
        relies on the default.
    (For more information, read the PushShift API Documentation.)
    -------------------

    ---- Returns ----
    (df_main, df_comm): the per-bucket counts with one column per
    subreddit / keyword pair, and the concatenated comment records.
    -------------------

    ---- Raises ----
    ValueError: if keywords or subreddits is empty.
    -------------------
    """
    import os
    from time import sleep

    # Fail early with a clear message instead of pandas' "No objects to concatenate"
    if len(keywords) == 0 or len(subreddits) == 0:
        raise ValueError("keywords and subreddits must each contain at least one entry")

    comment_df_list = []  # One comments dataframe per request
    word_df_list = []  # One word count dataframe per request

    # Run query for individual keywords on each subreddit
    # Subreddit (outer) -> keyword (inner) = all keywords in one subreddit at a time
    for subreddit in subreddits:
        for word in keywords:
            # Create unique column name for each subreddit / word combo
            col_name = f"{subreddit}_{word.replace(' ', '')}"

            # Indicates current subreddit / keyword
            print(f"{col_name}...")
            sleep(0.5)  # Add sleep time to reduce API load

            # Make request and convert response to dictionary
            dictionary = pushshift_api_request(word, subreddit,
                                               frequency=frequency, aggs=aggs)

            # Append aggs word count df to word_df_list
            word_df_list.append(create_df(dictionary, col_name, frequency=frequency))

            # Append comments df to comment_df_list
            comment_df_list.append(comments_df(dictionary))

            sleep(1.0)  # More sleep to reduce API load

    # Set the time bucket column as index in order to concatenate list of dataframes
    df_main = pd.concat([df.set_index(frequency) for df in word_df_list],
                        axis=1, join="outer").reset_index()

    # Concatenate comment_df_list dataframes
    df_comm = pd.concat(comment_df_list, axis=0, sort=False,
                        join="outer", ignore_index=True)

    # If csv parameter is set to True, save datasets to filesystem as csv
    if csv:
        os.makedirs("data", exist_ok=True)  # The output folder may not exist yet
        df_to_csv(df_main, f"data/{keywords[0]}-monthly.csv")
        df_to_csv(df_comm, f"data/{keywords[0]}-comments.csv")

    # Return df_main, df_comm, respectively
    return df_main, df_comm

---
---

## Term Velocity: Algorithm

The velocity of the term "algorithm" in each of the target subreddits.  
This time also with the total number of comments made in each subreddit over the same period of time.

In [15]:
# Search terms: the keyword of interest, followed by a blank keyword whose
# count is the total number of comments (used for normalization)
words = ["algorithm", ""]

# Subreddits to query
subs = [
    "Futurology", "technology", "science", "askscience", "gadgets",
    "books", "scifi", "movies", "gaming", "television",
    "news", "worldnews", "politics", "philosophy",
    "AskReddit", "todayilearned", "explainlikeimfive",
]

In [16]:
# Query every subreddit / keyword pair and save both resulting frames as csv
df_main, df_comm = reddit_data_setter(keywords=words, subreddits=subs, csv=True)

Futurology_algorithm...
Futurology_...
technology_algorithm...
technology_...
science_algorithm...
science_...
askscience_algorithm...
askscience_...
gadgets_algorithm...
gadgets_...
books_algorithm...
books_...
scifi_algorithm...
scifi_...
movies_algorithm...
movies_...
gaming_algorithm...
gaming_...
television_algorithm...
television_...
news_algorithm...
news_...
worldnews_algorithm...
worldnews_...
politics_algorithm...
politics_...
philosophy_algorithm...
philosophy_...
AskReddit_algorithm...
AskReddit_...
todayilearned_algorithm...
todayilearned_...
explainlikeimfive_algorithm...
explainlikeimfive_...


In [17]:
# Take a look to be sure it worked as expected
print(df_main.shape)
df_main.head()

(156, 35)


Unnamed: 0,month,Futurology_algorithm,Futurology_,technology_algorithm,technology_,science_algorithm,science_,askscience_algorithm,askscience_,gadgets_algorithm,gadgets_,books_algorithm,books_,scifi_algorithm,scifi_,movies_algorithm,movies_,gaming_algorithm,gaming_,television_algorithm,television_,news_algorithm,news_,worldnews_algorithm,worldnews_,politics_algorithm,politics_,philosophy_algorithm,philosophy_,AskReddit_algorithm,AskReddit_,todayilearned_algorithm,todayilearned_,explainlikeimfive_algorithm,explainlikeimfive_
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,1,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,2,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,2,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Data Normalization

In order to normalize the data, I divide the number of comments containing the keywords by the total number of comments made in that subreddit. This way, the number will reflect a percentage, allowing me to visualize all of the subreddits on a single graph. Furthermore, this teases out the actual relationship of the velocity to time instead of it being mixed up with the overall growth in the subreddit.

> Step 1. Loop through each subreddit dividing the keyword column by the blank column.

In [18]:
# Column names are built with spaces stripped from the keyword
# (see `reddit_data_setter`), so strip them here as well
keyword_col = words[0].replace(" ", "")

# Loop through each subreddit dividing the keyword column by the blank column
for sub in subs:
    df_main[f"{sub}_norm"] = df_main[f"{sub}_{keyword_col}"] / df_main[f"{sub}_"]

In [19]:
df_main.head()

Unnamed: 0,month,Futurology_algorithm,Futurology_,technology_algorithm,technology_,science_algorithm,science_,askscience_algorithm,askscience_,gadgets_algorithm,gadgets_,books_algorithm,books_,scifi_algorithm,scifi_,movies_algorithm,movies_,gaming_algorithm,gaming_,television_algorithm,television_,news_algorithm,news_,worldnews_algorithm,worldnews_,politics_algorithm,politics_,philosophy_algorithm,philosophy_,AskReddit_algorithm,AskReddit_,todayilearned_algorithm,todayilearned_,explainlikeimfive_algorithm,explainlikeimfive_,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001779,,,,,,,,,,,,,,
1,2006-11-01,,,,,1,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000556,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
3,2007-01-01,,,,,2,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000991,,,,,,,,,,,,,,
4,2007-02-01,,,,,2,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000768,,,,,,,,,,,,,,


---

## Visualizations

In [21]:
# Color assignments: pair each subreddit with the palette entry at the same position
subs_colors = {f"{name}": f"{palette[pos]}" for pos, name in enumerate(subs)}

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz-norm.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - presumably it skips sparse early months; confirm
x_start = df_main["month"].iloc[14]
x_end = df_main["month"].iloc[-1]

# NOTE(review): `plot_width`/`plot_height` and `legend=` are the older Bokeh spellings
# (newer releases use `width`/`height` and `legend_label=`); check the installed version
p = figure(title=f"Percentage of comments that mention '{words[0]}'",
           plot_width=800, plot_height=800,
           x_axis_type="datetime", x_range=(x_start, x_end))

# The *_norm columns hold fractions; format the ticks as percentages,
# matching the title and the other keyword plots
p.yaxis.formatter = NumeralTickFormatter(format="0.00%")

# One line per subreddit, drawn in its assigned color
for sub in subs_colors:
    p.line(df_main["month"], df_main[f"{sub}_norm"], legend=f"r/{sub}",
           line_width=2, line_color=f"{subs_colors[sub]}")

# Show the results
show(p)

### Separate plots for each subreddit

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_bounds = (df_main["month"].iloc[14], df_main["month"].iloc[-1])

p = {}  # dict to hold plots, keyed by subreddit name

# NOTE(review): `plot_width`/`plot_height` are the older Bokeh spellings of `width`/`height`
for sub, color in subs_colors.items():
    fig = figure(title=f"Percentage of comments that mention '{words[0]}' in r/{sub}",
                 plot_width=1000, plot_height=200,
                 x_axis_type="datetime", x_range=x_bounds)
    fig.yaxis.formatter = NumeralTickFormatter(format="0.00%")
    fig.line(df_main["month"], df_main[f"{sub}_norm"], line_width=2, line_color=f"{color}")
    p[f"{sub}"] = fig

# The figures in insertion order, ready for the column layout
p_names = list(p.values())

# Show the results
show(column(p_names))

---
---

## Term Velocity: AI

The normalized velocity of the term "AI" in each of the target subreddits.

In [33]:
# Search terms: the keyword of interest, followed by a blank keyword whose
# count is the total number of comments
words = ["AI", ""]

# Subreddits to query
subs = [
    "Futurology", "technology", "science", "askscience", "gadgets",
    "books", "scifi", "movies", "gaming", "television",
    "news", "worldnews", "politics", "philosophy",
    "AskReddit", "todayilearned", "explainlikeimfive",
]

In [34]:
# Query every subreddit / keyword pair and save both resulting frames as csv
df_main, df_comm = reddit_data_setter(keywords=words, subreddits=subs, csv=True)

Futurology_AI...
Futurology_...
technology_AI...
technology_...
science_AI...
science_...
askscience_AI...
askscience_...
gadgets_AI...
gadgets_...
books_AI...
books_...
scifi_AI...
scifi_...
movies_AI...
movies_...
gaming_AI...
gaming_...
television_AI...
television_...
news_AI...
news_...
worldnews_AI...
worldnews_...
politics_AI...
politics_...
philosophy_AI...
philosophy_...
AskReddit_AI...
AskReddit_...
todayilearned_AI...
todayilearned_...
explainlikeimfive_AI...
explainlikeimfive_...


In [35]:
# Take a look to be sure it worked as expected
print(df_main.shape)
df_main.head()

(156, 35)


Unnamed: 0,month,Futurology_AI,Futurology_,technology_AI,technology_,science_AI,science_,askscience_AI,askscience_,gadgets_AI,gadgets_,books_AI,books_,scifi_AI,scifi_,movies_AI,movies_,gaming_AI,gaming_,television_AI,television_,news_AI,news_,worldnews_AI,worldnews_,politics_AI,politics_,philosophy_AI,philosophy_,AskReddit_AI,AskReddit_,todayilearned_AI,todayilearned_,explainlikeimfive_AI,explainlikeimfive_
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,3,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,5,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Data Normalization

To normalize the data I used the simple method of dividing the number of comments containing the keywords by the total number of comments made in that subreddit.

In [36]:
# Column names are built with spaces stripped from the keyword
# (see `reddit_data_setter`), so strip them here as well
keyword_col = words[0].replace(" ", "")

# Loop through each subreddit dividing the keyword column by the blank column
for sub in subs:
    df_main[f"{sub}_norm"] = df_main[f"{sub}_{keyword_col}"] / df_main[f"{sub}_"]

In [37]:
df_main.head()

Unnamed: 0,month,Futurology_AI,Futurology_,technology_AI,technology_,science_AI,science_,askscience_AI,askscience_,gadgets_AI,gadgets_,books_AI,books_,scifi_AI,scifi_,movies_AI,movies_,gaming_AI,gaming_,television_AI,television_,news_AI,news_,worldnews_AI,worldnews_,politics_AI,politics_,philosophy_AI,philosophy_,AskReddit_AI,AskReddit_,todayilearned_AI,todayilearned_,explainlikeimfive_AI,explainlikeimfive_,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001779,,,,,,,,,,,,,,
1,2006-11-01,,,,,3,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001669,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
3,2007-01-01,,,,,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
4,2007-02-01,,,,,5,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001919,,,,,,,,,,,,,,


---

## Visualizations

In [38]:
# Color assignments: pair each subreddit with the palette entry at the same position
subs_colors = {f"{name}": f"{palette[pos]}" for pos, name in enumerate(subs)}

### Single Plot for Keyword

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz-norm.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_start = df_main["month"].iloc[14]
x_end = df_main["month"].iloc[-1]

# NOTE(review): `plot_width`/`plot_height` and `legend=` are the older Bokeh spellings
# (newer releases use `width`/`height` and `legend_label=`); check the installed version
p = figure(title=f"Percentage of comments that mention '{words[0]}'",
           plot_width=800, plot_height=800,
           x_axis_type="datetime", x_range=(x_start, x_end))

# The *_norm columns hold fractions; format the ticks as percentages
p.yaxis.formatter = NumeralTickFormatter(format="0.00%")

# One line per subreddit, drawn in its assigned color
for sub in subs_colors:
    p.line(df_main["month"], df_main[f"{sub}_norm"], legend=f"r/{sub}",
           line_width=2, line_color=f"{subs_colors[sub]}")

# Show the results
show(p)

### Separate plots for each subreddit

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_bounds = (df_main["month"].iloc[14], df_main["month"].iloc[-1])

p = {}  # dict to hold plots, keyed by subreddit name

# NOTE(review): `plot_width`/`plot_height` are the older Bokeh spellings of `width`/`height`
for sub, color in subs_colors.items():
    fig = figure(title=f"Percentage of comments that mention '{words[0]}' in r/{sub}",
                 plot_width=1000, plot_height=200,
                 x_axis_type="datetime", x_range=x_bounds)
    fig.yaxis.formatter = NumeralTickFormatter(format="0.00%")
    fig.line(df_main["month"], df_main[f"{sub}_norm"], line_width=2, line_color=f"{color}")
    p[f"{sub}"] = fig

# The figures in insertion order, ready for the column layout
p_names = list(p.values())

# Show the results
show(column(p_names))

---

## Standardizing the Data

With the confounding variable of overall growth teased out, I'm now going to try standardizing the data using scikit-learn.

> Method 1: `sklearn.preprocessing.MinMaxScaler`

In [43]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# All of the columns are floats except for the original science columns
# This is actually quite alright, as I will only be using the _norm columns
df_main.dtypes

Clean up dataset to only have the _norm features

In [45]:
# Create list of columns to keep: the date column plus one *_norm column per subreddit
norm_cols = ["month"] + [f"{sub}_norm" for sub in subs]

print(norm_cols)

['month', 'Futurology_norm', 'technology_norm', 'science_norm', 'askscience_norm', 'gadgets_norm', 'books_norm', 'scifi_norm', 'movies_norm', 'gaming_norm', 'television_norm', 'news_norm', 'worldnews_norm', 'politics_norm', 'philosophy_norm', 'AskReddit_norm', 'todayilearned_norm', 'explainlikeimfive_norm']


In [46]:
# Narrow the frame down to the month column and the normalized ratios listed above
df = df_main.loc[:, norm_cols]

print(df.shape)
df.head()

(156, 18)


Unnamed: 0,month,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,0.001779,,,,,,,,,,,,,,
1,2006-11-01,,,0.001669,,,,,,,,,,,,,,
2,2006-12-01,,,0.0,,,,,,,,,,,,,,
3,2007-01-01,,,0.0,,,,,,,,,,,,,,
4,2007-02-01,,,0.001919,,,,,,,,,,,,,,


In [47]:
df.describe()

Unnamed: 0,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
count,94.0,138.0,156.0,114.0,141.0,126.0,139.0,129.0,144.0,107.0,134.0,138.0,145.0,140.0,138.0,121.0,99.0
mean,0.021562,0.001985,0.001176,0.000932,0.000793,0.000439,0.004638,0.000476,0.001882,0.000241,0.000174,0.00039,0.000111,0.005337,0.000251,0.000285,0.000759
std,0.010795,0.002676,0.001885,0.001642,0.001067,0.00038,0.003984,0.000368,0.000928,0.000196,0.00016,0.000488,7e-05,0.004914,0.000122,0.000218,0.000522
min,0.006932,0.000166,0.0,0.0,0.0,0.0,0.0,0.0,0.000562,0.0,0.0,0.0,0.0,0.000329,7.6e-05,0.0,6.6e-05
25%,0.015415,0.00056,0.000487,0.000273,5.7e-05,0.000191,0.0022,0.00028,0.001236,9.9e-05,4.5e-05,0.000125,6.2e-05,0.002114,0.000142,0.000139,0.000409
50%,0.020163,0.001135,0.000723,0.00053,0.000383,0.000321,0.003867,0.000414,0.001614,0.000208,0.000129,0.000219,9.6e-05,0.003781,0.00023,0.000243,0.00063
75%,0.024974,0.003045,0.001179,0.000848,0.000958,0.000559,0.005821,0.000538,0.002285,0.000352,0.000286,0.000498,0.000145,0.00718,0.000343,0.000362,0.000941
max,0.09893,0.026884,0.019738,0.012088,0.005083,0.001931,0.033633,0.00287,0.006679,0.000949,0.000885,0.003315,0.000364,0.027176,0.000637,0.00126,0.002975


In [None]:
# Standardize the columns using minmax
# TODO: standardization is deferred for now; `pass` keeps this placeholder
# loop syntactically valid so the notebook still runs top to bottom
for sub in subs:
    pass

## Going to wait on the standardizations

---
---

## Term Velocity: AR

The normalized velocity of the term "AR" in each of the target subreddits.

In [48]:
# Search terms: the keyword of interest, followed by a blank keyword whose
# count is the total number of comments
words = ["AR", ""]

# Subreddits to query
subs = [
    "Futurology", "technology", "science", "askscience", "gadgets",
    "books", "scifi", "movies", "gaming", "television",
    "news", "worldnews", "politics", "philosophy",
    "AskReddit", "todayilearned", "explainlikeimfive",
]

In [49]:
# Query every subreddit / keyword pair and save both resulting frames as csv
df_main, df_comm = reddit_data_setter(keywords=words, subreddits=subs, csv=True)

Futurology_AR...
Futurology_...
technology_AR...
technology_...
science_AR...
science_...
askscience_AR...
askscience_...
gadgets_AR...
gadgets_...
books_AR...
books_...
scifi_AR...
scifi_...
movies_AR...
movies_...
gaming_AR...
gaming_...
television_AR...
television_...
news_AR...
news_...
worldnews_AR...
worldnews_...
politics_AR...
politics_...
philosophy_AR...
philosophy_...
AskReddit_AR...
AskReddit_...
todayilearned_AR...
todayilearned_...
explainlikeimfive_AR...
explainlikeimfive_...


In [50]:
# Take a look to be sure it worked as expected
print(df_main.shape)
df_main.head()

(156, 35)


Unnamed: 0,month,Futurology_AR,Futurology_,technology_AR,technology_,science_AR,science_,askscience_AR,askscience_,gadgets_AR,gadgets_,books_AR,books_,scifi_AR,scifi_,movies_AR,movies_,gaming_AR,gaming_,television_AR,television_,news_AR,news_,worldnews_AR,worldnews_,politics_AR,politics_,philosophy_AR,philosophy_,AskReddit_AR,AskReddit_,todayilearned_AR,todayilearned_,explainlikeimfive_AR,explainlikeimfive_
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,0,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,0,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Data Normalization

To normalize the data I used the simple method of dividing the number of comments containing the keywords by the total number of comments made in that subreddit.

In [51]:
# Column names are built with spaces stripped from the keyword
# (see `reddit_data_setter`), so strip them here as well
keyword_col = words[0].replace(" ", "")

# Loop through each subreddit dividing the keyword column by the blank column
for sub in subs:
    df_main[f"{sub}_norm"] = df_main[f"{sub}_{keyword_col}"] / df_main[f"{sub}_"]

In [52]:
df_main.head()

Unnamed: 0,month,Futurology_AR,Futurology_,technology_AR,technology_,science_AR,science_,askscience_AR,askscience_,gadgets_AR,gadgets_,books_AR,books_,scifi_AR,scifi_,movies_AR,movies_,gaming_AR,gaming_,television_AR,television_,news_AR,news_,worldnews_AR,worldnews_,politics_AR,politics_,philosophy_AR,philosophy_,AskReddit_AR,AskReddit_,todayilearned_AR,todayilearned_,explainlikeimfive_AR,explainlikeimfive_,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,,,1,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001779,,,,,,,,,,,,,,
1,2006-11-01,,,,,0,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
2,2006-12-01,,,,,0,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
3,2007-01-01,,,,,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,
4,2007-02-01,,,,,0,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,


---

## Visualizations

In [38]:
# Color assignments: pair each subreddit with the palette entry at the same position
subs_colors = {f"{name}": f"{palette[pos]}" for pos, name in enumerate(subs)}

### Single Plot for Keyword

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz-norm.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_start = df_main["month"].iloc[14]
x_end = df_main["month"].iloc[-1]

# NOTE(review): `plot_width`/`plot_height` and `legend=` are the older Bokeh spellings
# (newer releases use `width`/`height` and `legend_label=`); check the installed version
p = figure(title=f"Percentage of comments that mention '{words[0]}'",
           plot_width=800, plot_height=800,
           x_axis_type="datetime", x_range=(x_start, x_end))

# The *_norm columns hold fractions; format the ticks as percentages
p.yaxis.formatter = NumeralTickFormatter(format="0.00%")

# One line per subreddit, drawn in its assigned color
for sub in subs_colors:
    p.line(df_main["month"], df_main[f"{sub}_norm"], legend=f"r/{sub}",
           line_width=2, line_color=f"{subs_colors[sub]}")

# Show the results
show(p)

### Separate plots for each subreddit

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_bounds = (df_main["month"].iloc[14], df_main["month"].iloc[-1])

p = {}  # dict to hold plots, keyed by subreddit name

# NOTE(review): `plot_width`/`plot_height` are the older Bokeh spellings of `width`/`height`
for sub, color in subs_colors.items():
    fig = figure(title=f"Percentage of comments that mention '{words[0]}' in r/{sub}",
                 plot_width=1000, plot_height=200,
                 x_axis_type="datetime", x_range=x_bounds)
    fig.yaxis.formatter = NumeralTickFormatter(format="0.00%")
    fig.line(df_main["month"], df_main[f"{sub}_norm"], line_width=2, line_color=f"{color}")
    p[f"{sub}"] = fig

# The figures in insertion order, ready for the column layout
p_names = list(p.values())

# Show the results
show(column(p_names))

---
---

## Term Velocity: Big Data

The normalized velocity of the term "big data" in each of the target subreddits.

In [55]:
# Search terms: the keyword of interest, followed by a blank keyword whose
# count is the total number of comments
words = ["big data", ""]

# Subreddits to query
subs = [
    "Futurology", "technology", "science", "askscience", "gadgets",
    "books", "scifi", "movies", "gaming", "television",
    "news", "worldnews", "politics", "philosophy",
    "AskReddit", "todayilearned", "explainlikeimfive",
]

In [56]:
# Query every subreddit / keyword pair and save both resulting frames as csv
df_main, df_comm = reddit_data_setter(keywords=words, subreddits=subs, csv=True)

Futurology_bigdata...
Futurology_...
technology_bigdata...
technology_...
science_bigdata...
science_...
askscience_bigdata...
askscience_...
gadgets_bigdata...
gadgets_...
books_bigdata...
books_...
scifi_bigdata...
scifi_...
movies_bigdata...
movies_...
gaming_bigdata...
gaming_...
television_bigdata...
television_...
news_bigdata...
news_...
worldnews_bigdata...
worldnews_...
politics_bigdata...
politics_...
philosophy_bigdata...
philosophy_...
AskReddit_bigdata...
AskReddit_...
todayilearned_bigdata...
todayilearned_...
explainlikeimfive_bigdata...
explainlikeimfive_...


In [57]:
# Take a look to be sure it worked as expected
print(df_main.shape)
df_main.head()

(156, 35)


Unnamed: 0,month,Futurology_bigdata,Futurology_,technology_bigdata,technology_,science_bigdata,science_,askscience_bigdata,askscience_,gadgets_bigdata,gadgets_,books_bigdata,books_,scifi_bigdata,scifi_,movies_bigdata,movies_,gaming_bigdata,gaming_,television_bigdata,television_,news_bigdata,news_,worldnews_bigdata,worldnews_,politics_bigdata,politics_,philosophy_bigdata,philosophy_,AskReddit_bigdata,AskReddit_,todayilearned_bigdata,todayilearned_,explainlikeimfive_bigdata,explainlikeimfive_
0,2006-10-01,,,,,,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,2.0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,1.0,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Data Normalization

To normalize the data I used the simple method of dividing the number of comments containing the keywords by the total number of comments made in that subreddit.

In [59]:
# Normalize per subreddit: keyword mentions divided by the total comment count
# (the keyword's spaces are stripped to match the column names)
for sub in subs:
    mentions = df_main[f"{sub}_{words[0].replace(' ', '')}"]
    totals = df_main[f"{sub}_"]
    df_main[f"{sub}_norm"] = mentions / totals

In [60]:
df_main.head()

Unnamed: 0,month,Futurology_bigdata,Futurology_,technology_bigdata,technology_,science_bigdata,science_,askscience_bigdata,askscience_,gadgets_bigdata,gadgets_,books_bigdata,books_,scifi_bigdata,scifi_,movies_bigdata,movies_,gaming_bigdata,gaming_,television_bigdata,television_,news_bigdata,news_,worldnews_bigdata,worldnews_,politics_bigdata,politics_,philosophy_bigdata,philosophy_,AskReddit_bigdata,AskReddit_,todayilearned_bigdata,todayilearned_,explainlikeimfive_bigdata,explainlikeimfive_,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,,,,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,2.0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000991,,,,,,,,,,,,,,
4,2007-02-01,,,,,1.0,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000384,,,,,,,,,,,,,,


---

## Visualizations

In [38]:
# Color assignments: pair each subreddit with the palette entry at the same position
subs_colors = {f"{name}": f"{palette[pos]}" for pos, name in enumerate(subs)}

### Single Plot for Keyword

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz-norm.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_start = df_main["month"].iloc[14]
x_end = df_main["month"].iloc[-1]

# NOTE(review): `plot_width`/`plot_height` and `legend=` are the older Bokeh spellings
# (newer releases use `width`/`height` and `legend_label=`); check the installed version
p = figure(title=f"Percentage of comments that mention '{words[0]}'",
           plot_width=800, plot_height=800,
           x_axis_type="datetime", x_range=(x_start, x_end))

# The *_norm columns hold fractions; format the ticks as percentages
p.yaxis.formatter = NumeralTickFormatter(format="0.00%")

# One line per subreddit, drawn in its assigned color
for sub in subs_colors:
    p.line(df_main["month"], df_main[f"{sub}_norm"], legend=f"r/{sub}",
           line_width=2, line_color=f"{subs_colors[sub]}")

# Show the results
show(p)

### Separate plots for each subreddit

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz.html")

# Read the x-axis bounds from the "month" column by label; the chained
# positional lookup `df_main.iloc[14][0]` is deprecated in recent pandas.
# NOTE(review): row 14 as the left bound is hard-coded - confirm the intent
x_bounds = (df_main["month"].iloc[14], df_main["month"].iloc[-1])

p = {}  # dict to hold plots, keyed by subreddit name

# NOTE(review): `plot_width`/`plot_height` are the older Bokeh spellings of `width`/`height`
for sub, color in subs_colors.items():
    fig = figure(title=f"Percentage of comments that mention '{words[0]}' in r/{sub}",
                 plot_width=1000, plot_height=200,
                 x_axis_type="datetime", x_range=x_bounds)
    fig.yaxis.formatter = NumeralTickFormatter(format="0.00%")
    fig.line(df_main["month"], df_main[f"{sub}_norm"], line_width=2, line_color=f"{color}")
    p[f"{sub}"] = fig

# The figures in insertion order, ready for the column layout
p_names = list(p.values())

# Show the results
show(column(p_names))

---
---

## Term Velocity: Automation

The normalized velocity of the term "automation" in each of the target subreddits.

In [63]:
# Search terms: the keyword of interest, followed by a blank keyword whose
# count is the total number of comments
words = ["automation", ""]

# Subreddits to query
subs = [
    "Futurology", "technology", "science", "askscience", "gadgets",
    "books", "scifi", "movies", "gaming", "television",
    "news", "worldnews", "politics", "philosophy",
    "AskReddit", "todayilearned", "explainlikeimfive",
]

In [64]:
# Query every subreddit / keyword pair and save both resulting frames as csv
df_main, df_comm = reddit_data_setter(keywords=words, subreddits=subs, csv=True)

Futurology_automation...
Futurology_...
technology_automation...
technology_...
science_automation...
science_...
askscience_automation...
askscience_...
gadgets_automation...
gadgets_...
books_automation...
books_...
scifi_automation...
scifi_...
movies_automation...
movies_...
gaming_automation...
gaming_...
television_automation...
television_...
news_automation...
news_...
worldnews_automation...
worldnews_...
politics_automation...
politics_...
philosophy_automation...
philosophy_...
AskReddit_automation...
AskReddit_...
todayilearned_automation...
todayilearned_...
explainlikeimfive_automation...
explainlikeimfive_...


In [65]:
# Take a look to be sure it worked as expected:
# print the (rows, columns) shape, then display the first rows.
print(df_main.shape)
df_main.head()

(156, 35)


Unnamed: 0,month,Futurology_automation,Futurology_,technology_automation,technology_,science_automation,science_,askscience_automation,askscience_,gadgets_automation,gadgets_,books_automation,books_,scifi_automation,scifi_,movies_automation,movies_,gaming_automation,gaming_,television_automation,television_,news_automation,news_,worldnews_automation,worldnews_,politics_automation,politics_,philosophy_automation,philosophy_,AskReddit_automation,AskReddit_,todayilearned_automation,todayilearned_,explainlikeimfive_automation,explainlikeimfive_
0,2006-10-01,,,,,,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Data Normalization

To normalize the data, I divided the number of comments containing the keyword by the total number of comments made in the same subreddit during the same month.

In [66]:
# For every subreddit, compute the share of comments that mention the keyword
for sub in subs:
    keyword_col = f"{sub}_{words[0].replace(' ', '')}"  # column names drop spaces from the keyword
    total_col = f"{sub}_"  # blank-keyword column, i.e. total comments
    df_main[f"{sub}_norm"] = df_main[keyword_col] / df_main[total_col]

In [67]:
# Check that a "<subreddit>_norm" column was appended for each subreddit
df_main.head()

Unnamed: 0,month,Futurology_automation,Futurology_,technology_automation,technology_,science_automation,science_,askscience_automation,askscience_,gadgets_automation,gadgets_,books_automation,books_,scifi_automation,scifi_,movies_automation,movies_,gaming_automation,gaming_,television_automation,television_,news_automation,news_,worldnews_automation,worldnews_,politics_automation,politics_,philosophy_automation,philosophy_,AskReddit_automation,AskReddit_,todayilearned_automation,todayilearned_,explainlikeimfive_automation,explainlikeimfive_,Futurology_norm,technology_norm,science_norm,askscience_norm,gadgets_norm,books_norm,scifi_norm,movies_norm,gaming_norm,television_norm,news_norm,worldnews_norm,politics_norm,philosophy_norm,AskReddit_norm,todayilearned_norm,explainlikeimfive_norm
0,2006-10-01,,,,,,562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2006-11-01,,,,,,1798,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2006-12-01,,,,,,1848,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-01-01,,,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-02-01,,,,,,2605,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


---

## Visualizations

In [38]:
# Color assignments: each subreddit gets the palette entry at the same position
subs_colors = {f"{name}": f"{palette[idx]}" for idx, name in enumerate(subs)}

### Single Plot for Keyword

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz-norm.html")

# First row shown on the x-axis; earlier months are left off the plot
# (presumably because the early data is too sparse — TODO confirm).
FIRST_PLOTTED_ROW = 14

# Take the axis bounds from the "month" column with .iloc so the lookup is
# purely positional. The previous `df_main.iloc[14][0]` indexed a label-indexed
# row with an integer key, a positional fallback that newer pandas deprecates.
x_start = df_main["month"].iloc[FIRST_PLOTTED_ROW]
x_end = df_main["month"].iloc[-1]

p = figure(title=f"Percentage of comments that mention '{words[0]}'",
           plot_width=800, plot_height=800,
           x_axis_type="datetime", x_range=(x_start, x_end))

# Show the normalized values as percentages with two decimals
p.yaxis.formatter = NumeralTickFormatter(format="0.00%")

# One line per subreddit, each drawn in its assigned color.
# NOTE(review): `legend=`, `plot_width` and `plot_height` are deprecated in
# newer Bokeh releases (`legend_label`, `width`, `height`); kept as-is so the
# cell still runs on the Bokeh version this notebook was written against.
for sub in subs_colors:
    p.line(df_main["month"], df_main[f"{sub}_norm"], legend=f"r/{sub}",
           line_width=2, line_color=f"{subs_colors[sub]}")

# Show the results
show(p)

### Separate plots for each subreddit

In [None]:
# Output to current notebook
output_notebook()
# And save to file
output_file(f"viz/{words[0]}-velocity-viz.html")

# First row shown on the x-axis; earlier months are left off the plots
# (presumably because the early data is too sparse — TODO confirm).
FIRST_PLOTTED_ROW = 14

# Take the axis bounds from the "month" column with .iloc so the lookup is
# purely positional. The previous `df_main.iloc[14][0]` indexed a label-indexed
# row with an integer key, a positional fallback that newer pandas deprecates.
x_start = df_main["month"].iloc[FIRST_PLOTTED_ROW]
x_end = df_main["month"].iloc[-1]

p = {}  # dict to hold plots, keyed by subreddit name
p_names = []  # the same plots in display order, handed to column()

# NOTE(review): `plot_width` / `plot_height` are deprecated in newer Bokeh
# releases (`width` / `height`); kept as-is so the cell still runs on the
# Bokeh version this notebook was written against.
for sub in subs_colors:
    p[f"{sub}"] = figure(title=f"Percentage of comments that mention '{words[0]}' in r/{sub}",
                         plot_width=1000, plot_height=200,
                         x_axis_type="datetime", x_range=(x_start, x_end))
    # Show the normalized values as percentages with two decimals
    p[f"{sub}"].yaxis.formatter = NumeralTickFormatter(format="0.00%")
    p[f"{sub}"].line(df_main["month"], df_main[f"{sub}_norm"], line_width=2, line_color=f"{subs_colors[sub]}")
    p_names.append(p[f"{sub}"])

# Show the results, stacked vertically
show(column(p_names))