### Sentiments and subreddit evolution

In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from textblob import TextBlob
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.linalg import Vectors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import time
from IPython.display import display, clear_output
from collections import Counter

# Live dashboard: repeatedly re-read the two JSON snapshots and redraw a
# 2 x N_CLUSTERS grid (sentiment ratios on top, subreddit counts below).
# Runs until the cell is interrupted (Kernel -> Interrupt).
N_CLUSTERS = 4
SENTIMENT_LABELS = ['ti_pol', 'ti_sub', 'te_pol', 'te_sub']
REFRESH_SECONDS = 1  # pause between redraws and between read retries

while True:
    # Load both snapshots before touching the display. The files are
    # presumably rewritten by another process; a read that lands mid-write
    # (empty/partial file) raises json.JSONDecodeError, a ValueError subclass.
    # Wait and retry instead of killing the loop.
    try:
        with open('sentiment.json', 'r') as f:
            sentiment_data = json.load(f)
        with open('subreddit.json', 'r') as f:
            location_data = json.load(f)
    except (OSError, ValueError):
        time.sleep(REFRESH_SECONDS)
        continue

    # Normalise each cluster's first four values by its fifth value.
    # A cluster that is missing, has fewer than five values, or has a zero
    # denominator is plotted as all zeros rather than raising.
    for i in range(N_CLUSTERS):
        raw_values = sentiment_data.get(str(i), [])
        denominator = raw_values[4] if len(raw_values) > 4 else 0
        if denominator != 0:
            sentiment_data[str(i)] = [v / denominator for v in raw_values[:4]]
        else:
            sentiment_data[str(i)] = [0, 0, 0, 0]

    # Most frequent subreddit name per cluster ("" when the cluster is empty).
    # Not used by the plots below; kept so the name stays available to other
    # cells in the notebook.
    location_values = {
        k: (Counter(v).most_common(1)[0][0] if len(v) > 0 else "")
        for k, v in location_data.items()
    }

    clear_output(wait=True)
    # squeeze=False keeps axs two-dimensional whatever N_CLUSTERS is.
    fig, axs = plt.subplots(nrows=2, ncols=N_CLUSTERS, figsize=(10, 10), squeeze=False)

    # Top row: sentiment ratios per cluster.
    # Labels are applied after the seaborn call so that any axis labels
    # seaborn sets on its own do not replace them.
    for i, ax in enumerate(axs[0]):
        sns.barplot(x=SENTIMENT_LABELS, y=sentiment_data[str(i)], ax=ax)
        ax.set_title(f"Cluster {i}")
        ax.set_xlabel("Sentiments")
        ax.set_ylabel("Percentage")

    # Bottom row: how often each subreddit occurs in the cluster.
    for i, ax in enumerate(axs[1]):
        subreddits = location_data.get(str(i), [])
        if subreddits:  # nothing to count for an empty or missing cluster
            sns.countplot(x=subreddits, ax=ax)
        ax.set_title(f"Cluster {i}")
        ax.set_xlabel("Subreddit")
        ax.set_ylabel("Count")

    # Add a title for the whole plot
    fig.suptitle("Sentiment Analysis and Most Common Subreddit by Cluster")

    plt.show()
    # Release the figure explicitly so repeated redraws cannot pile up open
    # figures on backends that do not close them in show().
    plt.close(fig)

    time.sleep(REFRESH_SECONDS)

KeyboardInterrupt: 

In [None]:
# If a loop suddenly stops working, re-run the cell: viz.json was probably opened at the moment it was being truncated and rewritten.

### Cluster sizes evolution

In [2]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from textblob import TextBlob
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.linalg import Vectors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import time
from IPython.display import display, clear_output
from collections import Counter

# Live bar chart of cluster sizes: re-read viz.json about once per second and
# redraw until the cell is interrupted (Kernel -> Interrupt).

# Empty frame that is never appended to. It is no longer concatenated into the
# plotted data (that added no rows); kept only so the name stays defined for
# any other cell that refers to it.
df = pd.DataFrame(columns=['cluster', 'size'])

REFRESH_SECONDS = 1  # pause between redraws and between read retries

while True:
    # viz.json may be read while it is being rewritten; an empty or partial
    # file raises json.JSONDecodeError (a ValueError subclass). Catch only
    # read/parse errors so KeyboardInterrupt can still stop the cell, and
    # sleep before retrying instead of spinning.
    try:
        with open('viz.json', 'r') as f:
            data = json.load(f)
    except (OSError, ValueError):
        time.sleep(REFRESH_SECONDS)
        continue

    # One row per entry in the file: key -> 'cluster', value -> 'size'.
    cluster_sizes = pd.DataFrame({
        'cluster': list(data.keys()),
        'size': list(data.values()),
    })

    clear_output(wait=True)
    fig, ax = plt.subplots()
    sns.barplot(x='cluster', y='size', data=cluster_sizes, ax=ax)
    plt.show()
    # Release the figure explicitly so repeated redraws cannot pile up open
    # figures on backends that do not close them in show().
    plt.close(fig)

    time.sleep(REFRESH_SECONDS)

KeyboardInterrupt: 

<Figure size 640x480 with 0 Axes>