In [105]:
"""
Authored by Darren Colby and used for testing plotting functions
"""

import altair as alt
import pandas as pd
from prophet import Prophet
from itertools import product
import gensim.downloader as api
import json

In [2]:
# Load the preprocessed comments and preview the first rows
df = pd.read_json("../../data/preprocessed_comments.json")
df.head()

Unnamed: 0,text,date,sentiment
0,litter mate sibling close blood relate sibling,2023-03-04 05:57:54+00:00,0.0
1,thank god bless,2023-03-02 11:33:47+00:00,0.7506
2,bless,2023-03-01 20:35:35+00:00,0.4215
3,absolutely awesome sail way,2023-03-01 14:33:12+00:00,0.659
4,extremely awesome people fur family touch warm...,2023-03-01 13:30:20+00:00,0.8883


In [3]:
def get_forecast(df: pd.DataFrame, lead: int):
    """Fit a Prophet model on the ``x`` series and stack its output on the actuals.

    Args:
        df: Frame with a datetime ``date`` column and a numeric ``x`` column.
        lead: Number of periods passed to ``make_future_dataframe``.

    Returns:
        A long-format frame with columns ``date``, ``x`` and ``forecast``.
        Rows produced by the model are tagged "Predicted" and come first;
        the rows copied from ``df`` are tagged "Actual".
    """
    # Rename to the ds / y columns handed to Prophet and strip the timezone
    history = df[["date", "x"]].rename(columns={"date": "ds", "x": "y"})
    history["ds"] = history["ds"].dt.tz_localize(None)

    prophet = Prophet()
    prophet.fit(history)
    horizon = prophet.make_future_dataframe(periods=lead)

    # Keep only the timestamp and point estimate, renamed back to the plot schema
    predicted = prophet.predict(horizon)[["ds", "yhat"]]
    predicted = predicted.rename(columns={"ds": "date", "yhat": "x"})
    predicted["forecast"] = "Predicted"

    # Work on a copy so the caller's frame does not gain a "forecast" column
    actual = df.copy(deep=True)
    actual["forecast"] = "Actual"

    return pd.concat([predicted, actual[["date", "x", "forecast"]]])


def plot_ts(df: pd.DataFrame, title: str, y_col: str, y_title: str, caption: bool,
            lead: int):
    """Draw actual and predicted values of a time series as a coloured line chart.

    Args:
        df: Frame forwarded to ``get_forecast`` (needs ``date`` and ``x`` columns).
        title: Chart title.
        y_col: Name of the column encoded on the y axis.
        y_title: Label shown on the y axis.
        caption: When truthy, a text note explaining the sentiment scale is
            stacked underneath the chart.
        lead: Number of periods to forecast, forwarded to ``get_forecast``.

    Returns:
        The line chart on its own, or a vertical concatenation of the chart
        and the caption when ``caption`` is truthy.
    """
    plot_data = get_forecast(df, lead)

    # get_forecast mixes tz-naive and tz-aware timestamps; parse them all as UTC
    plot_data["date"] = pd.to_datetime(plot_data.date, utc=True)

    # Ask for as many ticks as there are rows. Tick handling adapted from a
    # Stack Overflow answer by user jakevdp:
    # https://stackoverflow.com/questions/59699412/altair-display-all-axis-ticks-but-only-some-tick-labels
    x_encoding = alt.X(
        "yearmonthdatehours(date):T",
        title="",
        axis=alt.Axis(tickCount=plot_data.shape[0]),
    )
    y_encoding = alt.Y(y_col, title=y_title)
    series_color = alt.Color("forecast:N", legend=alt.Legend(title=""))

    line_chart = (
        alt.Chart(plot_data)
        .mark_line()
        .encode(x_encoding, y_encoding, color=series_color)
        .properties(title=title)
    )

    # Only the sentiment time series asks for a caption
    if not caption:
        return line_chart

    # Chart has no caption property, so a text-only chart is stacked below.
    # Approach from a Stack Overflow answer by user jakevdp:
    # https://stackoverflow.com/questions/57244390/how-to-add-a-subtitle-to-an-altair-generated-chart
    note_data = {
        "values": [
            {"text": "Sentiment ranges from -1 for most negative to +1 for most positive"}
        ]
    }
    note = alt.Chart(note_data).mark_text(align="left").encode(text="text:N")

    return alt.vconcat(line_chart, note)


In [4]:
def plot_sentiment_ts(df: pd.DataFrame, lead:int):
    """Plot the ``sentiment`` column over time with a forecast and a caption.

    Args:
        df: Frame with ``date`` and ``sentiment`` columns.
        lead: Number of periods to forecast.

    Returns:
        The chart produced by ``plot_ts`` with the caption enabled.
    """
    # plot_ts reads the series from a column named "x"; copy so df is untouched
    sentiment_df = df.copy(deep=True)
    sentiment_df["x"] = sentiment_df["sentiment"]

    return plot_ts(
        df=sentiment_df,
        title="Changes in Sentiment",
        y_col="x",
        y_title="Sentiment",
        caption=True,
        lead=lead,
    )

plot_sentiment_ts(df, 3)

20:37:56 - cmdstanpy - INFO - Chain [1] start processing
20:37:56 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
def plot_comment_ts(df: pd.DataFrame, lead: int):
    """Plot the number of comments per ``date`` value together with a forecast.

    Args:
        df: Frame with a ``date`` column, one row per comment.
        lead: Number of periods to forecast.

    Returns:
        The chart produced by ``plot_ts`` (no caption).
    """
    # value_counts builds a new object, so no defensive copy of df is needed.
    # NOTE(review): counts are taken per distinct `date` value; if the column
    # holds second-resolution timestamps most counts will be 1 -- consider
    # flooring to the day before counting if a daily series is intended.
    comment_counts = (
        df["date"]
        .value_counts()
        .rename_axis("date")
        .reset_index(name="x")
    )

    return plot_ts(comment_counts, "Comments across time", "x", "Comments", False, lead)

plot_comment_ts(df, 3)

In [6]:
def plot_comment_cumsum_ts(df: pd.DataFrame, lead: int):
    """Plot the running total of comments over time together with a forecast.

    Args:
        df: Frame with a ``date`` column, one row per comment.
        lead: Number of periods to forecast.

    Returns:
        The chart produced by ``plot_ts`` (no caption).
    """
    # Built as one chain so nothing is mutated in place and df is left untouched
    cumulative_counts = (
        df["date"]
        .value_counts()
        .rename_axis("date")
        .reset_index(name="x")
        .assign(date=lambda counts: pd.to_datetime(counts["date"]))
        # The running total is only meaningful in chronological order
        .sort_values("date")
        .assign(x=lambda counts: counts["x"].cumsum())
    )

    # Title distinguishes this chart from the per-timestamp count chart
    return plot_ts(cumulative_counts, "Cumulative comments across time", "x",
                   "Comments", False, lead)

plot_comment_cumsum_ts(df, 3)

20:38:06 - cmdstanpy - INFO - Chain [1] start processing
20:38:06 - cmdstanpy - INFO - Chain [1] done processing


In [104]:
# Pretrained GloVe vectors fetched through gensim's downloader
glove = api.load("glove-wiki-gigaword-50")

# Missing cells become "" so every column can be joined into a single string
similarity_df = pd.read_json("../../data/similarity_data.json").fillna("")

# One space-joined text per column of similarity_df (rows of the transpose)
all_comments = [" ".join(lst) for lst in similarity_df.T.values.tolist()]

# Tokenise each text once up front instead of once per pair inside the loop,
# and keep id / text / tokens together so they cannot drift out of alignment
videos = list(zip(similarity_df.columns,
                  all_comments,
                  [comment.split() for comment in all_comments]))

# Every ordered pair of columns (including each column with itself)
similarities_df = pd.DataFrame(
    [
        (text1, text2, vid1, vid2, glove.n_similarity(tokens1, tokens2))
        for (vid1, text1, tokens1), (vid2, text2, tokens2) in product(videos, repeat=2)
    ],
    columns=["p1", "p2", "vid1", "vid2", "Cosine similarity"],
)

# Heatmap of pairwise similarity, one cell per (vid1, vid2) pair
base = alt.Chart(similarities_df).mark_rect().encode(
    x=alt.X('vid1:O',
            title="Video ID"),
    y=alt.Y('vid2:O',
            title="Video ID"),
    color='Cosine similarity:Q',
).properties(
    height=300,
    width=300
)

base

In [134]:
def plot_similar_videos(df: pd.DataFrame):
    """Bar chart of how similar each competitor video is to the first video.

    The first entry of ``df`` is treated as the "own" video; every other entry
    is a competitor. Similarity is computed with the module-level ``glove``
    vectors via ``n_similarity``.

    Args:
        df: Transcript data. Each column (or each element, if a Series is
            passed) is joined with spaces into one text per video.
            NOTE(review): assumes the entries are iterables of string tokens
            and that the labels are video ids -- confirm against the caller.

    Returns:
        An Altair bar chart of cosine similarity per competitor video,
        sorted by descending similarity.
    """
    # Use the argument rather than a notebook-global, so the function does not
    # depend on hidden kernel state
    videos = pd.DataFrame(df.apply(lambda x: " ".join(x))).rename_axis("vid")
    videos = videos.reset_index()
    videos.columns = ["vid", "text"]

    # n_similarity takes two lists of words; passing the raw string would make
    # it iterate over single characters instead of tokens
    own_tokens = videos.iloc[0, 1].split()

    competitor = videos.drop(index=[0])
    competitor["Cosine similarity"] = competitor["text"].map(
        lambda text: glove.n_similarity(text.split(), own_tokens)
    )

    chart = alt.Chart(competitor).mark_bar().encode(
        x="Cosine similarity",
        y=alt.Y("vid",
                sort="-x",
                title="Video ID")
    ).properties(
        title="Competitor video similarities"
    )

    return chart