In [36]:
"""
Authored by Darren Colby and used for testing plotting functions
"""

import altair as alt
import pandas as pd
from prophet import Prophet
import gensim.downloader as api
from gensim.matutils import corpus2csc

ModuleNotFoundError: No module named 'media_insights'

In [2]:
# Relative path to the preprocessed comment data read by this notebook
COMMENTS_PATH = "../../data/preprocessed_comments.json"

df = pd.read_json(COMMENTS_PATH)
df.head()

Unnamed: 0,text,date,sentiment
0,love communitybuilde nature dog dog dog bring ...,2023-02-26 22:00:41+00:00,0.6369
1,brilliant play date lovely thing,2023-02-26 21:50:23+00:00,0.875
2,dog right playdate,2023-02-26 16:34:32+00:00,0.0
3,compassion thoughtfulness pet happy world place,2023-02-26 13:30:16+00:00,0.8625
4,glad relieved lol talk dog sister actually mea...,2023-02-26 12:08:31+00:00,0.8807


In [3]:
def get_forecast(df: pd.DataFrame, lead: int):
    """
    Fit a Prophet model to a series and stack its predictions on the input rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime column "date" and a value column "x".
    lead : int
        Number of periods handed to the model's ``make_future_dataframe``.

    Returns
    -------
    pd.DataFrame
        Columns "date", "x" and "forecast". The rows returned by the model come
        first and are labelled "Predicted"; the input rows follow and are
        labelled "Actual". Index labels are not reset, so they can repeat.
    """
    # Rename to the "ds"/"y" columns the model is fitted on and drop the
    # timezone from the timestamps
    history = (
        df[["date", "x"]]
        .rename(columns={"date": "ds", "x": "y"})
        .assign(ds=lambda frame: frame["ds"].dt.tz_localize(None))
    )

    prophet = Prophet()
    prophet.fit(history)
    horizon = prophet.make_future_dataframe(periods=lead)

    # Keep only the timestamp and point estimate, renamed to match the input
    predicted = (
        prophet.predict(horizon)[["ds", "yhat"]]
        .rename(columns={"ds": "date", "yhat": "x"})
        .assign(forecast="Predicted")
    )
    observed = df[["date", "x"]].assign(forecast="Actual")

    return pd.concat([predicted, observed])


def plot_ts(df: pd.DataFrame, title: str, y_col: str, y_title: str, caption: bool,
            lead: int):
    """
    Draw a line chart of a series together with its forecast.

    Parameters
    ----------
    df : pd.DataFrame
        Input passed straight to ``get_forecast`` (needs "date" and "x").
    title : str
        Chart title.
    y_col : str
        Encoding string for the y channel.
    y_title : str
        Label shown on the y axis.
    caption : bool
        When truthy, a line of text explaining the sentiment scale is stacked
        underneath the chart.
    lead : int
        Forecast horizon forwarded to ``get_forecast``.

    Returns
    -------
    The Altair line chart, or a vertical concatenation of the chart and the
    caption text when ``caption`` is truthy.
    """
    plot_data = get_forecast(df, lead)

    # Re-parse the dates as UTC timestamps before handing the frame to Altair
    plot_data["date"] = pd.to_datetime(plot_data.date, utc=True)

    # Request one tick per plotted row.
    # Axis tick handling follows a Stack Overflow suggestion from user jakevdp
    # https://stackoverflow.com/questions/59699412/altair-display-all-axis-ticks-but-only-some-tick-labels
    x_axis = alt.Axis(tickCount=plot_data.shape[0])
    x_encoding = alt.X("yearmonthdatehours(date):T", title="", axis=x_axis)
    y_encoding = alt.Y(y_col, title=y_title)

    # One colour per value of the "forecast" label, with an untitled legend
    series_colour = alt.Color("forecast:N", legend=alt.Legend(title=""))

    line_chart = (
        alt.Chart(plot_data)
        .mark_line()
        .encode(x_encoding, y_encoding, color=series_colour)
        .properties(title=title)
    )

    # Only the sentiment time series asks for a caption
    if not caption:
        return line_chart

    # Chart has no caption property, so the text is drawn as its own tiny chart.
    # Solution from Stack Overflow user jakevdp
    # https://stackoverflow.com/questions/57244390/how-to-add-a-subtitle-to-an-altair-generated-chart
    caption_record = {
        "text": "Sentiment ranges from -1 for most negative to +1 for most positive"
    }
    caption_chart = (
        alt.Chart({"values": [caption_record]})
        .mark_text(align="left")
        .encode(text="text:N")
    )

    return alt.vconcat(line_chart, caption_chart)


In [4]:
def plot_sentiment_ts(df: pd.DataFrame, lead:int):
    """
    Chart the "sentiment" column over time, with a forecast and a caption.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with "date" and "sentiment" columns; it is not modified.
    lead : int
        Forecast horizon forwarded to ``plot_ts``.

    Returns
    -------
    Whatever ``plot_ts`` returns for the captioned sentiment chart.
    """
    # plot_ts reads its values from a column named "x"; assign() builds a new
    # frame, so the caller's data is left alone
    sentiment_series = df.assign(x=df["sentiment"])

    return plot_ts(
        df=sentiment_series,
        title="Changes in Sentiment",
        y_col="x",
        y_title="Sentiment",
        caption=True,
        lead=lead,
    )

# Sentiment over time with a forecast horizon of 3 periods
plot_sentiment_ts(df, lead=3)

17:00:50 - cmdstanpy - INFO - Chain [1] start processing
17:00:50 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
def plot_comment_ts(df: pd.DataFrame, lead: int):
    """
    Chart how many comments were posted at each timestamp, with a forecast.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a "date" column holding one row per comment; it is not
        modified.
    lead : int
        Forecast horizon forwarded to ``plot_ts``.

    Returns
    -------
    Whatever ``plot_ts`` returns for the (uncaptioned) comment-count chart.
    """
    # value_counts() builds a new object, so no defensive copy of df is needed.
    # NOTE(review): counts are taken per distinct "date" value; if the dates
    # are full timestamps most counts will be 1 -- confirm whether bucketing
    # by hour or day was intended before the series is forecast.
    comment_counts = (
        df["date"]
        .value_counts()
        .rename_axis("date")
        .reset_index(name="x")
    )

    return plot_ts(comment_counts, "Comments across time", "x", "Comments", False, lead)

# Comment counts over time with a forecast horizon of 3 periods
plot_comment_ts(df, lead=3)

In [10]:
# Name of the pretrained embedding set requested from the gensim downloader
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"

glove = api.load(GLOVE_MODEL_NAME)

In [42]:
"".join([string for string in df.text])
glove.n_similarity("this", "Darren")

0.79287374