# Imports

In [1]:
%%capture
!pip install plotly
!pip install watermark

In [2]:
%%capture
!pip3 install --upgrade --no-cache-dir --extra-index-url http://pypi.cu/root/circleup/+simple/ --trusted-host pypi.cu cu-helio-insights==0.0.12

In [3]:
!pip install numpy

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.


In [4]:
from spark_tools import *

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
#import seaborn as sns

import apollo
from apollo import OverrideConfiguration
from apollo import dataset
import apollo_artifacts
from apollo_artifacts import datasets

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
from insights.investor_tools.widgets.style import (
    Font,
    CU_PLOTLY_COLOR_SEQUENCE,
    CU_PLOTLY_COLORSCALE,
)

import datetime
from features.brand_score.data_frames import BrandMetricDataFrames
from ml.models.utils import remove_bad_mapped_accounts
from schema_registry.spark.utils import data_set_to_data_frame

from pyspark.sql.functions import lower, col, year, month, concat, lit, row_number
from pyspark.sql.window import Window
from schema_registry import get_schema
from spark_tools import c, F
from itertools import chain
from pyspark.sql.functions import create_map, lit, to_date

AttributeError: module 'numpy' has no attribute 'version'

In [None]:
import textwrap
def add_line_breaks(string, length):
    """Wrap `string` to lines of at most `length` characters joined by '<br>'.

    Returns None when `string` is falsy (None or empty), so missing captions
    stay missing instead of becoming empty strings.
    """
    if not string:
        return None
    wrapped_lines = textwrap.wrap(string, length)
    return '<br>'.join(wrapped_lines)

In [None]:
# Apply an apollo configuration override with default_to_production=True.
# NOTE(review): presumably this makes dataset reads default to production data — confirm.
OverrideConfiguration(default_to_production=True).apply()

In [None]:
# Let pandas render up to 300 rows when a DataFrame is displayed in a cell.
pd.options.display.max_rows = 300

In [None]:
# Turn off the 'spark.sql.execution.arrow.enabled' setting for this session.
# NOTE(review): `spark` is not created in the visible cells; presumably it is
# provided by `from spark_tools import *` — confirm.
spark.conf.set('spark.sql.execution.arrow.enabled', 'false')

In [None]:
%load_ext watermark
%watermark -v -m --iversions -g

# Set Times

In [None]:
def shift_months(d, months):
    """Return date `d` moved by `months` calendar months (negative = back).

    The day of month is clamped to the last valid day of the target month, so
    e.g. Mar 31 minus one month gives Feb 28/29 instead of raising ValueError.
    """
    total = d.year * 12 + (d.month - 1) + months
    year, month0 = divmod(total, 12)
    # Last day of the target month = first day of the following month - 1 day.
    next_year, next_month0 = divmod(total + 1, 12)
    last_day = (
        datetime.date(next_year, next_month0 + 1, 1) - datetime.timedelta(days=1)
    ).day
    return datetime.date(year, month0 + 1, min(d.day, last_day))


# Reference dates for the analysis, all relative to "today".
# The previous arithmetic (now.month - 1, now.year - 1 with the same day)
# failed in January, on month-end days missing from the target month, and on
# Feb 29.
now = datetime.datetime.now()
today = now.date()
max_dt = shift_months(today, -1)          # one month ago
max_date = max_dt.strftime("%Y-%m")
#backup_max_date = shift_months(today, -2).strftime("%Y-%m")
year_ago_dt = shift_months(today, -12)    # one year ago
year_ago_date = year_ago_dt.strftime("%Y-%m")
year2_ago_date = shift_months(today, -24).strftime("%Y-%m")  # two years ago

# Time Series Brand Agg Data

In [None]:
# OLD: earlier column list. IG_COLS is reassigned in a later cell before it is
# first used, so this definition has no effect on the results below.
IG_COLS = [
    "normalized_url",
    "name",
    "instagram_id",
    "instagram_collected_on",
    "fct_date",
    "total_media_likes",
    "total_media_comments",
    "instagram_followed_by",
    "instagram_media_count",
]

In [None]:
#fb=datasets.facebook_post__3_0.latest_segment_for_each_component_value_df(["source_name"])


In [None]:
# Read the interpolated per-brand Instagram account time series and keep it
# cached, since several later cells reuse it.
ig_sdf = (
    apollo.dataset('instagram__canonical_brand_account__ts_interpolated__1_0')
    # .latest_segment_df() is intentionally left disabled; .read() is used instead
    .read()
    .select('*')
    .persist()
)

#ig_sdf.show()

In [None]:
# Preview the earliest months of the series.
ig_sdf.orderBy('y/m', ascending=True).show()

## Load likes & comment data

In [None]:
# Load the normalized Instagram media records and keep only the most recently
# collected row for each (instagram_id, media_id) pair.
path = 's3a://circleup-helio-normalized/instagram/version=1_0_0/entity=media'
schema_id = 'instagram_media__normalized'
schema_version = '1_0_0'
schema = get_schema(identifier=schema_id, version=schema_version)

# Newest collection first within each media item.
w = Window.partitionBy(c.instagram_id, c.media_id).orderBy(c.collected_on.desc())

df = spark.read.format('parquet').load(path, schema=schema.as_spark())
df = df.withColumn("row_num", F.row_number().over(w)).filter(c.row_num == 1).drop("row_num")
df

In [None]:
# Aggregate social activity per account and collection month.
# fct_date is built as '<year>-<month>' from collected_on; the month is not
# zero-padded (e.g. '2020-5').
df = df.withColumn(
    'fct_date',
    concat(year('collected_on'), lit('-'), month('collected_on')),
)
likes_comm = (
    df.groupBy(['instagram_id', 'fct_date'])
    .sum('comments_count', 'likes_count')
)


# Input Brands

In [None]:
# Brands to analyse, identified by normalized URL.
# Order matters: entries are zipped positionally with BRAND_NAMES, and the
# first entry is used in the exported file names.
NORMALIZED_URLS = [
    'hellobubble.com',
    'neutrogena.com',
    'cerave.com',
    'cetaphil.com',
    'cleanandclear.com',
]

In [None]:
# Display names for the brands, in the same order as NORMALIZED_URLS
# (the two lists are zipped positionally into brand_map_di).
BRAND_NAMES = [
    'Bubble',
    'Neutrogena',
    'Cerave',
    'Cetaphil',
    'Clean & Clear'
]

In [None]:
# Placeholder list of Instagram ids; `ids` is reassigned when the
# URL -> Instagram id mapping is built further down.
ids=[
    ]

In [None]:
# Lookup from normalized URL to display name (positional pairing of the two lists).
brand_map_di = {url: brand for url, brand in zip(NORMALIZED_URLS, BRAND_NAMES)}

Create a mapping from brand URLs to Instagram IDs so that the social activity data can be merged.

In [None]:
# Build a canonical_brand_url -> instagram_id lookup for the input brands and
# use it to repopulate ig_sdf.instagram_id for every row of each brand.
ids = ig_sdf.where((F.col('canonical_brand_url').isin(NORMALIZED_URLS)) & (~F.col('instagram_id').isNull()))
id_map = ids.select('canonical_brand_url', col('instagram_id').alias('id')).distinct()

# Collect the pairs in ONE action. Previously the URL and id columns were
# collected by two separate toPandas() calls; row order of a distinct() result
# is not guaranteed to match across actions, so zip() could pair a URL with
# another brand's id.
id_map_pdf = id_map.toPandas()
urls = list(id_map_pdf['canonical_brand_url'])
ids = list(id_map_pdf['id'])
url_di = dict(zip(urls, ids))

# create_map expects a flat key, value, key, value, ... sequence of literals.
mapping_expr = create_map([lit(x) for x in chain(*url_di.items())])
ig_sdf = ig_sdf.withColumn('instagram_id', mapping_expr[ig_sdf['canonical_brand_url']])

## Connect Follower Data to Social Activity 

In [None]:
# Give both frames a date-typed fct_date column, then attach the monthly
# like/comment totals to the follower series and keep only the input brands.
ig_sdf = (
    ig_sdf
    .withColumnRenamed('y/m', 'fct_date')
    .withColumn('fct_date', to_date(F.col('fct_date')))
)
likes_comm = (
    likes_comm
    .withColumnRenamed('y/m', 'fct_date')
    .withColumn('fct_date', to_date(F.col('fct_date')))
)
brands_ig = (
    ig_sdf
    .join(likes_comm, on=['instagram_id', 'fct_date'], how='left')
    .where(F.col('canonical_brand_url').isin(NORMALIZED_URLS))
)


In [None]:
# Keep one row per brand and month: the one with the latest execution_period.
window = (
    Window
    .partitionBy("canonical_brand_url", 'fct_date')
    .orderBy(col("execution_period").desc())
)
brands_ig = (
    brands_ig
    .withColumn("row", row_number().over(window))
    .where(col("row") == 1)
    .drop("row")
    .orderBy('fct_date')
)

In [None]:
# Inspect the joined data from 2018 onwards, ordered by month.
brands_ig.where(F.col('fct_date') >= '2018-01-01').toPandas().sort_values(by='fct_date')

In [None]:
# Columns carried into the engagement calculation. The two 'sum(...)' entries
# are the column names produced by the groupBy(...).sum(...) aggregation of
# comments_count and likes_count.
IG_COLS = [
    "canonical_brand_url",
    "instagram_id",
    "fct_date",
    'fct_followers_count', 
    'fct_following_count', 
    'est_following_count', 
    'est_posts_count', 
    'est_followers_count', 
    'fct_posts_count',
    'sum(comments_count)',
    'sum(likes_count)'
]

In [None]:
# Restrict to the analysis columns, drop incomplete rows and exact duplicates.
brands_ig = (
    brands_ig
    .select(*IG_COLS)
    .dropna()
    .orderBy('fct_date')
    .dropDuplicates()
)

## Calculate Social Engagement Metrics

In [None]:
# Monthly engagement ratios per brand:
#   *_per_post      = monthly total / estimated post count
#   *_per_post_pax  = the same, additionally divided by estimated followers
ig_brand_sdf = (
    brands_ig
    .withColumn('likes_per_post_pax', F.col('sum(likes_count)') / F.col('est_posts_count') / F.col('est_followers_count'))
    .withColumn('comments_per_post_pax', F.col('sum(comments_count)') / F.col('est_posts_count') / F.col('est_followers_count'))
    # Fixed: likes_per_post was computed from the comment total.
    .withColumn('likes_per_post', F.col('sum(likes_count)') / F.col('est_posts_count'))
    # Added: comments_per_post is required by the first engagement chart
    # (bubble size) but was never created.
    .withColumn('comments_per_post', F.col('sum(comments_count)') / F.col('est_posts_count'))
    #.filter((F.col('canonical_brand_url').isin(NORMALIZED_URLS)) | (F.col('instagram_id').isin(ids)))
)


In [None]:
# Bring the metrics to pandas and add a display name per brand URL
# (URLs absent from brand_map_di get NaN).
chart_data_0 = ig_brand_sdf.toPandas()
if brand_map_di:
    chart_data_0 = chart_data_0.assign(
        name=chart_data_0['canonical_brand_url'].map(brand_map_di)
    )

In [None]:
chart_data_0

In [None]:
# Fix the plotting/legend order: brands in ascending URL order.
chart_data_0 = chart_data_0.sort_values(by='canonical_brand_url', ascending=True)

# Plot

In [None]:
# Bubble chart: likes per post over time, one colour per brand, bubble area
# proportional to comments per post. Requires chart_data_0 to contain the
# 'likes_per_post' and 'comments_per_post' columns.
fig = px.scatter(
    chart_data_0,
    x='fct_date',
    y='likes_per_post',
    size='comments_per_post',
    color='name',
    opacity=.8,
    labels={
        'likes_per_post': 'Likes per Post',
        'comments_per_post': 'Comments per Post',
        'name': 'Size of Bubble =<br>Comments per Post<br><br><b>Brand</b>',
        'fct_date': 'Date',
    },
    title='Social Media Engagement Over Time',
    color_discrete_sequence=CU_PLOTLY_COLOR_SEQUENCE,
)

# Area scaling: 2 * max(size) / (max marker diameter ** 2), with diameter 100.
sizeref = 2 * max(chart_data_0['comments_per_post']) / (100 ** 2)
smaller_font = dict(Font.plot_title.value)
smaller_font['size'] = 14
fig.update_traces(
    marker={'sizemode': 'area', 'sizeref': sizeref},
    textfont=smaller_font,
)

fig.update_layout(
    width=1200,
    height=800,
    font=Font.plot_title.value,
    plot_bgcolor="white",
    title={"x": 0.5},
    colorway=CU_PLOTLY_COLOR_SEQUENCE,
    coloraxis={"autocolorscale": True},
)

# Dropdown toggling the y axis between log and linear; index 1 (linear) is
# shown as the active entry.
updatemenus = [
    {
        'active': 1,
        'buttons': [
            {
                'label': label,
                'method': 'update',
                'args': [
                    {'visible': [True]},
                    {'yaxis': {'type': axis_type, 'title': "Likes per Post"}},
                ],
            }
            for label, axis_type in (('Log Scale', 'log'), ('Linear Scale', 'linear'))
        ],
        'xanchor': "left",
        'yanchor': "top",
        'y': 1.05,
    }
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

In [None]:
chart_data_0

In [None]:
# Export the current figure as HTML, named after the first input brand URL.
# Assumes a local "data/" directory already exists — TODO confirm.
fig.write_html(f"data/{NORMALIZED_URLS[0]}_ig_engagement.html")

In [None]:
# Bubble chart of follower-normalised engagement: likes per post per follower
# over time, bubble area proportional to comments per post per follower.
fig = px.scatter(
    chart_data_0,
    x='fct_date',
    y='likes_per_post_pax',
    size='comments_per_post_pax',
    color='name',
    labels={
        'likes_per_post_pax': 'Likes per Post per Follower',
        'comments_per_post_pax': 'Comments per Post per Follower',
        'normalized_url': 'Brand',
        'name': 'Size of Bubble =<br>Comments<br>per Post<br>per Follower<br><br><b>Brand</b>',
        'fct_date': 'Date',
    },
    title='Normalized Social Media Engagement Over Time',
    color_discrete_sequence=CU_PLOTLY_COLOR_SEQUENCE,
)

# Area scaling: 2 * max(size) / (max marker diameter ** 2), with diameter 100.
sizeref = 2 * max(chart_data_0['comments_per_post_pax']) / (100 ** 2)
smaller_font = dict(Font.plot_title.value)
smaller_font['size'] = 14
fig.update_traces(
    marker={'sizemode': 'area', 'sizeref': sizeref},
    textfont=smaller_font,
)

fig.update_layout(
    width=1200,
    height=800,
    font=Font.plot_title.value,
    plot_bgcolor="white",
    title={"x": 0.5},
    colorway=CU_PLOTLY_COLOR_SEQUENCE,
    coloraxis={"autocolorscale": True},
)

# Dropdown toggling the y axis between log and linear (linear is active).
updatemenus = [
    {
        'active': 1,
        'buttons': [
            {
                'label': label,
                'method': 'update',
                'args': [
                    {'visible': [True]},
                    {'yaxis': {'type': axis_type, 'title': "Likes per Post per Follower"}},
                ],
            }
            for label, axis_type in (('Log Scale', 'log'), ('Linear Scale', 'linear'))
        ],
        'xanchor': "left",
        'yanchor': "top",
        'y': 1.05,
    }
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

In [None]:
# Export the normalised-engagement figure as HTML, named after the first input
# brand URL like the other exports. (Previously the whole NORMALIZED_URLS list
# was interpolated, producing a file name containing the list's repr.)
# Assumes a local "data/" directory already exists — TODO confirm.
fig.write_html(f"data/{NORMALIZED_URLS[0]}_ig_engagement_normalized.html")

# Per Post Basis

In [None]:
# spark.read.parquet(f's3a://circleup-helio-normalized/instagram/version=1_0_0/entity=media/year=2020/month=05').printSchema()

In [None]:
class AdhocDF(BrandMetricDataFrames):
    """Ad-hoc BrandMetricDataFrames subclass that loads only Instagram data.

    Relies on `self.spark`, `self.data_sets` and `_latest_within_date_window`,
    none of which are defined here; presumably they come from the base class —
    TODO confirm.
    """

    def load_and_persist(self, reference_date: datetime.date) -> None:
        """Load and persist the Instagram account and media frames.

        `reference_date` is accepted but not used by this implementation.
        """
        self._load_instagram_data()

    def _load_instagram_data(self) -> None:
        """Populate `instagram_account_data` and `instagram_media_data`."""
        # Newest collection first within each normalized_url.
        by_url_window = Window.partitionBy(c.normalized_url).orderBy(
            c.collected_on.desc()
        )

        self.instagram_account_data = datasets.instagram_account__1_0.read()

        self.instagram_account_data = self.instagram_account_data.select(
            c.collected_on, c.instagram_id, c.followers_count, c.posts_count, 
        )

        brand_map_data_ig = datasets.instagram_brand_map__1_0.read()

        # Keep one brand-map row per normalized_url (the first by the window
        # ordering, i.e. latest collected_on) and mark it for broadcast.
        brand_map_data_ig = (
            brand_map_data_ig.withColumn("row_num", F.row_number().over(by_url_window))
            .where(c.row_num == 1)
            .select(c.normalized_url, c.instagram_id)
            .repartition(32)
        )
        brand_map_data_ig = F.broadcast(brand_map_data_ig)

        # Inner join: accounts without a brand-map entry are dropped.
        self.instagram_account_data = self.instagram_account_data.join(
            brand_map_data_ig, on="instagram_id", how="inner"
        )

        self.instagram_account_data = self.instagram_account_data.repartition(
            c.normalized_url
        )
        # Project helper; its exact filtering rules are not visible here.
        self.instagram_account_data = remove_bad_mapped_accounts(
            self.instagram_account_data, "instagram_id"
        )

        self.instagram_account_data = self.instagram_account_data.persist()

        # Media records are read through the schema-registry helper in 'prod'
        # mode with no execution date.
        self.instagram_media_data = data_set_to_data_frame(
            session=self.spark,
            data_set=self.data_sets["instagram_media"],
            data_set_mode='prod',
            execution_date=None,
        )

        # dropna() removes media rows with a null in any selected column
        # (including caption).
        self.instagram_media_data = self.instagram_media_data.select(
            c.collected_on, c.instagram_id, c.media_id, c.likes_count, c.comments_count, c.media_created_time, c.caption
        ).dropna()
        self.instagram_media_data = self.instagram_media_data.repartition(
            "instagram_id"
        ).persist()

    def instagram_for_date(self, reference_date: datetime.date):
        """Return account rows left-joined to media rows for `reference_date`.

        Both frames are passed through `_latest_within_date_window` (media with
        `filter_stale=False`); accounts without media are kept by the left join.
        """
        by_url_window = Window.partitionBy(c.normalized_url).orderBy(
            c.collected_on.desc()
        )
        by_media_id_window = Window.partitionBy(c.instagram_id, c.media_id).orderBy(
            c.collected_on.desc()
        )

        account = self._latest_within_date_window(
            self.instagram_account_data, by_url_window, reference_date
        ).repartition(c.instagram_id)
        # Renamed so it does not clash with the media frame's collected_on.
        account = account.withColumnRenamed("collected_on", "instagram_collected_on")

        media = self._latest_within_date_window(
            self.instagram_media_data,
            by_media_id_window,
            reference_date,
            filter_stale=False,
        )

        result = account.join(media, on="instagram_id", how="left")
        return result

In [None]:
# Instantiate the ad-hoc loader, load/persist the Instagram frames, then cache
# the joined account/media frame returned for max_dt.
dfs = AdhocDF(spark)
dfs.load_and_persist(max_dt)
ig_post_sdf = dfs.instagram_for_date(max_dt).persist()

In [None]:
# Per-post frame: derive day ("yyyy-MM-dd") and month ("yyyy-MM") strings from
# media_created_time, attach the brand's monthly follower count from ig_sdf,
# and compute likes/comments per follower for each post.
# NOTE(review): ig_sdf.fct_date is converted with to_date() in an earlier cell,
# while the fct_date built here is a "yyyy-MM" string — confirm the join keys
# actually match.
# NOTE(review): earlier cells address ig_sdf through canonical_brand_url; confirm
# that normalized_url, name and instagram_followed_by exist in that dataset.
joined_post_sdf = (
    ig_post_sdf
    .withColumn('post_date', F.date_format(F.col('media_created_time'),"yyyy-MM-dd"))
    .withColumn('fct_date', F.date_format(F.col('media_created_time'),"yyyy-MM"))
    .join(
        ig_sdf.select('normalized_url', 'name','fct_date', 'instagram_followed_by'),
        on=['normalized_url', 'fct_date'],
        how='left'
    ) # can't find where name comes from here, so reused
    .withColumn('likes_pax', F.col('likes_count') / F.col('instagram_followed_by'))
    .withColumn('comments_pax', F.col('comments_count') / F.col('instagram_followed_by'))
)

In [None]:
# Restrict the per-post data to the input brands.
filtered_ig_post_sdf = joined_post_sdf.where(
    F.col('normalized_url').isin(NORMALIZED_URLS)
)

In [None]:
# filtered_ig_post_sdf.count()

In [None]:
# Collect the filtered posts to pandas, ordered by brand URL.
chart_data_1 = filtered_ig_post_sdf.orderBy('normalized_url').toPandas()

In [None]:
# Wrap captions at 50 characters with <br> so hover tooltips stay readable.
wrap_length = 50
chart_data_1['caption'] = chart_data_1['caption'].map(
    lambda caption: add_line_breaks(caption, wrap_length)
)

In [None]:
# Keep only posts that have a date, a brand name and both per-follower ratios.
chart_data_1 = chart_data_1.dropna(subset=['post_date', 'name', 'likes_pax', 'comments_pax'])

In [None]:
# Keep posts from the last two years (string comparison of 'yyyy-MM-dd'
# against 'yyyy-MM') and order by brand URL, descending.
chart_data_1 = (
    chart_data_1
    .loc[chart_data_1['post_date'] >= year2_ago_date]
    .sort_values(by='normalized_url', ascending=False)
)

In [None]:
# Replace the name column with the curated display names, when provided.
if brand_map_di:
    chart_data_1 = chart_data_1.assign(
        name=chart_data_1['normalized_url'].map(brand_map_di)
    )

# Plot

In [None]:
# Per-post bubble chart: likes per follower by post date, bubble area
# proportional to comments per follower; the caption is shown on hover.
fig = px.scatter(
    chart_data_1.sort_values('normalized_url'),
    x='post_date',
    y='likes_pax',
    size='comments_pax',
    color='name',
    labels={
        'likes_pax': 'Likes per Follower',
        'comments_pax': 'Comments per Follower',
        'name': 'Size of Bubble =<br>Comments per Follower<br><br><b>Brand</b>',
        'post_date': 'Date',
    },
    hover_data=['caption'],
    title='Normalized Social Media Engagement Over Time per Post',
    color_discrete_sequence=CU_PLOTLY_COLOR_SEQUENCE,
    opacity=0.5,
)

# Area scaling: 2 * max(size) / (max marker diameter ** 2), with diameter 100.
sizeref = 2 * max(chart_data_1['comments_pax']) / (100 ** 2)
smaller_font = dict(Font.plot_title.value)
smaller_font['size'] = 14
fig.update_traces(
    marker={'sizemode': 'area', 'sizeref': sizeref},
    textfont=smaller_font,
)

fig.update_layout(
    width=1200,
    height=800,
    font=Font.plot_title.value,
    plot_bgcolor="white",
    title={"x": 0.5},
    colorway=CU_PLOTLY_COLOR_SEQUENCE,
    coloraxis={"autocolorscale": True},
)

# Dropdown toggling the y axis between log and linear (linear is active).
updatemenus = [
    {
        'active': 1,
        'buttons': [
            {
                'label': label,
                'method': 'update',
                'args': [
                    {'visible': [True]},
                    {'yaxis': {'type': axis_type, 'title': "Likes per Follower"}},
                ],
            }
            for label, axis_type in (('Log Scale', 'log'), ('Linear Scale', 'linear'))
        ],
        'xanchor': "left",
        'yanchor': "top",
        'y': 1.02,
    }
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

In [None]:
# Export the per-post normalised figure as HTML, named after the first input brand URL.
# Assumes a local "data/" directory already exists — TODO confirm.
fig.write_html(f"data/{NORMALIZED_URLS[0]}_ig_engagement_post.html")

In [None]:
# Per-post bubble chart of raw counts: likes by post date, bubble area
# proportional to the comment count; the caption is shown on hover.
fig = px.scatter(
    chart_data_1,
    x='post_date',
    y='likes_count',
    size='comments_count',
    color='name',
    labels={
        'likes_count': 'Likes',
        'comments_count': 'Comments',
        'name': 'Size of Bubble =<br>Comments<br><br><b>Brand</b>',
        'post_date': 'Date',
    },
    title='Social Media Engagement Over Time per Post',
    hover_data=['caption'],
    color_discrete_sequence=CU_PLOTLY_COLOR_SEQUENCE,
    opacity=0.5,
)

# Area scaling: 2 * max(size) / (max marker diameter ** 2), with diameter 100.
sizeref = 2 * max(chart_data_1['comments_count']) / (100 ** 2)
smaller_font = dict(Font.plot_title.value)
smaller_font['size'] = 14
fig.update_traces(
    marker={'sizemode': 'area', 'sizeref': sizeref},
    textfont=smaller_font,
)

fig.update_layout(
    width=1250,
    height=800,
    font=Font.plot_title.value,
    plot_bgcolor="white",
    title={"x": 0.5},
    colorway=CU_PLOTLY_COLOR_SEQUENCE,
    coloraxis={"autocolorscale": True},
)

# Dropdown toggling the y axis between log and linear (linear is active).
updatemenus = [
    {
        'active': 1,
        'buttons': [
            {
                'label': label,
                'method': 'update',
                'args': [
                    {'visible': [True]},
                    {'yaxis': {'type': axis_type, 'title': "Likes"}},
                ],
            }
            for label, axis_type in (('Log Scale', 'log'), ('Linear Scale', 'linear'))
        ],
        'xanchor': "left",
        'yanchor': "top",
        'y': 1.02,
    }
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

# Last Year Post Aggs

In [None]:
# Aggregate the last year of posts per brand: number of posts and the mean
# likes/comments per follower.
ig_post_agg_sdf = (
    joined_post_sdf
    .where(F.col('post_date') >= year_ago_dt)
    .groupBy('normalized_url', 'name')
    .agg(
        F.count('*').alias('post_count'),
        F.avg('likes_pax').alias('avg_likes_pax'),
        F.avg('comments_pax').alias('avg_comments_pax'),
    )
)

In [None]:
# Collect the yearly aggregates for the input brands to pandas.
chart_data_2 = (
    ig_post_agg_sdf
    .where(F.col('normalized_url').isin(NORMALIZED_URLS))
    .toPandas()
)

In [None]:
# Order by brand URL for inspection.
chart_data_2 = chart_data_2.sort_values(by='normalized_url')

In [None]:
chart_data_2

In [None]:
# Descending URL order; brands lacking either average are dropped.
chart_data_2 = (
    chart_data_2
    .sort_values(by='normalized_url', ascending=False)
    .dropna(subset=['avg_likes_pax', 'avg_comments_pax'])
)

In [None]:
# Replace the name column with the curated display names, when provided.
if brand_map_di:
    chart_data_2 = chart_data_2.assign(
        name=chart_data_2['normalized_url'].map(brand_map_di)
    )

In [None]:
# Yearly summary bubble chart: one labelled bubble per brand, average comments
# per follower (x) against average likes per follower (y), bubble area
# proportional to the number of posts.
fig = px.scatter(
    chart_data_2.sort_values('normalized_url'),
    x='avg_comments_pax',
    y='avg_likes_pax',
    size='post_count',
    color='name',
    text='name',
    labels={
        'avg_likes_pax': 'Average Likes per Follower',
        'avg_comments_pax': 'Average Comments per Follower',
        # Fixed: the bold tag was closed with the malformed '<b/>'.
        'name': 'Size of Bubble =<br>Post Count<br><br><b>Brand</b>',
    },
    title=f'Social Media Engagement Since {year_ago_date}',
    color_discrete_sequence=CU_PLOTLY_COLOR_SEQUENCE,
)

# Area scaling: 2 * max(size) / (max marker diameter ** 2), with diameter 100.
sizeref = 2 * max(chart_data_2['post_count']) / (100 ** 2)
smaller_font = dict(Font.plot_title.value)
smaller_font['size'] = 16
fig.update_traces(
    marker={'sizemode': 'area', 'sizeref': sizeref},
    textfont=smaller_font,
)

fig.update_layout(
    width=1200,
    height=800,
    font=Font.plot_title.value,
    plot_bgcolor="white",
    title={"x": 0.5},
    colorway=CU_PLOTLY_COLOR_SEQUENCE,
    coloraxis={"autocolorscale": True},
    # All x-axis settings kept together: fixed ticks with 4-decimal labels.
    xaxis={
        'tickformat': '.4f',
        'tickmode': 'array',
        'tickvals': [-0.01, 0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08],
    },
#     showlegend=False
)

# Dropdown toggling the y axis between log and linear (linear is active).
updatemenus = [
    {
        'active': 1,
        'buttons': [
            {
                'label': label,
                'method': 'update',
                'args': [
                    {'visible': [True]},
                    {'yaxis': {'type': axis_type, 'title': "Average Likes per Follower"}},
                ],
            }
            for label, axis_type in (('Log Scale', 'log'), ('Linear Scale', 'linear'))
        ],
        'xanchor': "left",
        'yanchor': "top",
        'y': 1.02,
    }
]

fig.update_layout(updatemenus=updatemenus)
fig.show()

In [None]:
# Export the yearly summary figure as HTML, named after the first input brand URL.
# Assumes a local "data/" directory already exists — TODO confirm.
fig.write_html(f"data/{NORMALIZED_URLS[0]}_ig_engagement_year.html")

In [None]:
# Save the yearly aggregate table as CSV in the current working directory.
chart_data_2.to_csv('bain_social_agg.csv')

In [None]:
chart_data_2