In [2]:
# Standard plotly imports
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Data science imports
import pandas as pd
import numpy as np

In [8]:
# Data
df = pd.read_parquet("medium_data_2019_01_06")
df

Unnamed: 0,claps,days_since_publication,fans,link,num_responses,publication,published_date,read_ratio,read_time,reads,...,type,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python
119,2,574.858594,2,https://medium.com/p/screw-the-environment-but...,0,,2017-06-10 14:25:00,41.98,7,68,...,published,162,1859,0.001076,0,0,0,0,0,0
118,18,567.540639,3,https://medium.com/p/the-vanquishing-of-war-pl...,0,,2017-06-17 22:02:00,32.93,14,54,...,published,164,3891,0.004626,0,0,0,0,0,0
121,50,554.920762,19,https://medium.com/p/capstone-project-mercedes...,0,,2017-06-30 12:55:00,20.19,42,215,...,published,1065,12025,0.004158,0,0,0,0,1,1
122,0,554.078160,0,https://medium.com/p/home-of-the-scared-5af0fe...,0,,2017-07-01 09:08:00,35.85,9,19,...,published,53,2533,0.000000,0,0,0,0,0,0
114,0,550.090507,0,https://medium.com/p/the-triumph-of-peace-f485...,0,,2017-07-05 08:51:00,8.77,14,5,...,published,57,3892,0.000000,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,443,7.975446,97,https://towardsdatascience.com/the-copernican-...,2,Towards Data Science,2018-12-29 11:36:00,30.59,8,521,...,published,1703,1898,0.233404,0,1,0,1,0,0
18,412,5.713865,83,https://towardsdatascience.com/analyzing-mediu...,2,Towards Data Science,2018-12-31 17:53:00,20.47,12,457,...,published,2232,2813,0.146463,1,1,1,1,0,1
0,329,4.115077,34,https://medium.com/the-reality-project/announc...,1,The Reality Project,2019-01-02 08:15:00,39.23,6,164,...,published,418,1685,0.195252,1,1,0,0,0,0
1,309,4.064114,29,https://medium.com/the-reality-project/the-dis...,2,The Reality Project,2019-01-02 09:29:00,28.75,10,115,...,published,400,2278,0.135645,0,1,0,0,0,0


In [9]:
# Single Variable Distributions: Histograms and Boxplots
df['claps'].iplot(kind='hist', xTitle='claps', yTitle='count', title='Claps Distribution')

In [12]:
#  Overlaid histogram
def to_time(dt):
    return dt.hour + dt.minute / 60

df['time_started'] = df['started_date'].apply(to_time)
df['time_published'] = df['published_date'].apply(to_time)

df[["time_published", "time_started"]].iplot(
    kind="hist",
    bins=24,
    linecolor="black",
    opacity=0.8,
    histnorm="percent",
    barmode="overlay",
    xTitle="Time of day",
    yTitle="(%) of articles",
    title="Time Started and Time Published Overlaid",
)
#df[['time_started', 'time_published']].iplot(kind='hist', histnorm='percent', 
#                                           barmode='overlay', xTitle='Time of Day', 
#                                           yTitle='(%) of Article', title='Time Started and Time Published')

In [27]:
# Bar Plot
# For a bar plot, we need to apply some sort of aggregation function and then plot
df.groupby('publication').count()['fans'].iplot(
    kind='bar',
    yTitle='Number of Articles',
    linecolor='black',
    title='Articles by Publication'
)

df[[c for c in df if '<tag>' in c]].sum().iplot(
    kind='bar',
    xTitle='Tag',
    yTitle='Number of Articles with Tag',
    title='Frequency of Tags',
    sortbars=True
)

# Resample to monthly frequency and plot 
df2 = df[['views', 'reads', 'published_date']].set_index('published_date').\
        resample('M').mean()

df2.iplot(kind='bar', xTitle='Date', yTitle='Average',
         title='Monthly Average Views and Reads')

In [34]:
# Boxplot
df.pivot(columns='publication', values='fans').iplot(
        kind='box', yTitle='fans', title='Fans Distribution by Publication')

df[['claps', 'fans']].iplot(
    secondary_y='fans', secondary_y_title='Fans',
    kind='box', yTitle='Claps', title='Box Plot of Claps and Fans')

df[df['read_time'] <= 10].pivot(columns='read_time', values='reads').iplot(
    kind='box', colorscale='set2', xTitle='Read Time', yTitle='Number of Reads',
    title='Box Plot of Reads by Reading Time')

In [47]:
# Scatterplots
# + Time-Series

# Create a dataframe of Towards Data Science Articles
tds = df[df['publication'] == 'Towards Data Science'].set_index('published_date')

tds['read_time'].iplot(mode='lines+markers', opacity=0.8, size=8, symbol=1,
                      xTitle='Date', yTitle='Read Time (min)', title='Read Time Trends')

# Plot read time as a time series
tds[['claps', 'fans', 'title']].iplot(
    y='claps', mode='lines+markers', secondary_y='fans',
    secondary_y_title='Fans', xTitle='Date', yTitle='Claps', text='title',
    title='Fans and Claps over Time')

# Add in text annotations
tds_monthly_totals = tds.resample('M').sum()
tds_monthly_totals['text'] = [
    f'<span style="color:blue">{m}<span><br>words: {w:.0f}'
    for m, w in zip(
        tds_monthly_totals.index.month_name(), tds_monthly_totals["word_count"]
    )
]

tds_monthly_totals.iplot(mode='lines+markers+text', text='text', y='word_count',
                        opacity=0.8, xTitle='Date', yTitle='Word Count', 
                        title='Total Word Count by Month')

In [55]:
# Scatterplot By Category
df.iplot(x='read_time', y='read_ratio', 
        # Specify the category
        categories='publication', 
        xTitle='Read Time', yTitle='Reading Percent',
        title='Reading Percent vs Read Ratio by Publication')

# Log Scale
layout = dict(
    xaxis=dict(type="log", title="Word Count"),
    yaxis=dict(type="linear", title="views"),
    title="Views vs Word Count Log Axis",
)

df.sort_values("word_count").iplot(
    x="word_count",
    y="views",
    layout=layout,
    text="title",
    mode="markers",
    bestfit=True,
    bestfit_colors=["blue"],
)

tds.sort_values("word_count").iplot(
    x="word_count",
    y="views",
    layout=layout,
    text="title",
    mode="markers",
    bestfit=True,
    bestfit_colors=["blue"],
)

In [60]:
# Advanced Plots
# + Scatter Matrix
import plotly.figure_factory as ff

figure = ff.create_scatterplotmatrix(
    df[['claps', 'publication', 'views', 'read_ratio', 'word_count']], 
    height=1000, width=1000, text=df["title"],
    diag='histogram', index='publication')
iplot(figure)

In [64]:
# Correlation Heatmap
corrs = df.corr()

figure = ff.create_annotated_heatmap(
    z=corrs.values, x=list(corrs.columns), y=list(corrs.index),
    colorscale='Earth', annotation_text=corrs.round(2).values, 
    showscale=True, reversescale=True)

figure.layout.margin=dict(l=200,t=200)
figure.layout.height=800
figure.layout.width=1000

iplot(figure)