In [1]:
import glob
import pandas as pd
from datetime import datetime
import plotly.express as px
from ipywidgets import widgets

In [2]:
path = 'data/'

In [3]:
all_games = glob.glob(path + "World*.csv")
games = pd.concat((pd.read_csv(f, usecols=[1, 2, 3, 4, 5], header=None) for f in all_games), axis=0, ignore_index=True)
games = games.rename(columns={1:"Streamer", 2:"Date", 3:"Category", 4:"Viewership", 5:"Langage"})
print(f" ** Memory usage of the file - {sum(games.memory_usage()) * 0.000001} MB for {len(games.index)} Rows")

 ** Memory usage of the file - 11.490207999999999 MB for 287252 Rows


In [4]:
languages = pd.read_csv('languages.csv', sep='|')

In [5]:
# map Langage values to languages names
games['Langage'] = games['Langage'].map(languages.set_index('code')['fr'])

In [6]:
games['Date'] = pd.to_datetime(games['Date'])

In [7]:
games.head()

Unnamed: 0,Streamer,Date,Category,Viewership,Langage
0,Emiru,2022-04-01 00:00:03,Just Chatting,45368,anglais
1,ROSHTEIN,2022-04-01 00:00:03,Slots,34077,anglais
2,HasanAbi,2022-04-01 00:00:03,Just Chatting,30861,anglais
3,TSM_ImperialHal,2022-04-01 00:00:03,Apex Legends,29138,anglais
4,alanzoka,2022-04-01 00:00:03,Midnight Ghost Hunt,28314,portugais


## AVERAGE VIEWERS PER STREAMER

In [8]:
avg = games[['Streamer', 'Langage', 'Viewership']].groupby(['Streamer', 'Langage']).agg(['mean', 'count'])['Viewership']
avg = avg.reset_index()
avg['mean'] = avg['mean'].apply(lambda x : int(x))
avg['count'] = avg['count'].apply(lambda x : float(x/4))

In [9]:
avg = avg[avg['count'] >= 1 ]

In [10]:
avg

Unnamed: 0,Streamer,Langage,mean,count
2,1PVCS,français,5860,31.00
3,1pos_terroni,russe,2792,1.50
4,2010misterchip,espagnol,5090,1.25
7,39daph,anglais,6277,128.25
8,3gerardpique,espagnol,104984,1.25
...,...,...,...,...
3617,해야님_,coréen,10482,1.00
3618,햇살살,coréen,2016,1.25
3619,헤징,coréen,2282,3.75
3621,형독방송,coréen,2803,1.25


In [12]:
fig = px.scatter(
    avg,
    x='mean',
    y='Langage',
    log_x=True,
    color='Streamer',
    size='count',
    size_max=50,
    hover_name='Streamer',
    hover_data=['mean', 'Langage', 'count'],
    height=800,
    width=1300,
    title='Viewers per Langage per Streamer',
    template='plotly_white',
    labels=False,
    # color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(showlegend=False)
fig.show()

In [40]:
import plotly.graph_objects as go
# iterate over dataset to make boxplot for each language
# make a red colorbar and set the range to 0-100
# give each boxplot a different color according to the number of points in the boxplot
# hide the points
# import sqrt
from math import sqrt

fig = go.Figure()
for i in range(len(avg['Langage'].unique())):
    fig.add_trace(go.Box(
        x=avg[avg['Langage'] == avg['Langage'].unique()[i]]['Langage'],
        y=avg[avg['Langage'] == avg['Langage'].unique()[i]]['mean'],
        name=avg['Langage'].unique()[i],
        boxmean=True,
        jitter=avg[avg['Langage'] == avg['Langage'].unique()[i]].shape[0]/avg[avg['Langage'] == 'anglais'].shape[0],
    ))

fig.update_layout(
    title='Viewership by Langage',
    yaxis_title='Viewership',
    xaxis_title='Langage',
    # boxmode='group',
    template='plotly_white',
    height=700,
    width=1200,
    showlegend=False,
)
fig.update_yaxes(type="log")

fig.show()


In [12]:
def scatter(df):
    fig = px.scatter(
        df,
        x='mean',
        y='count',
        color='Langage',
        size_max=1,
        hover_name='Streamer',
        hover_data=['mean', 'Langage'],
        height=700,
        width=900,
        title='Average Viewership vs Hours streamed',
        template='plotly_white',
        labels=False,
        color_discrete_sequence=px.colors.qualitative.Dark24,
    )
        
    # change y axis label to hours streamed
    fig.update_yaxes(title_text='Hours streamed')
    # change x axis label to average viewership
    fig.update_xaxes(title_text='Average Viewership')

    return fig

langage = widgets.Dropdown(
    options=['All'] + avg['Langage'].unique().tolist(),
    value='All',
    description='Langage:',
    disabled=False,
)

def update_plot(langage):
    df = avg.copy()
    if langage != 'All':
        df = df[df['Langage'] == langage]
    else :
        df = df
    
    # Plot it (only if there's data to plot)
    if len(df) > 0:
        scatter(df).show()
        
widgets.interact(update_plot, langage=langage)


interactive(children=(Dropdown(description='Langage:', options=('All', 'français', 'russe', 'espagnol', 'angla…

<function __main__.update_plot(langage)>

In [22]:
df = games[['Streamer', 'Category', 'Viewership', 'Date']].groupby(['Streamer', 'Category', 'Date']).agg(['mean'])['Viewership'].reset_index()
df = df.groupby(['Category', 'Date']).agg(['sum'])['mean'].reset_index()
df = df.groupby(['Category']).agg(['sum'])['sum'].reset_index()
df = df.sort_values(by=['sum'], ascending=False)





In [31]:
fig = px.bar(
    df.head(20),
    y='sum',
    x='Category',
    color='Category',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=700,
    width=1200,
    title='Most Streamed Categories',
    template='plotly_white',
    labels=False,
) 
fig.update_layout(showlegend=False)
fig.update_yaxes(title_text='Cumulative Viewership')
fig.show()


## SUNBURST JOURS => LANGUES => STREAMER (Viewers)

https://plotly.com/python/sunburst-charts/


Structure de données:</br>
["Jours de la semaine", "Langues", "Streamers"]</br>
parents = ["", "Jours de la semaine", "Langues"]</br>
values = [total views per day, total views per langage, Average viewers per streamer]</br>
Peu importe si la donnée chiffrée est erronée, tant que les proportions sont respectées

In [86]:
day = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

In [87]:
gamesDay = games.groupby([pd.Grouper(key='Date', axis=0, freq='D'), 'Langage']).agg('sum').reset_index()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [88]:
gamesDay['Date'] = gamesDay['Date'].apply(lambda x : day[x.weekday()])

In [89]:
gamesDay.head()

Unnamed: 0,Date,Langage,Viewership
0,Friday,Honh Kong,49214
1,Friday,allemand,2920628
2,Friday,anglais,48107879
3,Friday,arabe,30916
4,Friday,chinois,1374876


In [94]:
fig = px.sunburst(
    gamesDay,
    path=["Date", "Langage"],
    values="Viewership",
    template='plotly_white',
)

fig.update_layout(
    extendsunburstcolors=True,
    title_text='Total Viewership per Day per Langage',
    title_x=0.5,
    # title_y=0.5,
    title_font_size=20,
    title_font_family='Arial',
    # title_font_color='white',
    title_xanchor='center',
    # title_yanchor='auto',
    title_xref='paper',
    title_yref='paper',
)

fig.show()