In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
import scipy
%load_ext autoreload
%autoreload 2

# Select Altair's 'data_server' data transformer for every chart built below.
# NOTE(review): presumably chosen so the frames are not embedded inline in each
# chart spec — confirm against the Altair data-transformer documentation.
alt.data_transformers.enable('data_server')
# Select the 'mimetype' renderer. This is the cell's last expression, so the
# value returned by enable() is echoed as the cell output.
alt.renderers.enable('mimetype')

RendererRegistry.enable('mimetype')

In [2]:
# Initial wrangling: read the discussion, gradebook and module exports, then
# join them into one table of discussion events with each learner's grades.

# All exports are read from the same folder, one level above the notebook.
additional_dir = os.path.join(os.pardir, "data", "additional")

# Discussion events, and the three topic columns needed to label them.
disc_events = pd.read_csv(os.path.join(additional_dir, "discussions.csv"))
disc_topics = pd.read_csv(os.path.join(additional_dir, "discussion_topics.csv"))[
    ['id', 'title', 'todo_date_date']
]

# Outer join: keeps topics without events and events without a matching topic.
disc_merged = disc_topics.merge(
    disc_events,
    how="outer",
    left_on='id',
    right_on='discussion_topic_id',
)

grades = pd.read_csv(os.path.join(additional_dir, "gradebook.csv"))

# Every gradebook column name, kept for reference.
grade_cols = grades.columns.tolist()

# The gradebook columns carried into the merged table.
grade_cols_incldue = [
    'Student',
    'Assignment 1 Current Score',
    'Assignment 2 Current Score',
    'Assignment 3 Current Score',
    'Participation & engagement Current Score',
    'Current Score',
]

# Gradebook subset to merge; the rows labelled 0 and 1 are dropped because
# they do not contain important info, and the index is renumbered afterwards.
grade_merge = (
    grades.loc[:, grade_cols_incldue]
    .drop(index=[0, 1])
    .reset_index(drop=True)
)

module_items = pd.read_csv(os.path.join(additional_dir, "module_items.csv"))
module_discussions = module_items[['title', 'module_id', 'module_name']]

# Attach module details to each row by matching on the topic title.
disc_merged2 = disc_merged.merge(module_discussions, how='left', on='title')

# Inner join on the learner id: only rows whose actor appears in the gradebook
# subset survive.
full_table = disc_merged2.merge(
    grade_merge,
    how='inner',
    left_on='actor_id',
    right_on='Student',
)
full_table['timestamp'] = pd.to_datetime(full_table['timestamp'])
full_table['Current Score'] = full_table['Current Score'].astype(float)

full_table.head()

Unnamed: 0,id,title,todo_date_date,actor_id,membership_role,timestamp,post_id,post_parent_id,discussion_topic_title,discussion_topic_id,...,post_message_length,count_of_likes,module_id,module_name,Student,Assignment 1 Current Score,Assignment 2 Current Score,Assignment 3 Current Score,Participation & engagement Current Score,Current Score
0,132f3fab56d60839d727b966a76c1b1e,Assignment 1 Discussion forum,,LEARNER_3,"[""Learner""]",2033-01-28 06:55:40+00:00,abb6c24171f8b195cf0d050858e4ff18,cf6816ae103ab5e2c82d6595eb49e02e,Assignment 1 Discussion forum,132f3fab56d60839d727b966a76c1b1e,...,564.0,0.0,,,LEARNER_3,94,92,82,90,89.0
1,132f3fab56d60839d727b966a76c1b1e,Assignment 1 Discussion forum,,LEARNER_3,"[""Learner""]",2033-01-29 05:54:31+00:00,3ac8537702d3d7d8bf033d73a4491591,a250f9cde1c58d4dba3d4f918e82a018,Assignment 1 Discussion forum,132f3fab56d60839d727b966a76c1b1e,...,226.0,0.0,,,LEARNER_3,94,92,82,90,89.0
2,132f3fab56d60839d727b966a76c1b1e,Assignment 1 Discussion forum,,LEARNER_3,"[""Learner""]",2033-02-25 08:02:17+00:00,4b399c6443b76327cda3cf5cca76441a,108d12eb22a137b729bfa773f70df5b2,Assignment 1 Discussion forum,132f3fab56d60839d727b966a76c1b1e,...,58.0,0.0,,,LEARNER_3,94,92,82,90,89.0
3,132f3fab56d60839d727b966a76c1b1e,Assignment 1 Discussion forum,,LEARNER_3,"[""Learner""]",2033-02-25 08:03:05+00:00,8e41a53c2213fc897bce39b56d59ddd4,71b73cc230589c4a8991dcaed3ce53c2,Assignment 1 Discussion forum,132f3fab56d60839d727b966a76c1b1e,...,98.0,0.0,,,LEARNER_3,94,92,82,90,89.0
4,e1f90c16c123e0f96b2af7d94a1c335c,Introduce yourself,2033-01-15 20:06:09+00:00,LEARNER_3,"[""Learner""]",2033-01-10 13:20:12+00:00,45ba74eafbac3a413fbcf903bbf3ba2e,05fbad9bf011e3f22026af152329e25b,Introduce yourself,e1f90c16c123e0f96b2af7d94a1c335c,...,245.0,0.0,0347a47759cd8e24b01a5a76fe965be9,Week 1: Getting started,LEARNER_3,94,92,82,90,89.0


In [3]:
import datetime as dt



def discussion_summary(session_df, start_date, end_date):
    """Summarise discussion activity per student inside a date window.

    Parameters
    ----------
    session_df : pandas.DataFrame
        One row per post. Must provide the columns ``timestamp`` (datetime
        dtype), ``Student``, ``post_id``, ``count_of_likes``,
        ``post_message_length`` and ``Current Score``. The frame is not
        modified.
    start_date, end_date : datetime.date
        Window bounds. Both are EXCLUSIVE: only posts whose calendar date is
        strictly after ``start_date`` and strictly before ``end_date`` are
        counted.

    Returns
    -------
    pandas.DataFrame
        One row per student with at least one post in the window, with the
        columns ``Student``, ``post_count``, ``post_count__perc``,
        ``like_sum``, ``like_sum__perc``, ``message_sum``, ``message_mean``,
        ``message_sum__perc`` and ``current_score``, in that order.

        Each ``*__perc`` column is the value's position (0-1) under a normal
        distribution fitted to that column (mean and population standard
        deviation of the per-student values). It is NaN when every student
        has the same value, because the fitted spread is then zero.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is available as an attribute on older SciPy releases.
    from scipy import stats

    def _normal_percentile(values):
        """Normal CDF of each value under the column's own mean and std."""
        return stats.norm.cdf(values, values.mean(), values.std(ddof=0))

    post_dates = session_df['timestamp'].dt.date
    in_window = (post_dates > start_date) & (post_dates < end_date)

    # A single grouped aggregation keeps each column's natural dtype, so the
    # post counts stay integers instead of being coerced to float.
    per_student = session_df[in_window].groupby('Student').agg(
        post_count=('post_id', 'count'),
        like_sum=('count_of_likes', 'sum'),
        message_sum=('post_message_length', 'sum'),
        message_mean=('post_message_length', 'mean'),
        current_score=('Current Score', 'mean'),
    )

    for column in ('post_count', 'like_sum', 'message_sum'):
        per_student[column + '__perc'] = _normal_percentile(per_student[column])

    column_order = [
        'post_count', 'post_count__perc',
        'like_sum', 'like_sum__perc',
        'message_sum', 'message_mean', 'message_sum__perc',
        'current_score',
    ]
    results = per_student[column_order].reset_index()

    return results

# Date window handed to discussion_summary for the example summary below.
start_date = dt.date(year=2033, month=2, day=25)
end_date = dt.date(year=2033, month=3, day=25)

test_vis = discussion_summary(
    session_df=full_table,
    start_date=start_date,
    end_date=end_date,
)

test_vis


Unnamed: 0,Student,post_count,post_count__perc,like_sum,like_sum__perc,message_sum,message_mean,message_sum__perc,current_score
0,LEARNER_1,3.0,0.207404,1.0,0.354908,6756.0,2252.0,0.230049,80.9
1,LEARNER_10,12.0,0.593939,5.0,0.946568,13683.0,1140.25,0.561791,78.4
2,LEARNER_11,6.0,0.321177,1.0,0.354908,5756.0,959.333333,0.192759,84.3
3,LEARNER_12,7.0,0.364148,1.0,0.354908,8403.0,1200.428571,0.299418,81.2
4,LEARNER_13,14.0,0.681438,5.0,0.946568,14112.0,1008.0,0.583514,85.5
5,LEARNER_14,6.0,0.321177,1.0,0.354908,17299.0,2883.166667,0.733126,84.4
6,LEARNER_15,3.0,0.207404,0.0,0.192631,3240.0,1080.0,0.116521,76.2
7,LEARNER_16,17.0,0.794682,1.0,0.354908,28471.0,1674.764706,0.980514,82.0
8,LEARNER_17,2.0,0.175545,0.0,0.192631,2352.0,1176.0,0.095574,65.7
9,LEARNER_18,15.0,0.721983,0.0,0.192631,14735.0,982.333333,0.614594,69.7


In [4]:
# Likes per learner: bars of count_of_likes for each actor_id, with the
# actors ordered by the x-axis value.
likes_by_learner = alt.Chart(disc_merged).mark_bar()
likes_by_learner.encode(
    x='count_of_likes',
    y=alt.Y('actor_id', sort='x'),
)

  for col_name, dtype in df.dtypes.iteritems():


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [5]:
# Post length per discussion topic, drawn as one panel per actor_id
# (three panels per row).
post_length_bars = alt.Chart(full_table).mark_bar().encode(
    x='post_message_length',
    y='title',
    color='post_message_length',
)
post_length_bars.facet('actor_id', columns=3)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [6]:
# Likes per assignment: bars of count_of_likes for each topic title, with the
# titles ordered by the x-axis value.
likes_by_title = alt.Chart(disc_merged).mark_bar()
likes_by_title.encode(
    x='count_of_likes',
    y=alt.Y('title', sort='x'),
)

  for col_name, dtype in df.dtypes.iteritems():


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [8]:
# Heatmap of course grade against message length, coloured by record count.
# Only rows missing one of the two plotted columns are dropped. A bare
# dropna() would also discard every post that has a NaN in any unrelated
# column (e.g. todo_date_date or module_id), silently thinning the chart.
heatmap_rows = full_table.dropna(subset=['Current Score', 'post_message_length'])

alt.Chart(heatmap_rows).mark_rect().encode(
    alt.X('Current Score', bin=alt.Bin(maxbins=50), title='Course grade'),
    alt.Y('post_message_length', bin=alt.Bin(maxbins=40), title='Message length'),
    alt.Color('count()', title='Number of Records'))

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [9]:
# Mean post length against course grade, coloured by student, with the
# student shown in the tooltip.
mean_length_vs_grade = alt.Chart(full_table).mark_circle()
mean_length_vs_grade.encode(
    x='Current Score',
    y=alt.Y('mean(post_message_length)', title='Mean post length'),
    color='Student',
    tooltip='Student',
)

  for col_name, dtype in df.dtypes.iteritems():


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html
