### Analyze edX BigQuery data for a typical course

This one is for 8.01 2015-16

In [7]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import plotly.plotly as py

df = pd.read_csv('sample_data/data_person_course.csv')

In [8]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [9]:
data = [go.Scatter(x=df.nplay_video,y=df.nproblem_check,text=df.username, mode = 'markers')]
layout = go.Layout(title="nproblem_check vs. nplay_video",
                   xaxis=dict(title='nplay_video'),
                   yaxis=dict(title='nproblem_check'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [10]:
data = [go.Scatter(x=df.nvideos_total_watched,y=df.nshow_answer,text=df.username, mode = 'markers')]
layout = go.Layout(title="nvideos_total_watched vs. nshow_answer",
                   xaxis=dict(title='nvideos_total_watched'),
                   yaxis=dict(title='nshow_answer'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
data = [go.Scatter(x=df.nproblem_check,y=df.nshow_answer,text=df.username, mode = 'markers')]
layout = go.Layout(title="nproblem_check vs. nshow_answer",
                   xaxis=dict(title='nproblem_check'),
                   yaxis=dict(title='nshow_answer'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [12]:
data = [go.Scatter(x=df.ndays_act,y=df.nevents,text=df.username, mode = 'markers')]
layout = go.Layout(title="ndays_act vs. nevents",
                   xaxis=dict(title='ndays_act'),
                   yaxis=dict(title='nevents'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [13]:
x = df.nprogcheck
data = [go.Histogram(x=x, 
                     xbins=dict(
                         start=0,
                         end=40,
                         size=1),
                     opacity=0.75)]
layout = go.Layout(title="nprogcheck",
                   yaxis=dict(title='nprogcheck'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [14]:
dg = pd.read_csv('sample_data/data_time_on_task_total.csv')

In [15]:
problem_time = go.Scatter(x=dg.total_time_30,y=dg.total_problem_time_30,name='problem_time',text=dg.username, mode = 'markers')
video_time = go.Scatter(x=dg.total_time_30,y=dg.total_video_time_30,name='video_time',text=dg.username, mode = 'markers')
text_time = go.Scatter(x=dg.total_time_30,y=dg.total_text_time_30,name='text_time',text=dg.username, mode = 'markers')

data = [problem_time, video_time, text_time]
layout = go.Layout(title="total_time breakdown",
                   xaxis=dict(title='total_time'),
                   yaxis=dict(title='total_time'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [16]:
y0 = dg.total_problem_time_30
y1 = dg.total_video_time_30
y2 = dg.total_text_time_30

trace0 = go.Box(y=y0,name='problem_time',text=dg.username)
trace1 = go.Box(y=y1,name='video_time',text=dg.username)
trace2 = go.Box(y=y2,name='text_time',text=dg.username)

data = [trace0,trace1,trace2]
layout = go.Layout(title="total_time box plot",
                   yaxis=dict(title='total_time (in seconds)'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [17]:
dh = pd.read_csv('sample_data/data_show_ans_stat_by_user.csv')

In [18]:
problems_seen = go.Scatter(x=dh.n_show_answer_problem_seen,y=dh.n_problems_seen,name='problems_seen',text=dg.username, mode = 'markers')
not_attempted = go.Scatter(x=dh.n_show_answer_not_attempted,y=dh.n_not_attempted,name='not_attempted',text=dh.user_id,mode = 'markers')
attempted = go.Scatter(x=dh.n_show_answer_attempted,y=dh.n_attempted,name='attempted',text=dh.user_id, mode = 'markers')

data = [problems_seen,not_attempted,attempted]
layout = go.Layout(title="n_show_answer",
                   yaxis=dict(title='problems'),
                   xaxis=dict(title='n_show_answer'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [19]:
di = pd.read_csv('sample_data/data_person_problem_wide.csv')

In [20]:
trace0 = go.Scatter(x=di.y1,text=df.username,name='y1', mode = 'markers')
trace1 = go.Scatter(x=di.y2,text=df.username,name='y2', mode = 'markers')
trace2 = go.Scatter(x=di.y3,text=df.username,name='y3', mode = 'markers')

data = [trace0,trace1,trace2]

layout = go.Layout(title="problem_score",
                   xaxis=dict(title='pct'),
                   yaxis=dict(title='user_index'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [21]:
dj = pd.read_csv('sample_data/data_chapter_grades.csv')

In [22]:
y0 = (dg.total_problem_time_30/dg.total_time_30)*100
y1 = (dg.total_video_time_30/dg.total_time_30)*100
y2 = (dg.total_text_time_30/dg.total_time_30)*100


trace = go.Scatter(x=dg.username,y=y0,name='problem_time')
trace0 = go.Scatter(x=dg.username,y=y1,name='video_time')
trace1 = go.Scatter(x=dg.username,y=y2,name='text_time')

data = [trace,trace0,trace1]

layout = go.Layout(title="problem_time, video_time, text_time as a percentage of total_time",
                   xaxis=dict(title='username'),
                   yaxis=dict(title='time'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [23]:
y0 = dg.total_time_30

data = [go.Scatter(x=dg.username,y=y0,name='time')]

layout = go.Layout(title="total_time",
                   xaxis=dict(title='username'),
                   yaxis=dict(title='time'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [71]:
np_dg = np.array(dg)

for x in np_dg:
    if x[1] == 'bz3kucs':
        print('total_time: ',x[3])
        print('total_video_time: ',x[5])
        print('total_problem_time: ',x[8])
        print('total_text_time: ',x[14])
        values = [x[5],x[8],x[14]]


labels = ['Video','Problem','Text']
print(values)

trace = go.Pie(labels=labels, values=values)
data = [trace]

layout = go.Layout(title="time breakdown",
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)
    

total_time:  145509.04692000002
total_video_time:  7508.982234999998
total_problem_time:  106860.588111
total_text_time:  570.3559769999999
[7508.982234999998, 106860.588111, 570.3559769999999]


In [83]:
dk = pd.read_csv('sample_data/data_time_on_task.csv')

In [142]:
np_dk = np.array(dk)
dates = []
times = []
problems = []

for x in np_dk:
    if x[2] == 'mmc645a':
        dates.append(x[1])
        times.append(x[4])
        problems.append(x[9])
               
trace0 = go.Scatter(x=dates,y=times, mode = 'lines+markers')
trace1 = go.Scatter(x=dates,y=problems, mode = 'lines+markers')

data = [trace0,trace1]
layout = go.Layout(title="daily time distribution for user mmc645a",
                   xaxis=dict(title='dates'),
                   yaxis=dict(title='time in seconds'),
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)
        


In [144]:
np_df = np.array(df)
#user_id = 534752
user_id = 0

for x in np_df:
    print(str(x[2]))
    if str(x[2]) == 'mmc645a':
        user_id = x[1]
        print(user_id)
    


 tqaecu0
 4x7osj4
 2nzr9jt
 rzqyxpe
 kjf0lxg
 jwbcegz
 2kg15cn
 12r2pjl
 p63f79a
 38v6e5p
 i68dtx9
 maaq8x3
 tt4jhv9
 2z7m4k1
 1m8mtrf
 ezvscbm
 wtgqhwf
 syo0e1l
 tdew7jt
 tfdnmoa
 2bj9zbd
 4q0onuq
 x4z7d9g
 eba1pfy
 q1w27ms
 fgx1yq1
 09nv0qv
 e27cz84
 m4pgmzl
 vzjekyz
 voxh1uw
 4yrah8c
 eqzjx1j
 nd7xn4z
 qwdo51o
 ky1u9wn
 iftqayo
 6qcvnh2
 norfgc5
 ala29iv
 r676xfh
 f6eyjxn
 ieb5p9a
 tkppsk0
 e12q2s3
 2s1ogkh
 ww6ocg6
 3fd1cm1
 mg07yjz
 a4g0pjk
 e8356rk
 g8x409s
 achojpb
 0bxbgi7
 8e9fw4i
 iyphpbl
 frs5vyw
 rjfv5uz
 jf7bmup
 xolejw8
 3eos4mp
 ljf5k84
 pfgr8f9
 u5gdccb
 b432civ
 krr0wu2
 v69e961
 9eanrv4
 l9ki38g
 17297jh
 8bj0xm9
 0beasht
 k08v4u8
 zy687ei
 k8n3vu2
 624lyg7
 q1xz0ms
 nklrc2g
 hugo4q5
 setlup0
 d7056z5
 a3jm0iq
 p5tymy9
 qj6powg
 c3so82e
 bp8dli3
 xkvl742
 d1h2hin
 0srzp8o
 kud1bfy
 blkvs0o
 nvrze3k
 h2lcazt
 xh6teip
 zhyczno
 4hrbzpf
 5hjwmd3
 etj0kup
 sgrpwhh
 oyp8cwz
 bkgx58f
 hazla5j
 d9c4mbs
 nc4r8or
 ld1u03p
 5sdzsxk
 ehk78xs
 ikgzqpi
 bklnb62
 aps7l82
 xvl37tx
 