In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
stuvle = pd.read_csv('data/stvl_fff14j.csv')
stuas = pd.read_csv('data/stas_fff14j.csv')
assess = pd.read_csv('data/ass_fff14j.csv')

In [3]:
stuinfo = pd.read_csv('data/stin_fff14j.csv')
stureg = pd.read_csv('data/strg_fff14j.csv')

# Clean data

In [4]:
stuvle.drop(['code_module', 'code_presentation'], axis=1, inplace=True)

In [5]:
stuvle.head()

Unnamed: 0,id_student,id_site,date,sum_click
0,2398260,883041,-18,1
1,2398260,882602,-18,1
2,2398260,882587,-18,2
3,2398260,883142,-18,1
4,2398260,883092,-18,1


In [6]:
len(stuvle)

1210359

In [7]:
stuvle_grpd = stuvle.groupby(['id_student', 'id_site', 'date'])

In [8]:
len(stuvle_grpd)

965223

In cell below, we see that contrary to official doc, the rows are per session, rather than per day.

In [57]:
len(stuvle_grpd.count()[stuvle_grpd.count()['sum_click'] > 1])

179402

Creating df with one row for each day...

In [13]:
stuvle_daily = stuvle.groupby(['id_student', 'id_site', 'date']).sum()

In [15]:
stuvle_daily.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_click
id_student,id_site,date,Unnamed: 3_level_1
2697921,973777,212,3
2697921,973777,224,16
2697921,973777,230,21
2697921,973777,240,38
2697921,973777,241,29


In [16]:
stuvle_daily.to_csv('data/stvl_fff14j_daily.csv', index=False, header=True)

In [27]:
len(stuvle)

1210359

In [28]:
len(stuvle_daily)

965223

#### Assessments

In [65]:
stuas.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,34899,560047,24,0,84.0
1,34899,560459,29,0,48.0
2,34899,560645,23,0,100.0
3,34899,560765,23,0,78.0
4,34899,561231,22,0,80.0


In [19]:
stuas_cnt = stuas.groupby(['id_student', 'id_assessment']).count()

Next two cells show that: 1) no students transferred a score from earlier presentation, and 2) no students submitted an assessment more than once. (most likely are not able to)

In [21]:
stuas_cnt[stuas_cnt['is_banked'] > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,date_submitted,is_banked,score
id_student,id_assessment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [22]:
stuas_cnt[stuas_cnt['date_submitted'] > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,date_submitted,is_banked,score
id_student,id_assessment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [68]:
assess

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,FFF,2014J,34904,CMA,241.0,0.0
1,FFF,2014J,34905,CMA,241.0,0.0
2,FFF,2014J,34906,CMA,241.0,0.0
3,FFF,2014J,34907,CMA,241.0,0.0
4,FFF,2014J,34908,CMA,241.0,0.0
5,FFF,2014J,34910,CMA,241.0,0.0
6,FFF,2014J,34909,CMA,241.0,0.0
7,FFF,2014J,34899,TMA,24.0,12.5
8,FFF,2014J,34900,TMA,52.0,12.5
9,FFF,2014J,34901,TMA,94.0,25.0


Note that the final exam is id no. 34911

In [73]:
stuas[stuas.id_assessment.isin(['34911'])]

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score


In [74]:
stuas.groupby(['id_assessment']).count()

Unnamed: 0_level_0,id_student,date_submitted,is_banked,score
id_assessment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34899,1826,1826,1826,1824
34900,1601,1601,1601,1598
34901,1398,1398,1398,1398
34902,1307,1307,1307,1304
34903,1137,1137,1137,1136
34904,1454,1454,1454,1454
34905,1363,1363,1363,1363
34906,1254,1254,1254,1254
34907,1234,1234,1234,1234
34908,1213,1213,1213,1213


And the final exam is apparently not being used in this course.

In [88]:
stuas_stusite = stuas.groupby(['id_student', 'id_assessment'])

In [95]:
stuas_stusite.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_submitted,is_banked,score
id_student,id_assessment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31296,34899,1,1,1
31296,34900,1,1,1
31296,34901,1,1,1
31296,34902,1,1,1
31296,34903,1,1,1
31296,34904,1,1,1
31296,34905,1,1,1
31296,34906,1,1,1
31296,34907,1,1,1
31296,34908,1,1,1


In [100]:
stuas.groupby('date_submitted', 'id_assessment').count()

ValueError: No axis named id_assessment for object type <class 'type'>

# Basic statistics

How many students dropped out or failed? How many took it twice, thrice, etc?

# Basic logistic regression

In [75]:
stuas.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,34899,560047,24,0,84.0
1,34899,560459,29,0,48.0
2,34899,560645,23,0,100.0
3,34899,560765,23,0,78.0
4,34899,561231,22,0,80.0
