In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
assessments_raw = pd.read_csv("dataset/assessments.csv")
courses_raw = pd.read_csv("dataset/courses.csv")
student_assessment_raw = pd.read_csv("dataset/studentAssessment.csv")
student_info_raw = pd.read_csv("dataset/studentInfo.csv")
student_registration_raw = pd.read_csv("dataset/studentRegistration.csv")
# If this is failing, ensure that you have downloaded the dataset and copied studentVle.csv 
# to the dataset folder.
student_vle_raw = pd.read_csv("dataset/studentVle.csv") 
vle_raw = pd.read_csv("dataset/vle.csv")

In [4]:
print(student_assessment_raw.columns)
print(assessments_raw.columns)


Index(['id_assessment', 'id_student', 'date_submitted', 'is_banked', 'score'], dtype='object')
Index(['code_module', 'code_presentation', 'id_assessment', 'assessment_type',
       'date', 'weight'],
      dtype='object')


In [5]:
# read in existing wideform data
df_wide = pd.read_excel('wideform.xlsx')
df_wide.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,highest_education,age_band,num_of_prev_attempts,studied_credits,disability,final_result,...,subpage_16-30,subpage_31-45,subpage_46-60,url_pre-0,url_1-15,url_16-30,url_31-45,url_46-60,sum_click_pre_A1,sum_click_pre_A2
0,AAA,2013J,11391,0,3,2,0,240,0,0,...,2,1,0,0,1,0,0,0,352,175
1,AAA,2013J,28400,1,3,1,0,60,0,0,...,16,1,1,10,9,11,1,1,493,156
2,AAA,2013J,31604,1,2,1,0,60,0,0,...,17,17,3,3,10,8,12,3,390,371
3,AAA,2013J,32885,1,1,0,0,60,0,0,...,8,0,1,3,0,3,0,0,558,64
4,AAA,2013J,38053,0,2,1,0,60,0,0,...,11,3,3,1,4,6,2,3,614,395


In [7]:
#weighted_score_A1
#weighted_score_A2
#df_wide[['weighted_score_A1','weighted_score_A2']].head()

In [8]:
# create for each module-presentation the weight for the first two TMAs
tma_assessments = assessments_raw[assessments_raw['assessment_type']=='TMA']
module_presentations = tma_assessments[['code_module','code_presentation']].drop_duplicates()
modules = tma_assessments[['code_module']].drop_duplicates()

df_first_two_tmas = tma_assessments.sort_values(by='date').groupby(['code_module','code_presentation']).head(2).sort_values(by=['code_module','code_presentation'])
weighted_possible_scores = df_first_two_tmas[['code_module','code_presentation','weight']].groupby(['code_module','code_presentation']).sum().reset_index()
weighted_possible_scores

Unnamed: 0,code_module,code_presentation,weight
0,AAA,2013J,30.0
1,AAA,2014J,30.0
2,BBB,2013B,23.0
3,BBB,2013J,23.0
4,BBB,2014B,23.0
5,BBB,2014J,10.0
6,CCC,2014B,31.0
7,CCC,2014J,31.0
8,DDD,2013B,17.5
9,DDD,2013J,22.5


In [15]:
#df_first_two_tmas
# tmas per module
#tma_assessments.groupby(['code_module','code_presentation']).count()
tma_assessments[['code_module','code_presentation','date']].drop_duplicates()

Unnamed: 0,code_module,code_presentation,date
0,AAA,2013J,19.0
1,AAA,2013J,54.0
2,AAA,2013J,117.0
3,AAA,2013J,166.0
4,AAA,2013J,215.0
...,...,...,...
193,GGG,2014B,117.0
194,GGG,2014B,166.0
202,GGG,2014J,61.0
203,GGG,2014J,124.0


In [17]:
cma_assessments = assessments_raw[assessments_raw['assessment_type']=='CMA']
print(cma_assessments.shape)
cma_assessments[['code_module','code_presentation','date']].drop_duplicates().shape

(76, 6)


(37, 3)

In [39]:
# do basic train-test split (80/20)
from sklearn.model_selection import train_test_split
df = df_wide.copy()
weighted_possible_scores = weighted_possible_scores.rename(columns={'weight':'out_of'})
df = df.merge(weighted_possible_scores, on=['code_module','code_presentation'])

df['grade'] = (df['weighted_score_A1'] + df['weighted_score_A2'])/(df['out_of'])

rule_based_columns = ['code_module','code_presentation','id_student','grade']
label = ['final_result']

X_train, X_test, y_train, y_test = train_test_split(df[rule_based_columns],df[label], test_size=0.2, random_state=42)

Unnamed: 0,code_module,code_presentation,id_student,grade
1705,BBB,2013B,553320,0.147826
2856,BBB,2013J,573487,0.119565
21810,FFF,2014J,652753,0.92
476,AAA,2014J,303521,0.796667
21800,FFF,2014J,652576,0.79


In [69]:
# test rule based method
def pred_rule_based(df):
    df['pred'] = df.apply(lambda x: 0 if x['grade']>0.4 else 1, axis=1)
    return df

train_preds = pred_rule_based(X_train)
train_preds['actual'] = y_train

In [43]:
# test scoring:
from sklearn.metrics import accuracy_score

In [70]:
score = accuracy_score(y_train,train_preds['pred'])
score

0.6436514522821577

In [81]:
# find accuracy by module
test_preds = pred_rule_based(X_test)
test_preds['actual'] = y_test

# print(test_preds.head())

accuracies = {}
for _,mp in module_presentations.iterrows():
    # select only for this course
    df_ = test_preds[(test_preds['code_module']==mp['code_module']) & (test_preds['code_presentation']==mp['code_presentation'])]
    accuracies[(mp['code_module'],mp['code_presentation'])] = np.round(accuracy_score(df_['actual'], df_['pred']),3)

accuracies

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{('AAA', '2013J'): 0.843,
 ('AAA', '2014J'): 0.743,
 ('BBB', '2013B'): 0.645,
 ('BBB', '2013J'): 0.696,
 ('BBB', '2014B'): 0.68,
 ('BBB', '2014J'): 0.668,
 ('CCC', '2014B'): 0.652,
 ('CCC', '2014J'): 0.592,
 ('DDD', '2013B'): 0.598,
 ('DDD', '2013J'): 0.608,
 ('DDD', '2014B'): 0.584,
 ('DDD', '2014J'): 0.605,
 ('EEE', '2013J'): 0.851,
 ('EEE', '2014B'): 0.708,
 ('EEE', '2014J'): 0.747,
 ('FFF', '2013B'): 0.616,
 ('FFF', '2013J'): 0.664,
 ('FFF', '2014B'): 0.549,
 ('FFF', '2014J'): 0.636,
 ('GGG', '2013J'): nan,
 ('GGG', '2014B'): nan,
 ('GGG', '2014J'): nan}

In [75]:
test_preds[(test_preds['code_module']=='EEE') & (test_preds['code_presentation']=='2013J')].shape

(175, 6)

In [77]:
all_preds = pd.concat([train_preds, test_preds])
accuracies_all = {}
for _,mp in module_presentations.iterrows():
    # select only for this course
    df_ = all_preds[(all_preds['code_module']==mp['code_module']) & (all_preds['code_presentation']==mp['code_presentation'])]
    accuracies_all[(mp['code_module'],mp['code_presentation'])] = np.round(accuracy_score(df_['actual'], df_['pred']),3)

accuracies_all

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{('AAA', '2013J'): 0.791,
 ('AAA', '2014J'): 0.774,
 ('BBB', '2013B'): 0.648,
 ('BBB', '2013J'): 0.687,
 ('BBB', '2014B'): 0.661,
 ('BBB', '2014J'): 0.687,
 ('CCC', '2014B'): 0.605,
 ('CCC', '2014J'): 0.612,
 ('DDD', '2013B'): 0.572,
 ('DDD', '2013J'): 0.603,
 ('DDD', '2014B'): 0.585,
 ('DDD', '2014J'): 0.576,
 ('EEE', '2013J'): 0.791,
 ('EEE', '2014B'): 0.738,
 ('EEE', '2014J'): 0.758,
 ('FFF', '2013B'): 0.608,
 ('FFF', '2013J'): 0.631,
 ('FFF', '2014B'): 0.582,
 ('FFF', '2014J'): 0.643,
 ('GGG', '2013J'): nan,
 ('GGG', '2014B'): nan,
 ('GGG', '2014J'): nan}

In [78]:
# accuracy on all
np.round(accuracy_score(all_preds['actual'],all_preds['pred']),3)

0.646

In [82]:
accuracies = {}
for _,mp in modules.iterrows():
    df_ = test_preds[(test_preds['code_module']==mp['code_module'])]
    accuracies[mp['code_module']] = np.round(accuracy_score(df_['actual'], df_['pred']),3)

accuracies

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{'AAA': 0.793,
 'BBB': 0.675,
 'CCC': 0.615,
 'DDD': 0.6,
 'EEE': 0.776,
 'FFF': 0.623,
 'GGG': nan}

In [84]:
accuracies_all = {}
for _,mp in modules.iterrows():
    df_ = all_preds[(all_preds['code_module']==mp['code_module'])]
    accuracies_all[mp['code_module']] = np.round(accuracy_score(df_['actual'], df_['pred']),3)

accuracies_all

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{'AAA': 0.783,
 'BBB': 0.673,
 'CCC': 0.61,
 'DDD': 0.586,
 'EEE': 0.765,
 'FFF': 0.62,
 'GGG': nan}