# code separated out into code blocks

add comments from other files

In [1]:


zipfile_ou = '../data/anonymisedData.zip'
prediction_point = 200

import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt




In [3]:


def load_data(zip_file_path):
    '''Loads the data from the Open University Learning Analytics dataset zip file.'''
    
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        registrations = pd.read_csv(zip_file.open('studentRegistration.csv'))
        courses = pd.read_csv(zip_file.open('courses.csv'))
        students = pd.read_csv(zip_file.open('studentInfo.csv'))
        student_vle = pd.read_csv(zip_file.open('studentVle.csv'))
        vle = pd.read_csv(zip_file.open('vle.csv'))
        student_assessments = pd.read_csv(zip_file.open('studentAssessment.csv'))
        assessments = pd.read_csv(zip_file.open('assessments.csv'))
    
    return registrations, courses, students, student_vle, vle, student_assessments, assessments






In [70]:
registrations, courses, students, student_vle, vle, student_assessments, assessments = load_data(zipfile_ou)

In [71]:

# merge students and courses
student_regist = pd.merge(students, courses, on=['code_module', 'code_presentation'], validate='many_to_one')

# merge registrations
student_regist = pd.merge(student_regist, registrations, on=['code_module', 'code_presentation', 'id_student'], how = 'left', validate='1:1')


In [72]:

#prediction_point default
if prediction_point is None:
    prediction_point = max(student_regist['module_presentation_length'])


In [73]:

# drop missing value rows (date_registration, imd_band)
student_regist.dropna(subset=['date_registration', 'imd_band'], inplace=True)


In [74]:

# prediction point must be less than course length, integer, and greater than 0
if not isinstance(prediction_point, int) or prediction_point <= 0 or prediction_point >= max(student_regist['module_presentation_length']):
    print("Error: Invalid prediction point. \n\nPlease provide an integer value greater than 0 and less than the maximum course length. \n\nThis is the number of days from the start of the course for which you want to predict the outcome.")
else:
    # withdrawn or failed before prediction point - remove
    withdrawn_fail_condition = (student_regist['final_result'].isin(['Withdrawn', 'Fail'])) & (student_regist['date_unregistration'] <= prediction_point)
    student_regist.loc[withdrawn_fail_condition, 'status'] = 'remove_outcome_known'
    # if unregister after prediction point - keep
    unregister_after_condition = student_regist['date_unregistration'] > prediction_point
    student_regist.loc[unregister_after_condition, 'status'] = 'keep'
    # if no unregistration date - keep
    no_unregistration_condition = student_regist['date_unregistration'].isna()
    student_regist.loc[no_unregistration_condition, 'status'] = 'keep'
    # default case
    student_regist.loc[~(withdrawn_fail_condition | unregister_after_condition | no_unregistration_condition), 'status'] = 'query'

    # rows which need investigation
    query_rows = student_regist[student_regist['status'] == 'query'] | student_regist[student_regist['status'].isna()]


In [75]:
query_rows.info()
not query_rows.empty

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   code_module                 0 non-null      bool 
 1   code_presentation           0 non-null      bool 
 2   id_student                  0 non-null      int64
 3   gender                      0 non-null      bool 
 4   region                      0 non-null      bool 
 5   highest_education           0 non-null      bool 
 6   imd_band                    0 non-null      bool 
 7   age_band                    0 non-null      bool 
 8   num_of_prev_attempts        0 non-null      int64
 9   studied_credits             0 non-null      int64
 10  disability                  0 non-null      bool 
 11  final_result                0 non-null      bool 
 12  module_presentation_length  0 non-null      int64
 13  date_registration           0 non-null      bool 
 14  date_unregistration   

False

In [76]:

# print rows which need investigation
if not query_rows.empty:
    print("The following rows need investigation.  They are excluded from the following analysis: \n")
    print(query_rows)
    student_regist = student_regist[~student_regist.isin(query_rows)].dropna()


In [86]:
student_regist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31437 entries, 0 to 32592
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 31437 non-null  object 
 1   code_presentation           31437 non-null  object 
 2   id_student                  31437 non-null  int64  
 3   gender                      31437 non-null  object 
 4   region                      31437 non-null  object 
 5   highest_education           31437 non-null  object 
 6   imd_band                    31437 non-null  object 
 7   age_band                    31437 non-null  object 
 8   num_of_prev_attempts        31437 non-null  int64  
 9   studied_credits             31437 non-null  int64  
 10  disability                  31437 non-null  object 
 11  final_result                31437 non-null  object 
 12  module_presentation_length  31437 non-null  int64  
 13  date_registration           314

In [78]:

# replace missing date_unreg with module_presentation_length
student_regist['date_unregistration'] = student_regist['date_unregistration'].fillna(student_regist['module_presentation_length'])


In [85]:



# drop students who unregistered before registering
student_regist = student_regist[student_regist['date_unregistration'] >= student_regist['date_registration']]



In [87]:

# remove rows from final student df which are not needed
model_final = student_regist[student_regist['status'] != 'remove_outcome_known']
   





In [90]:
model_final.count()
#model_final['status'].unique()

code_module                   22149
code_presentation             22149
id_student                    22149
gender                        22149
region                        22149
highest_education             22149
imd_band                      22149
age_band                      22149
num_of_prev_attempts          22149
studied_credits               22149
disability                    22149
final_result                  22149
module_presentation_length    22149
date_registration             22149
date_unregistration           22149
status                        22149
condition                     22149
dtype: int64

In [93]:

 
# merge 'assessments' and 'courses' on 'code_module' and 'code_presentation'
course_assess = pd.merge(assessments, courses, on=['code_module', 'code_presentation'], how='left')


In [94]:
assessments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code_module        206 non-null    object 
 1   code_presentation  206 non-null    object 
 2   id_assessment      206 non-null    int64  
 3   assessment_type    206 non-null    object 
 4   date               195 non-null    float64
 5   weight             206 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 9.8+ KB


In [None]:

    # fill in the missing 'date' values with course final week (as per literature)
    value_to_fill = course_assess['module_presentation_length'] - 3
    course_assess['date'] = course_assess['date'].fillna(value_to_fill)

    # merge student_assessments with course_assess
    stu_assess = pd.merge(student_assessments, course_assess, on=['id_assessment'], how='left')

    # drop students who have no score for an assessment
    missing_score_rows = stu_assess[stu_assess['score'].isna()]

    if not missing_score_rows.empty:
        print("The following students have missing 'scores'. They are excluded from the following analysis: \n")
        print(missing_score_rows)

        # student ids with missing score
        unique_ids_missing = missing_score_rows['id_student'].unique()

        # remove students with missing score from model_final
        model_final = model_final[~model_final['id_student'].isin(unique_ids_missing)]

        # drop rows with missing score
        stu_assess.dropna(subset=['score'], inplace=True)

    # remove students with negative or extreme date_submitted values
    max_module_length = stu_assess['module_presentation_length'].max()
    greater_than_max_length = stu_assess[stu_assess['date_submitted'] > max_module_length]['id_student'].unique()
    less_than_zero = stu_assess[stu_assess['date_submitted'] < 0]['id_student'].unique()
    students_to_remove = set(greater_than_max_length) | set(less_than_zero)
    model_final = model_final[~model_final['id_student'].isin(students_to_remove)]
    stu_assess = stu_assess[~stu_assess['id_student'].isin(students_to_remove)]

    # reduce data by prediction point
    model_student_assess = stu_assess[stu_assess['date'] <= prediction_point]

    # get expected assessment details
    model_course_assess = course_assess[course_assess['date'] <= prediction_point]
    expected_ass = model_course_assess.groupby(['code_module', 'code_presentation'])['id_assessment'].count().reset_index()
    expected_ass = expected_ass.rename(columns={'id_assessment': 'exp_sub_count'})
    date_sum = model_course_assess.groupby(['code_module', 'code_presentation'])['date'].sum().reset_index()
    expected_ass = expected_ass.merge(date_sum, on=['code_module', 'code_presentation'], how='left')
    expected_ass = expected_ass.rename(columns={'date': 'exp_sub_date_sum'})

    # summarise students' assessments by module_presentation
    student_assessment_summary = model_student_assess.groupby(['id_student', 'code_module', 'code_presentation']).agg(
        count_id_assessment=('id_assessment', 'count'),
        sum_score=('score', 'sum'),
        sum_date=('date', 'sum')
    ).reset_index()

    # merge student_assessment_summary and expected_ass on code_module and code_presentation
    merged_assess_summary = student_assessment_summary.merge(expected_ass, on=['code_module', 'code_presentation'])

    # calculate the new features
    merged_assess_summary['prop_submissions'] = merged_assess_summary['count_id_assessment'] / merged_assess_summary['exp_sub_count']
    merged_assess_summary['avg_score'] = merged_assess_summary['sum_score'] / merged_assess_summary['exp_sub_count']
    merged_assess_summary['submission_distance'] = merged_assess_summary['exp_sub_date_sum'] - merged_assess_summary['sum_date']

    # merge with 'model_final'
    model_final = model_final.merge(merged_assess_summary, on=['id_student', 'code_module', 'code_presentation'], how='inner')

    return model_final

In [10]:
def add_vle_data(model_final, student_vle, vle, courses, prediction_point=None):
    '''Returns updated model_final dataframe with vle data:

    prediction_point is an integer representing the number of days from the start of the course for which you want to predict the outcome.

    default is max(student_regist['module_presentation_length']) - that is, the last da of the lengthiest course

    - merge 'vle' and 'courses' and 'student_vle'
    - remove columns (week_from, week_to)
    - filter the rows where 'date' <= 'prediction_point'
    - create new features: vle_activity_count, student vle type count, student total clicks, student days active (engaged)
    
    '''
    # merge 'vle' and 'courses' on 'code_module' and 'code_presentation'
    course_vle = vle.merge(courses, on=['code_module', 'code_presentation'], how='left').drop(['week_from', 'week_to'], axis=1)

    # merge vle with student_vle
    all_stu_vle = pd.merge(student_vle, course_vle, on=['id_site', 'code_module', 'code_presentation'], how='left')

    # filter the rows where 'date' <= 'prediction_point'
    all_stu_vle = all_stu_vle[all_stu_vle['date'] <= prediction_point]

    # filter the rows where 'date' is greater than 'module_presentation_length'
    vle_after_done = all_stu_vle[all_stu_vle['date'] > all_stu_vle['module_presentation_length']]

    if not vle_after_done.empty:
        print("The following rows need investigation. They are excluded from the following analysis: \n")
        print(vle_after_done)

        # match rows based on 'code_module', 'code_presentation', and 'id_student'
        matching_rows = model_final[model_final[['code_module', 'code_presentation', 'id_student']].isin(vle_after_done).all(axis=1)]

        # remove the matching rows from 'model_final'
        model_final = model_final[~model_final.index.isin(matching_rows.index)]

    # aggregations for each column
    aggregations = {
        'id_site': 'count',
        'activity_type': 'nunique',
        'sum_click': 'sum',
        'date': lambda x: x.nunique()
    }

    # group and apply the aggregations
    grouped_stu_vle = all_stu_vle.groupby(['code_module', 'code_presentation', 'id_student']).agg(aggregations).reset_index()

    # rename the columns
    grouped_stu_vle.rename(columns={
        'id_site': 'stu_activity_count',
        'activity_type': 'stu_activity_type_count',
        'sum_click': 'stu_total_clicks',
        'date': 'stu_days_active'
    }, inplace=True)

    # number of vle types per module_presentation
    group_vle = vle.groupby(['code_module', 'code_presentation'])['activity_type'].nunique().reset_index()
    group_vle.rename(columns={'activity_type': 'mod_pres_vle_type_count'}, inplace=True)

    # merge 'grouped_stu_vle' and 'group_vle' on 'code_module' and 'code_presentation'
    merged_vle_summary = grouped_stu_vle.merge(group_vle, on=['code_module', 'code_presentation'], how='left')

    # merge with 'model_final'
    model_final = model_final.merge(merged_vle_summary, on=['id_student', 'code_module', 'code_presentation'], how='inner')

    return model_final


In [11]:
def tidy_up_model_final(model_final):
    '''Returns updated model_final dataframe:
    - deleted unnecessary columns
    - reordered columns
    - added subject, year, month columns
    
    '''
    # new columns for module_presentation - subject, year, month
    model_final['year'] = model_final['code_presentation'].str[:4].astype(int)
    model_final['month'] = model_final['code_presentation'].str[-1].map({'J': 'Oct', 'B': 'Feb'})

    # module subject mapping
    code_module_mapping = {
        'AAA': 'SocSci',
        'BBB': 'SocSci',
        'GGG': 'SocSci',
        'CCC': 'Stem',
        'DDD': 'Stem',
        'EEE': 'Stem',
        'FFF': 'Stem'
    }
    model_final['subject'] = model_final['code_module'].map(code_module_mapping)

    # rename 'module_presentation_length' to 'course_length'
    model_final.rename(columns={'module_presentation_length': 'course_length'}, inplace=True)

    # combine 'code_module', 'code_presentation', and 'id_student' into 'mod_pres_stu'
    model_final['mod_pres_stu'] = model_final['code_module'] + '-' + model_final['code_presentation'] + '-' + model_final['id_student'].astype(str)

    # drop the separate columns 'code_module', 'code_presentation', and 'id_student'
    model_final.drop(columns=['code_module', 'code_presentation', 'id_student'], inplace=True)

    # move 'final_result' to the last column
    final_result_column = model_final.pop('final_result')
    model_final['final_result'] = final_result_column

    # insert 'mod_pres_stu' as the first column
    mod_pres_stu_column = model_final.pop('mod_pres_stu')
    model_final.insert(0, 'mod_pres_stu', mod_pres_stu_column)

    # drop columns: 'count_id_assessment', 'sum_score', 'sum_date', 'exp_sub_count', 'exp_sub_date_sum'
    model_final.drop(columns=['count_id_assessment', 'sum_score', 'sum_date', 'exp_sub_count', 'exp_sub_date_sum'], inplace=True)

    return model_final


In [15]:

zipfile_ou = '../data/anonymisedData.zip'
prediction_point = 200

import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt

In [30]:
def process_data(zip_file, prediction_point):
    # Load data from the zip file
    registrations, courses, students, student_vle, vle, student_assessments, assessments = load_data(zip_file)

    # Process student data
    model_final = student_data(students, courses, registrations, prediction_point)

    # Add assessments to model_final
    model_final = add_assessments(model_final, student_assessments, assessments, courses, prediction_point)

    # Add VLE data to model_final
    model_final = add_vle_data(model_final, student_vle, vle, courses, prediction_point)

    # Tidy up model_final
    model_final = tidy_up_model_final(model_final)

    return model_final




In [31]:
process_data(zipfile_ou, prediction_point)

ValueError: max() arg is an empty sequence

add default for prediction point, ie. if none