In [23]:
import pandas as pd

import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning and FutureWarning.
warnings.filterwarnings("ignore")

**Available functions:**

`create_student_df()`: Returns merged studentInfo and studentRegistration with chosen features.

`create_assessment_df()`: Returns merged assessments and studentAssessment with chosen features. Currently set to return the first two assessments from courses AAA-FFF.

`wide_form_sa()`: Takes the dataframe returned by `create_assessment_df()` and returns a wide-form variant.

`create_si_sa_df()`: Returns merged dataframes created by `create_student_df()` and `wide_form_sa()`.

`create_vle_df()`: Returns merged vle and studentVle with chosen features. Currently set to return everything on or before day 60 (including pre-course days), binned by 15 day intervals, from courses AAA-FFF. The `forumng`-only filter that was used for run speed is currently disabled, so ALL activity types are returned until we nail down which activities to focus on.

`wide_form_vle()`: Takes the dataframe returned by `create_vle_df()` and returns a wide-form variant.

`create_si_sa_vle_df()`: Returns merged dataframes created by `create_si_sa_df()` and `wide_form_vle()` for a full index-features dataframe.


`assessment_namer(id_assessment)`: Helper function for `create_assessment_df()`

In [24]:
def create_student_df():
    """Build the student-level frame from studentInfo and studentRegistration.

    Both CSV files are read from the ``dataset/`` directory. The demographic
    columns listed in ``unused_info_cols`` are removed from studentInfo
    (edit that list to change which features are kept), and the remainder is
    inner-joined with studentRegistration.

    Returns:
        pandas.DataFrame: rows whose (code_module, code_presentation,
        id_student) key is present in both files.
    """
    join_keys = ['code_module', 'code_presentation', 'id_student']
    unused_info_cols = ['gender', 'region', 'highest_education', 'imd_band', 'disability']

    # Load the two raw tables
    info_raw = pd.read_csv('dataset/studentInfo.csv')
    registration = pd.read_csv('dataset/studentRegistration.csv')

    # Trim studentInfo, then keep only students present in both tables
    info_trimmed = info_raw.drop(columns=unused_info_cols)
    return info_trimmed.merge(registration, how='inner', on=join_keys)


In [25]:
def assessment_namer(id_assessment):
    """Translate an assessment id into the label used for the wide-form columns.

    Args:
        id_assessment: the assessment id to classify.

    Returns:
        str: "A1" if the id is in the first hard-coded id group, "A2" if it is
        in the second group, and "Drop" for any other id.
    """
    # presumably the first and second assessment of each course presentation -
    # verify against assessments.csv
    labelled_ids = (
        ("A1", (1752, 1758, 14984, 14996, 15008, 15020, 24282, 24291, 25334, 25348,
                25355, 25362, 30709, 30714, 30719, 34860, 34873, 34886, 34899)),
        ("A2", (1753, 1759, 14985, 14997, 15009, 15021, 24283, 24292, 25335, 25349,
                25356, 25363, 30710, 30715, 30720, 34861, 34874, 34887, 34900)),
    )

    for label, ids in labelled_ids:
        if id_assessment in ids:
            return label

    # Anything not listed above is marked for removal by the caller
    return "Drop"


In [26]:
def create_assessment_df():
    """Build the long-form assessment frame for the labelled TMA assessments.

    Reads ``dataset/assessments.csv`` and ``dataset/studentAssessment.csv``,
    drops ``is_banked`` from the student submissions, left-joins the assessment
    metadata on ``id_assessment``, keeps only rows whose ``assessment_type`` is
    'TMA', labels each row with ``assessment_namer`` and removes rows labelled
    'Drop'.

    Returns:
        pandas.DataFrame: an independent copy (safe for callers to modify) with
        one row per kept submission and an added ``assessment_name`` column
        holding 'A1' or 'A2'.
    """
    # Read in .csv's to be joined
    assessments = pd.read_csv('dataset/assessments.csv')
    studentAssessment = pd.read_csv('dataset/studentAssessment.csv')

    # Drop any unnecessary columns
    sa_drop_cols = ['is_banked']
    sa = studentAssessment.drop(columns=sa_drop_cols)

    # Merge, left join: every submission is kept, metadata attached where available
    assessment_df = pd.merge(sa, assessments, how='left', on='id_assessment')

    # Keeping only 'TMA' rows: 'CMA' is dropped due to extreme course differences,
    # and the final exam is dropped because we want to catch students BEFORE then.
    # .copy() makes the filtered frame independent, so adding a column below is a
    # plain assignment rather than a write onto a slice (SettingWithCopyWarning).
    tma_only = assessment_df[assessment_df.assessment_type == 'TMA'].copy()

    # Mapping with assessment_namer, then discarding everything it marked 'Drop'
    tma_only['assessment_name'] = tma_only['id_assessment'].apply(assessment_namer)
    a_filtered = tma_only[tma_only.assessment_name != 'Drop'].copy()

    return a_filtered



In [27]:
def wide_form_sa():
    """Pivot the long-form assessment frame into one row per student enrolment.

    Takes the frame returned by ``create_assessment_df()``, derives
    ``days_from_due`` (``date_submitted - date``; negative means submitted
    before the ``date`` value), and spreads ``score`` and ``days_from_due``
    across the ``assessment_name`` labels.

    Returns:
        pandas.DataFrame: indexed columns ``id_student``, ``code_module``,
        ``code_presentation`` plus one ``<value>_<assessment_name>`` column per
        value/label pair (e.g. ``score_A1``). Duplicate rows for the same key
        and label are combined with pivot_table's default aggregation (mean).
    """
    sa = create_assessment_df()

    inds = ['id_student', 'code_module', 'code_presentation']
    vals = ['score', 'days_from_due']

    # Build the trimmed frame without mutating `sa`: the frame comes from a
    # boolean filter, so in-place column writes/drops on it are chained
    # assignments that pandas warns about.
    sa_long = (
        sa
        # Combine date submitted and date into -/+
        .assign(days_from_due=sa['date_submitted'] - sa['date'])
        # Drop columns that are no longer needed once days_from_due exists
        .drop(columns=['assessment_type', 'weight', 'date_submitted', 'date', 'id_assessment'])
    )

    sa_wide = sa_long.pivot_table(
        values=vals,
        columns='assessment_name',
        index=inds,
    )

    # Flatten the (value, assessment_name) column MultiIndex into single names
    sa_wide.columns = ["_".join(a) for a in sa_wide.columns.to_flat_index()]

    return sa_wide.reset_index()


In [28]:
def create_si_sa_df():
    """Join the student frame with the wide-form assessment frame.

    Returns:
        pandas.DataFrame: inner join of ``create_student_df()`` and
        ``wide_form_sa()`` on (code_module, code_presentation, id_student),
        so only enrolments present in both frames are kept.
    """
    merge_keys = ['code_module', 'code_presentation', 'id_student']

    students = create_student_df()
    assessments_wide = wide_form_sa()

    # Inner merge: enrolments without any kept assessment rows fall out here
    return students.merge(assessments_wide, how='inner', on=merge_keys)

In [29]:
def create_vle_df():
    """Build the long-form VLE click frame, binned by day range.

    Reads ``dataset/studentVle.csv`` and ``dataset/vle.csv``, attaches each
    site's ``activity_type`` via a left join, removes module 'GGG' and every
    row after ``date_max``, assigns each remaining row to a day-range bin and
    sums ``sum_click`` per (module, presentation, student, activity type, bin).

    Bins are right-closed: 'pre-0' covers -15 < date <= 0, '1-15' covers
    0 < date <= 15, and so on. Rows with date <= -15 fall outside every bin,
    receive no bin label and are therefore excluded by the groupby.

    Returns:
        pandas.DataFrame: columns ``code_module``, ``code_presentation``,
        ``id_student``, ``activity_type``, ``bin``, ``sum_click``.
    """
    studentVle = pd.read_csv('dataset/studentVle.csv')

    # Dropping unused columns from raw
    vle = pd.read_csv('dataset/vle.csv').drop(columns=['week_from', 'week_to'])

    # Merge - this primarily associates 'id_site' with a more narrative description
    vle_merged = pd.merge(
        studentVle, vle, how='left', on=['code_module', 'code_presentation', 'id_site']
    ).drop(columns='id_site')

    # Dropping after certain date (Adjust as desired!)
    date_max = 60

    # Dropping GGG and late rows in one pass. No activity-type filter is applied
    # at the moment (all types are kept so their importance can be compared);
    # to restrict to e.g. 'forumng' only, add a condition on activity_type here.
    # .copy() yields an independent frame so the column added below is a plain
    # assignment rather than a write onto a slice (SettingWithCopyWarning).
    keep_rows = (vle_merged.code_module != 'GGG') & (vle_merged.date <= date_max)
    vle_df = vle_merged[keep_rows].copy()

    # Creating bin columns
    # Set bin parameters (Also adjust as desired!)
    bin_vals = [-15, 0, 15, 30, 45, 60]
    bin_labels = ['pre-0', '1-15', '16-30', '31-45', '46-60']
    vle_df['bin'] = pd.cut(vle_df['date'], bins=bin_vals, labels=bin_labels)
    vle_df = vle_df.drop(columns='date')

    # Groupby everything but sum_click.
    # `bin` is categorical, so the result depends on `observed`; its default
    # differs between pandas versions, so it is pinned to the historical value
    # to keep the output stable.
    # NOTE(review): with observed=False pandas also emits key combinations that
    # never occur in the data (their sum is 0), which can make this frame very
    # large and slow. observed=True would avoid that but changes the output
    # (those zero rows disappear) - confirm before switching.
    grouper = ['code_module', 'code_presentation', 'id_student', 'activity_type', 'bin']
    vle_df_grouped = (
        vle_df.groupby(grouper, observed=False)['sum_click']
        .sum()
        .reset_index()
    )

    return vle_df_grouped

In [30]:
def wide_form_vle():
    """Pivot the long-form VLE click frame into one row per student enrolment.

    Returns:
        pandas.DataFrame: ``id_student``, ``code_module``,
        ``code_presentation`` plus one ``<activity_type>_<bin>`` column per
        activity/bin pair, filled from ``sum_click`` of ``create_vle_df()``.
    """
    enrolment_keys = ['id_student', 'code_module', 'code_presentation']

    long_clicks = create_vle_df()
    wide_clicks = long_clicks.pivot_table(
        index=enrolment_keys,
        columns=['activity_type', 'bin'],
        values='sum_click',
    )

    # Collapse the (activity_type, bin) column pairs into flat string names
    flat_pairs = wide_clicks.columns.to_flat_index()
    wide_clicks.columns = list(map("_".join, flat_pairs))

    return wide_clicks.reset_index()


In [31]:
def create_si_sa_vle_df():
    """Assemble the full feature table: student info, assessments and VLE clicks.

    Returns:
        pandas.DataFrame: inner join of ``create_si_sa_df()`` and
        ``wide_form_vle()`` on (code_module, code_presentation, id_student).
    """
    merge_keys = ['code_module', 'code_presentation', 'id_student']

    student_assessment = create_si_sa_df()
    vle_wide = wide_form_vle()

    # Inner merge: only enrolments present in both frames survive
    return student_assessment.merge(vle_wide, how='inner', on=merge_keys)

In [32]:
# Build the full wide-form feature table (student info + assessments + VLE clicks).
# This re-reads every CSV under dataset/ each time the cell is run.
wide_df = create_si_sa_vle_df()

In [33]:
#uncomment to create xlsx on local machine if not pulling from github
#wide_df.to_excel("wideform.xlsx", index=False)