In [64]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

**Available functions:**

`create_student_df()`: Returns merged studentInfo and studentRegistration with chosen features.

`create_assessment_df()`: Returns merged assessments and studentAssessment with chosen features. Currently set to return the first two assessments from courses AAA-FFF.

`wide_form_sa()`: Takes the dataframe returned by `create_assessment_df()` and returns a wide-form variant.

`create_si_sa_df()`: Returns merged dataframes created by `create_student_df()` and `wide_form_sa()`.

`create_vle_df()`: Returns merged vle and studentVle with chosen features. Currently set to return everything on or before day 60 (including pre-course days), binned by 15 day intervals, from courses AAA-FFF.

`wide_form_vle()`: Takes the dataframe returned by `create_vle_df()` and returnsa  wide-form varient.

`create_si_sa_vle_df()`: Returns merged dataframes created by `create_si_sa_df()` and `wide_form_vle` for a full index-features dataframe


`assessment_namer(id_assessment)`: Helper function for `create_assessment_df()`

In [65]:
def create_student_df():
    # Read in .csv's to be joined
    studentInfo = pd.read_csv('dataset/studentInfo.csv')
    studentRegistration = pd.read_csv('dataset/studentRegistration.csv')

    # Drop any unnecessary columns (can be edited for feature extraction)
    si_drop_cols = ['gender', 'region', 'highest_education', 'imd_band', 'disability']
    si = studentInfo.drop(columns=si_drop_cols)

    # Merge, inner join
    student_df = pd.merge(si, studentRegistration, how='inner', on=['code_module', 'code_presentation', 'id_student'])

    return student_df


In [66]:
def assessment_namer(id_assessment):
    Assess_1s = [1752, 1758, 14984, 14996, 15008, 15020, 24282, 24291, 25334, 25348, 25355, 25362, 30709, 30714, 30719, 34860, 34873, 34886, 34899]
    Assess_2s = [1753, 1759, 14985, 14997, 15009, 15021, 24283, 24292, 25335, 25349, 25356, 25363, 30710, 30715, 30720, 34861, 34874, 34887, 34900]

    if id_assessment in Assess_1s:
        return "A1"
    elif id_assessment in Assess_2s:
        return "A2"
    else:
        return "Drop"


In [67]:
def create_assessment_df():
    # Read in .csv's to be joined
    assessments = pd.read_csv('dataset/assessments.csv')
    studentAssessment = pd.read_csv('dataset/studentAssessment.csv')

    # Drop any unnecessary columns
    sa_drop_cols = ['is_banked']
    sa = studentAssessment.drop(columns=sa_drop_cols)

    # Merge, left join
    assessment_df = pd.merge(sa, assessments, how='left', on='id_assessment')

    # Dropping 'assessment_type' == 'CMA' due to extreme course differences
    # Also dropping the final exam because we want to catch students BEFORE then
    a_filtered = assessment_df[assessment_df.assessment_type == 'TMA']

    # Mapping with assessment_namer
    a_filtered['assessment_name'] = a_filtered['id_assessment'].apply(assessment_namer)
    a_filtered = a_filtered[a_filtered.assessment_name != 'Drop']
    

    return a_filtered



In [68]:
def wide_form_sa():
    sa = create_assessment_df()

    # Combine date submitted and date into -/+
    sa['days_from_due'] = sa['date_submitted'] - sa['date']

    # Drop Assessment Type, Weight
    sa.drop(columns=['assessment_type', 'weight', 'date_submitted', 'date', 'id_assessment'], inplace=True)

    inds = ['id_student', 'code_module', 'code_presentation']
    vals = ['score', 'days_from_due']

    sa_wide = sa.pivot_table(
                values = vals,
                columns = 'assessment_name',
                index = inds
    )

    
    sa_wide.columns = ["_".join(a) for a in sa_wide.columns.to_flat_index()]
    sa_wide.reset_index(inplace=True)

    return sa_wide


In [69]:
def create_si_sa_df():
    si = create_student_df()
    sa = wide_form_sa()

    # Merge
    si_sa_df = pd.merge(si, sa, how='inner', on=['code_module', 'code_presentation', 'id_student'])

    return si_sa_df

In [70]:
def create_vle_df():
    studentVle = pd.read_csv('dataset/studentVle.csv')
    vle = pd.read_csv('dataset/vle.csv')

    # Dropping unused columns from raw
    vle.drop(columns=['week_from', 'week_to'], inplace=True)

    #Merge - this primarily associates 'id_site' with a more narrative description
    vle_merged = pd.merge(studentVle, vle, how='left', on=['code_module', 'code_presentation', 'id_site'])
    vle_merged.drop(columns='id_site', inplace=True)

    # Dropping GGG
    vle_no_ggg = vle_merged[vle_merged.code_module != 'GGG']

    # Dropping activity types we're less interested in
    # For now, I'm only using 'forumng' for testing purposes
    # EDIT 07/19/2022, running without this to make a biiiig xlsx so we can see what's most important
    #vle_subset = vle_no_ggg[vle_no_ggg.activity_type == 'forumng']
    vle_subset = vle_no_ggg

    # Dropping after certain date (Adjust as desired!)
    date_max = 60
    vle_df = vle_subset[vle_subset.date <= date_max]

    # Creating bin columns
    # Set bin parameters (Also adjust as desired!)
    bin_vals = [-15, 0, 15, 30, 45, 60]
    bin_labels = ['pre-0', '1-15', '16-30', '31-45', '46-60']
    vle_df['bin'] = pd.cut(vle_df['date'], bins=bin_vals, labels=bin_labels)
    vle_df.drop(columns='date', inplace=True)

    # Groupby everything but sum_click
    grouper = ['code_module', 'code_presentation', 'id_student', 'activity_type', 'bin']
    vle_df_grouped = vle_df.groupby(grouper)['sum_click'].sum().to_frame()
    vle_df_grouped.reset_index(inplace=True)


    return vle_df_grouped

In [71]:
def wide_form_vle():
    vle_df_long = create_vle_df()
    vle_df_wide = vle_df_long.pivot_table(
                values = 'sum_click',
                columns = ['activity_type', 'bin'],
                index = ['id_student', 'code_module', 'code_presentation']
    )

    vle_df_wide.columns = ["_".join(a) for a in vle_df_wide.columns.to_flat_index()]
    vle_df_wide.reset_index(inplace=True)

    return vle_df_wide


In [72]:
def create_si_sa_vle_df():
    si_sa_df = create_si_sa_df()
    vle = wide_form_vle()

    # Merge
    si_sa_vle_df = pd.merge(si_sa_df, vle, how='inner', on=['code_module', 'code_presentation', 'id_student'])

    return si_sa_vle_df

In [73]:
#wide_df = create_si_sa_vle_df()

In [74]:
#uncomment to create xlsx on local machine if not pulling from github
#wide_df.to_excel("wideform.xlsx", index=False)

In [75]:
def create_vle_df_v2(code_module,code_presentation):
    studentVle = pd.read_csv('dataset/studentVle.csv')
    vle = pd.read_csv('dataset/vle.csv')

    # Dropping unused columns from raw
    vle.drop(columns=['week_from', 'week_to'], inplace=True)

    #Merge - this primarily associates 'id_site' with a more narrative description
    vle_merged = pd.merge(studentVle, vle, how='left', on=['code_module', 'code_presentation', 'id_site'])
    vle_merged.drop(columns='id_site', inplace=True)

    # For this one, we're lumping all activity_types together.
    vle_merged.drop(columns='activity_type', inplace=True)

    # Only using AAA
    vle_aaa = vle_merged[vle_merged.code_module == code_module]
    #and only using 2014J
    vle_section = vle_aaa[vle_aaa.code_presentation == code_presentation]

    
    # Groupby everything but sum_click
    grouper = ['code_module', 'code_presentation', 'id_student', 'date']
    vle_df_grouped = vle_section.groupby(grouper)['sum_click'].sum().to_frame()
    vle_df_grouped.reset_index(inplace=True)


    return vle_df_grouped

In [79]:
def wide_form_sa_v2(code_module, code_presentation):
    sa = create_assessment_df()

    # Drop Assessment Type, Weight
    sa.drop(columns=['assessment_type', 'weight', 'date', 'id_assessment'], inplace=True)

    inds = ['id_student', 'code_module', 'code_presentation']
    vals = ['score', 'date_submitted']

    sa_wide = sa.pivot_table(
                values = vals,
                columns = 'assessment_name',
                index = inds
    )

    
    sa_wide.columns = ["_".join(a) for a in sa_wide.columns.to_flat_index()]
    sa_wide.reset_index(inplace=True)

    # Only using AAA
    sa_aaa = sa_wide[sa_wide.code_module == code_module]
    # And, just in case there are duplicates of id_student,
    sa_section = sa_aaa[sa_aaa.code_presentation == code_presentation]



    return sa_section

In [82]:
def add_submission_bins():
    full_df = create_si_sa_vle_df()

    modules = full_df['code_module'].unique()
    presentations = full_df['code_presentation'].unique()

    binned_df = pd.DataFrame(columns=['id_student','code_module', 'code_presentation', 'sum_click_pre_A1', 'sum_click_pre_A2'])

    for m in modules:
        for p in presentations:
            sa_df = wide_form_sa_v2(m, p)
            vle_df = create_vle_df_v2(m, p)

        # Dropping any students that didn't turn in one or more assignments
        sa_df.dropna(axis=0, how='any', inplace=True)

        A1_submit = sa_df.set_index('id_student').to_dict()['date_submitted_A1']
        A2_submit = sa_df.set_index('id_student').to_dict()['date_submitted_A2']

        vle_df['date_submitted_A1'] = vle_df['id_student'].map(A1_submit)
        vle_df['date_submitted_A2'] = vle_df['id_student'].map(A2_submit)


        vle_df['bin'] = 2
        vle_df.loc[vle_df['date'] < vle_df['date_submitted_A2'], 'bin'] = 'pre_A2'
        vle_df.loc[vle_df['date'] < vle_df['date_submitted_A1'], 'bin'] = 'pre_A1'

        vle_df.dropna(axis=0, how='any', inplace=True)
        before_A2 = vle_df[vle_df.bin != 2]

        before_A2.drop(columns=['date_submitted_A1', 'date_submitted_A2'], inplace=True)

        # Groupby
        grouper = ['code_module', 'code_presentation', 'id_student', 'bin']
        vle_df_grouped = before_A2.groupby(grouper)['sum_click'].sum().to_frame()
        vle_df_grouped.reset_index(inplace=True)

        inds = ['id_student', 'code_module', 'code_presentation']
        vals = ['sum_click']

        df_wide = vle_df_grouped.pivot_table(
                    values = vals,
                    columns = 'bin',
                    index = inds
        )

        df_wide.columns = ["_".join(a) for a in df_wide.columns.to_flat_index()]
        df_wide.reset_index(inplace=True)

        binned_df = pd.concat([binned_df, df_wide], ignore_index=True)

    even_fuller_df =  pd.merge(full_df, binned_df, how='inner', on=['code_module', 'code_presentation', 'id_student'])

    return even_fuller_df






In [83]:
# THIS TAKES 20 MINUTES

#really_big_df = add_submission_bins()
#really_big_df.head(5)


Unnamed: 0,code_module,code_presentation,id_student,age_band,num_of_prev_attempts,studied_credits,final_result,date_registration,date_unregistration,days_from_due_A1,...,subpage_16-30,subpage_31-45,subpage_46-60,url_pre-0,url_1-15,url_16-30,url_31-45,url_46-60,sum_click_pre_A1,sum_click_pre_A2
0,BBB,2014B,50069,0-35,0,90,Fail,-134.0,,-1.0,...,,,21.0,,2.0,,,5.0,54.0,99.0
1,BBB,2014B,52426,35-55,1,60,Fail,-32.0,,-3.0,...,2.0,,,,3.0,,,,26.0,16.0
2,BBB,2014B,55968,0-35,0,120,Fail,-29.0,,-2.0,...,,4.0,5.0,1.0,,,3.0,1.0,133.0,248.0
3,BBB,2014B,59725,35-55,0,120,Distinction,-29.0,,-3.0,...,4.0,,,6.0,4.0,,,,184.0,82.0
4,BBB,2014B,60416,0-35,1,60,Pass,-64.0,,-5.0,...,1.0,3.0,1.0,2.0,1.0,1.0,4.0,3.0,125.0,697.0


In [84]:
#really_big_df.to_excel("wideform.xlsx", index=False)