## Final Test

In [203]:
import pandas as pd

# Load the previously exported feature matrices and labels from the shared data directory.
# NOTE(review): X_train_transformed, X_test_transformed, y_train and y_test are all
# reassigned further down in this notebook (train/test split and scaling cells), so
# these loads appear to be superseded - confirm whether this cell is still needed.
X_train_transformed = pd.read_csv('../../data/X_train_transformed.csv')
X_test_transformed = pd.read_csv('../../data/X_test_transformed.csv')
#X_val_transformed = pd.read_csv('../../data/X_val_transformed.csv')
#X_train_pca = pd.read_csv('../../data/X_train_pca.csv')
#X_test_pca = pd.read_csv('../../data/X_test_pca.csv')
#X_val_pca = pd.read_csv('../../data/X_val_pca.csv')


y_train = pd.read_csv('../../data/y_train.csv')
y_test = pd.read_csv('../../data/y_test.csv')
#y_val = pd.read_csv('../../data/y_val.csv')

In [204]:
# collapse the four outcome labels into a two-class intervention target
outcome_to_action = {
    'Pass': 'no_intervene',
    'Distinction': 'no_intervene',
    'Withdrawn': 'intervene',
    'Fail': 'intervene',
}

# replace() returns a new object, so y_train / y_test keep their original labels
y_train_binary = y_train.replace(outcome_to_action)
y_test_binary = y_test.replace(outcome_to_action)


In [205]:
import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt

In [206]:
# path of the zipped dataset that is handed to load_data below
zipfile_ou = '../../data/anonymisedData.zip'
#prediction_point = 50

In [207]:


def load_data(zip_file_path):
    '''Reads the seven csv tables of the Open University Learning Analytics dataset from a zip archive.

    Returns a tuple of dataframes in this order:
    registrations, courses, students, student_vle, vle, student_assessments, assessments
    '''
    # archive member names, listed in the order the dataframes are returned
    member_names = (
        'studentRegistration.csv',
        'courses.csv',
        'studentInfo.csv',
        'studentVle.csv',
        'vle.csv',
        'studentAssessment.csv',
        'assessments.csv',
    )

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        # every member is parsed while the archive is still open
        tables = [pd.read_csv(archive.open(member)) for member in member_names]

    return tuple(tables)


In [208]:
# unpack the seven tables in the order load_data returns them
registrations, courses, students, student_vle, vle, student_assessments, assessments = load_data(zipfile_ou)

In [209]:
# cut-off, in days from the start of the course, passed to the feature-building functions below
prediction_point = 150

In [210]:
def student_data(students, courses, registrations, prediction_point=None):
    '''Returns a dataframe of student registrations prepared for modelling.

    prediction_point is an integer representing the number of days from the start of the
    course for which you want to predict the outcome. When it is None (the default) no
    outcome-based filtering is applied.

    Steps:
    - merge students and courses, then registrations
    - drop rows with a missing date_registration or imd_band
    - if prediction_point is given:
        - mark Withdrawn/Fail students who unregistered on or before it as 'remove_outcome_known'
        - mark any other student who unregistered on or before it as 'query'
    - print the 'query' rows and exclude them
    - fill missing date_unregistration with module_presentation_length - that is, assume
      the student completed the course
    - drop students who unregistered before registering
    - drop the 'remove_outcome_known' rows

    Raises AssertionError when prediction_point is not an int in the range
    1 .. (longest module_presentation_length + 7).
    '''
    module_keys = ['code_module', 'code_presentation']

    # merge students and courses
    student_regist = pd.merge(students, courses, on=module_keys, validate='many_to_one')

    # merge registrations
    student_regist = pd.merge(student_regist, registrations, on=module_keys + ['id_student'], how='left', validate='1:1')

    # drop missing value rows (date_registration, imd_band); copy so later column writes are safe
    student_regist = student_regist.dropna(subset=['date_registration', 'imd_band']).copy()

    # every row is kept unless one of the rules below says otherwise
    student_regist['status'] = 'keep'

    # prediction point must be an integer, greater than 0 and within the longest course (+7 days)
    if prediction_point is not None:
        assert isinstance(prediction_point, int) and prediction_point > 0 and prediction_point <= (max(student_regist['module_presentation_length']) + 7), "Error: Invalid prediction point.\n\nPlease provide an integer value greater than 0 and less than or equal to the maximum course length.\n\nThis is the number of days from the start of the course for which you want to predict the outcome."

        # withdrawn or failed on/before the prediction point - outcome already known, remove
        withdrawn_fail_condition = (student_regist['final_result'].isin(['Withdrawn', 'Fail'])) & (student_regist['date_unregistration'] <= prediction_point)
        student_regist.loc[withdrawn_fail_condition, 'status'] = 'remove_outcome_known'

        # unregistered after the prediction point, or never unregistered - stays 'keep'
        unregister_after_condition = student_regist['date_unregistration'] > prediction_point
        no_unregistration_condition = student_regist['date_unregistration'].isna()

        # anything left unregistered on/before the prediction point without a
        # Withdrawn/Fail result, which is contradictory and needs a manual look
        student_regist.loc[~(withdrawn_fail_condition | unregister_after_condition | no_unregistration_condition), 'status'] = 'query'

    # rows which need investigation, selected with a boolean row mask.
    # (OR-ing two dataframes, or filtering with DataFrame.isin + dropna, does not
    # select rows and would also discard every student without an unregistration date.)
    query_mask = student_regist['status'].eq('query') | student_regist['status'].isna()
    query_rows = student_regist[query_mask]

    # print rows which need investigation and exclude exactly those rows
    if not query_rows.empty:
        print("The following rows need investigation.  They are excluded from the following analysis: \n")
        print(query_rows)
        student_regist = student_regist[~query_mask].copy()

    # replace missing date_unreg with module_presentation_length
    student_regist['date_unregistration'] = student_regist['date_unregistration'].fillna(student_regist['module_presentation_length'])

    # drop students who unregistered before starting
    student_regist = student_regist[student_regist['date_unregistration'] >= student_regist['date_registration']]

    # remove rows whose outcome was already known at the prediction point
    model_final = student_regist[student_regist['status'] != 'remove_outcome_known']

    return model_final



In [211]:
# base table of registrations, filtered at the chosen prediction point
model_final = student_data(students, courses, registrations, prediction_point)

In [212]:
# check row count, dtypes and non-null counts of the base table
model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23122 entries, 0 to 32592
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 23122 non-null  object 
 1   code_presentation           23122 non-null  object 
 2   id_student                  23122 non-null  int64  
 3   gender                      23122 non-null  object 
 4   region                      23122 non-null  object 
 5   highest_education           23122 non-null  object 
 6   imd_band                    23122 non-null  object 
 7   age_band                    23122 non-null  object 
 8   num_of_prev_attempts        23122 non-null  int64  
 9   studied_credits             23122 non-null  int64  
 10  disability                  23122 non-null  object 
 11  final_result                23122 non-null  object 
 12  module_presentation_length  23122 non-null  int64  
 13  date_registration           231

In [213]:
def add_assessments(student_assessments, assessments, courses, model_final, prediction_point=None):
    '''Returns model_final left-merged with per-student assessment features.

    prediction_point is an integer representing the number of days from the start of the
    course for which you want to predict the outcome. When it is None (the default) all
    assessments are used.

    - merge 'assessments' with 'courses', then with 'student_assessments'
    - fill missing assessment 'date' values with module_presentation_length - 3
    - print and drop submissions that have no score
    - drop submissions made more than 21 days after the longest module length
    - keep only assessments whose 'date' is on or before the prediction point
    - add prop_submissions, avg_score and submission_distance
      (students without any counted submission get NaN in these columns)

    Raises AssertionError when prediction_point is not an int in the range
    1 .. (longest module_presentation_length + 7).
    '''
    module_keys = ['code_module', 'code_presentation']

    # every assessment with its course length attached; undated ones get a date 3 days before the end
    course_assess = assessments.merge(courses, on=module_keys, how='left')
    course_assess['date'] = course_assess['date'].fillna(course_assess['module_presentation_length'] - 3)

    # one row per submission, carrying the assessment and course details
    stu_assess = student_assessments.merge(course_assess, on='id_assessment', how='left')

    # submissions without a score are reported and then discarded
    unscored = stu_assess['score'].isna()
    if unscored.any():
        print("\n\nThe following students have missing 'scores'. Their are excluded from the analysis: \n")
        print(stu_assess[unscored])
        stu_assess = stu_assess[~unscored]

    # ignore submissions dated more than 21 days past the longest module
    latest_allowed = stu_assess['module_presentation_length'].max() + 21
    stu_assess = stu_assess[stu_assess['date_submitted'] <= latest_allowed]

    if prediction_point is not None:
        # prediction point must be less than course length, integer, and greater than 0
        assert isinstance(prediction_point, int) and prediction_point > 0 and prediction_point <= (max(courses['module_presentation_length'])+7), "Error: Invalid prediction point.\n\nPlease provide an integer value greater than 0 and less than or equal to the maximum course length.\n\nThis is the number of days from the start of the course for which you want to predict the outcome."

        # only assessments due on or before the prediction point count
        model_student_assess = stu_assess[stu_assess['date'] <= prediction_point]
        model_course_assess = course_assess[course_assess['date'] <= prediction_point]
    else:
        # no prediction point: use everything
        model_student_assess = stu_assess
        model_course_assess = course_assess

    # what each module presentation expects: number of assessments and the sum of their dates
    expected_ass = (
        model_course_assess
        .groupby(module_keys)
        .agg(exp_sub_count=('id_assessment', 'count'), exp_sub_date_sum=('date', 'sum'))
        .reset_index()
    )

    # what each student actually has on record, per module presentation
    # NOTE(review): sum_date totals the assessment due date ('date'), not 'date_submitted',
    # so submission_distance below looks like the summed due dates of the expected
    # assessments without a counted submission - confirm this is the intended measure
    student_assessment_summary = (
        model_student_assess
        .groupby(['id_student'] + module_keys)
        .agg(
            count_id_assessment=('id_assessment', 'count'),
            sum_score=('score', 'sum'),
            sum_date=('date', 'sum'),
        )
        .reset_index()
    )

    # put actual and expected figures side by side and derive the features
    merged_assess_summary = student_assessment_summary.merge(expected_ass, on=module_keys)
    merged_assess_summary = merged_assess_summary.assign(
        prop_submissions=merged_assess_summary['count_id_assessment'] / merged_assess_summary['exp_sub_count'],
        avg_score=merged_assess_summary['sum_score'] / merged_assess_summary['exp_sub_count'],
        submission_distance=merged_assess_summary['exp_sub_date_sum'] - merged_assess_summary['sum_date'],
    )

    # left join keeps students with no assessment rows; their NaN values are filled later
    return model_final.merge(merged_assess_summary, on=['id_student'] + module_keys, how='left')

In [214]:
# add the assessment features; rows with missing scores are printed by the function
model_final = add_assessments(student_assessments, assessments, courses, model_final, prediction_point)



The following students have missing 'scores'. Their are excluded from the analysis: 

        id_assessment  id_student  date_submitted  is_banked  score  \
215              1752      721259              22          0    NaN   
937              1754      260355             127          0    NaN   
2364             1760     2606802             180          0    NaN   
3358            14984      186780              77          0    NaN   
3914            14984      531205              26          0    NaN   
...               ...         ...             ...        ...    ...   
148929          34903      582670             241          0    NaN   
159251          37415      610738              87          0    NaN   
166390          37427      631786             221          0    NaN   
169725          37435      648110              62          0    NaN   
170103          37435      480914              49          0    NaN   

       code_module code_presentation assessment_type   date

In [215]:
# check the row count and the new columns after the assessment merge
model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23122 entries, 0 to 23121
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 23122 non-null  object 
 1   code_presentation           23122 non-null  object 
 2   id_student                  23122 non-null  int64  
 3   gender                      23122 non-null  object 
 4   region                      23122 non-null  object 
 5   highest_education           23122 non-null  object 
 6   imd_band                    23122 non-null  object 
 7   age_band                    23122 non-null  object 
 8   num_of_prev_attempts        23122 non-null  int64  
 9   studied_credits             23122 non-null  int64  
 10  disability                  23122 non-null  object 
 11  final_result                23122 non-null  object 
 12  module_presentation_length  23122 non-null  int64  
 13  date_registration           231

In [216]:
def add_vle_data(model_final, student_vle, vle, courses, prediction_point=None):
    '''Returns model_final left-merged with per-student VLE engagement features.

    prediction_point is an integer representing the number of days from the start of the
    course for which you want to predict the outcome. When it is None (the default) all
    VLE activity is used.

    - merge 'vle' with 'courses' (dropping week_from, week_to), then with 'student_vle'
    - keep only activity where 'date' <= prediction_point
    - print activity dated after the module end and remove the students concerned
      from model_final
    - add stu_activity_count, stu_activity_type_count, stu_total_clicks,
      stu_days_active and mod_pres_vle_type_count
      (students without VLE activity get NaN in these columns)

    Raises AssertionError when prediction_point is not an int or is greater than the
    latest activity date.
    '''
    module_keys = ['code_module', 'code_presentation']
    student_keys = module_keys + ['id_student']

    # merge 'vle' and 'courses' on 'code_module' and 'code_presentation'
    course_vle = vle.merge(courses, on=module_keys, how='left').drop(['week_from', 'week_to'], axis=1)

    # merge vle with student_vle
    all_stu_vle = pd.merge(student_vle, course_vle, on=['id_site', 'code_module', 'code_presentation'], how='left')

    if prediction_point is not None:
        # prediction point must be less than or equal to the maximum date
        assert isinstance(prediction_point, int) and prediction_point <= max(all_stu_vle['date']), "Error: Invalid prediction point.\n\nPlease provide an integer value less than or equal to the maximum date.\n\nThis is the cutoff date for filtering the rows."

        # filter the rows where 'date' <= 'prediction_point'
        all_stu_vle = all_stu_vle[all_stu_vle['date'] <= prediction_point]

    # activity recorded after the module has finished
    vle_after_done = all_stu_vle[all_stu_vle['date'] > all_stu_vle['module_presentation_length']]

    if not vle_after_done.empty:
        print("The following rows need investigation. They are excluded from the following analysis: \n")
        print(vle_after_done)

        # match on the key VALUES: DataFrame.isin(DataFrame) compares cells that share
        # the same index and column label, so it cannot find these students
        flagged_students = pd.MultiIndex.from_frame(vle_after_done[student_keys].drop_duplicates())
        is_flagged = pd.MultiIndex.from_frame(model_final[student_keys]).isin(flagged_students)

        # remove the flagged students from 'model_final'
        model_final = model_final[~is_flagged]

    # aggregations for each column
    aggregations = {
        'id_site': 'count',
        'activity_type': 'nunique',
        'sum_click': 'sum',
        'date': 'nunique'
    }

    # group and apply the aggregations, then give the results feature names
    grouped_stu_vle = (
        all_stu_vle
        .groupby(student_keys)
        .agg(aggregations)
        .reset_index()
        .rename(columns={
            'id_site': 'stu_activity_count',
            'activity_type': 'stu_activity_type_count',
            'sum_click': 'stu_total_clicks',
            'date': 'stu_days_active'
        })
    )

    # number of vle types per module_presentation
    group_vle = vle.groupby(module_keys)['activity_type'].nunique().reset_index()
    group_vle = group_vle.rename(columns={'activity_type': 'mod_pres_vle_type_count'})

    # merge 'grouped_stu_vle' and 'group_vle' on 'code_module' and 'code_presentation'
    merged_vle_summary = grouped_stu_vle.merge(group_vle, on=module_keys, how='left')

    # left join keeps students without vle engagement; their NaN values are filled later
    model_final = model_final.merge(merged_vle_summary, on=['id_student', 'code_module', 'code_presentation'], how='left')

    return model_final


In [217]:
# add the VLE engagement features up to the prediction point
model_final = add_vle_data(model_final, student_vle, vle, courses, prediction_point)

In [218]:
# check the row count and the new columns after the VLE merge
model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23122 entries, 0 to 23121
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 23122 non-null  object 
 1   code_presentation           23122 non-null  object 
 2   id_student                  23122 non-null  int64  
 3   gender                      23122 non-null  object 
 4   region                      23122 non-null  object 
 5   highest_education           23122 non-null  object 
 6   imd_band                    23122 non-null  object 
 7   age_band                    23122 non-null  object 
 8   num_of_prev_attempts        23122 non-null  int64  
 9   studied_credits             23122 non-null  int64  
 10  disability                  23122 non-null  object 
 11  final_result                23122 non-null  object 
 12  module_presentation_length  23122 non-null  int64  
 13  date_registration           231

In [219]:
def tidy_up_model_final_binary(model_final):
    '''Tidies model_final IN PLACE and returns the same dataframe:
    - adds a 'subject' column derived from 'code_module'
    - moves 'final_result' to the last column
    - drops the identifier, demographic, date and intermediate columns
    - replaces every remaining NaN with 0

    '''
    # each module code belongs to one of two subject areas
    subject_by_module = {code: 'SocSci' for code in ('AAA', 'BBB', 'GGG')}
    subject_by_module.update({code: 'Stem' for code in ('CCC', 'DDD', 'EEE', 'FFF')})
    model_final['subject'] = model_final['code_module'].map(subject_by_module)

    # popping and re-inserting the target puts it after every other column
    model_final['final_result'] = model_final.pop('final_result')

    # columns that are not wanted as model features
    unwanted_columns = [
        'code_module', 'code_presentation', 'id_student',
        'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability',
        'date_registration', 'date_unregistration',
        'count_id_assessment', 'sum_score', 'sum_date', 'exp_sub_count', 'exp_sub_date_sum',
        'status', 'mod_pres_vle_type_count', 'module_presentation_length',
    ]
    model_final.drop(columns=unwanted_columns, inplace=True)

    # NaN marks students who did not engage (vle, assessment); treat it as 0
    model_final.fillna(0, inplace=True)

    return model_final


In [220]:
# the function edits model_final in place (its drop/fillna calls use inplace=True),
# so later cells see the tidied frame even though the return value is only displayed here
tidy_up_model_final_binary(model_final)

Unnamed: 0,num_of_prev_attempts,studied_credits,prop_submissions,avg_score,submission_distance,stu_activity_count,stu_activity_type_count,stu_total_clicks,stu_days_active,subject,final_result
0,0,240,1.0,81.000000,0.0,133.0,6.0,710.0,29.0,SocSci,Pass
1,0,60,1.0,69.333333,0.0,313.0,7.0,1010.0,56.0,SocSci,Pass
2,0,60,1.0,72.333333,0.0,445.0,8.0,1499.0,83.0,SocSci,Pass
3,0,60,1.0,54.000000,0.0,242.0,7.0,825.0,50.0,SocSci,Pass
4,0,60,1.0,74.000000,0.0,404.0,7.0,1391.0,81.0,SocSci,Pass
...,...,...,...,...,...,...,...,...,...,...,...
23117,0,30,1.0,82.500000,0.0,83.0,6.0,182.0,17.0,SocSci,Distinction
23118,0,30,0.0,0.000000,0.0,19.0,4.0,41.0,5.0,SocSci,Fail
23119,0,30,1.0,80.500000,0.0,93.0,7.0,304.0,18.0,SocSci,Distinction
23120,0,30,1.0,70.000000,0.0,61.0,6.0,163.0,13.0,SocSci,Pass


In [221]:
# confirm the remaining feature columns and that no nulls are left
model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23122 entries, 0 to 23121
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   num_of_prev_attempts     23122 non-null  int64  
 1   studied_credits          23122 non-null  int64  
 2   prop_submissions         23122 non-null  float64
 3   avg_score                23122 non-null  float64
 4   submission_distance      23122 non-null  float64
 5   stu_activity_count       23122 non-null  float64
 6   stu_activity_type_count  23122 non-null  float64
 7   stu_total_clicks         23122 non-null  float64
 8   stu_days_active          23122 non-null  float64
 9   subject                  23122 non-null  object 
 10  final_result             23122 non-null  object 
dtypes: float64(7), int64(2), object(2)
memory usage: 2.1+ MB


In [222]:
from sklearn.model_selection import train_test_split

# separate the target column from the features
y = model_final['final_result']
X = model_final.drop(columns=['final_result'])

# one seed shared by both splits so the partition is reproducible
split_seed = 567

# first hold out 20% of all rows as the validation set, stratified on the target
X_train_test, X_val, y_train_test, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=split_seed,
    stratify=y,
)

# then hold out 25% of the remainder as the test set, again stratified
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test,
    y_train_test,
    test_size=0.25,
    random_state=split_seed,
    stratify=y_train_test,
)



In [223]:
# proportions of target variable in original data
# class balance of the full data set and of each split
original_proportions = y.value_counts(normalize=True)
train_proportions, val_proportions, test_proportions = (
    labels.value_counts(normalize=True) for labels in (y_train, y_val, y_test)
)

# print each distribution under its own heading
for heading, proportions in (
    ("Original Proportions:", original_proportions),
    ("\nTrain Set Proportions:", train_proportions),
    ("\nValidation Set Proportions:", val_proportions),
    ("\nTest Set Proportions:", test_proportions),
):
    print(heading)
    print(proportions)


Original Proportions:
Pass           0.511591
Fail           0.298158
Distinction    0.122178
Withdrawn      0.068074
Name: final_result, dtype: float64

Train Set Proportions:
Pass           0.511606
Fail           0.298155
Distinction    0.122189
Withdrawn      0.068051
Name: final_result, dtype: float64

Validation Set Proportions:
Pass           0.511568
Fail           0.298162
Distinction    0.122162
Withdrawn      0.068108
Name: final_result, dtype: float64

Test Set Proportions:
Pass           0.511568
Fail           0.298162
Distinction    0.122162
Withdrawn      0.068108
Name: final_result, dtype: float64


In [224]:
# missing values in X_train, X_test, y_train, y_test
# NaN counts for every split (not used by the report below)
(missing_values_X_train, missing_values_X_test,
 missing_values_y_train, missing_values_y_test,
 missing_values_X_val, missing_values_y_val) = (
    split.isnull().sum() for split in (X_train, X_test, y_train, y_test, X_val, y_val)
)

# feature rows that contain at least one NaN
rows_with_missing_X_train, rows_with_missing_X_test, rows_with_missing_X_val = (
    features[features.isnull().any(axis=1)] for features in (X_train, X_test, X_val)
)

# target entries that are NaN
rows_with_missing_y_train, rows_with_missing_y_test, rows_with_missing_y_val = (
    target[target.isnull()] for target in (y_train, y_test, y_val)
)

# report how many rows of each split are affected
for label, affected_rows in (
    ("Missing values in X_train:", rows_with_missing_X_train),
    ("Missing values in X_test:", rows_with_missing_X_test),
    ("Missing values in y_train:", rows_with_missing_y_train),
    ("Missing values in y_test:", rows_with_missing_y_test),
    ("Missing values in X_val:", rows_with_missing_X_val),
    ("Missing values in y_val:", rows_with_missing_y_val),
):
    print(label, len(affected_rows))



Missing values in X_train: 0
Missing values in X_test: 0
Missing values in y_train: 0
Missing values in y_test: 0
Missing values in X_val: 0
Missing values in y_val: 0


In [225]:
# dtypes that are treated as numeric features
numeric_dtypes = ['int64', 'float64']

# split the training columns into numeric and everything else
numeric_columns = list(X_train.select_dtypes(include=numeric_dtypes).columns)
non_numeric_columns = list(X_train.select_dtypes(exclude=numeric_dtypes).columns)

# one print call, one item per line
print(
    "Numeric Columns:",
    numeric_columns,
    "\n",
    "Non-Numeric Columns:",
    non_numeric_columns,
    sep="\n",
)


Numeric Columns:
['num_of_prev_attempts', 'studied_credits', 'prop_submissions', 'avg_score', 'submission_distance', 'stu_activity_count', 'stu_activity_type_count', 'stu_total_clicks', 'stu_days_active']


Non-Numeric Columns:
['subject']


In [226]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [227]:
nominal_cols = ['subject']

# One-Hot Encoding - the training split defines the set of dummy columns
X_train_nominal_encoded = pd.get_dummies(X_train[nominal_cols])

# get_dummies only creates columns for the categories present in the frame it is given,
# so align test/val to the training columns: a category missing from a split becomes an
# all-zero column, and a category unseen in training is dropped
X_test_nominal_encoded = pd.get_dummies(X_test[nominal_cols]).reindex(
    columns=X_train_nominal_encoded.columns, fill_value=0)
X_val_nominal_encoded = pd.get_dummies(X_val[nominal_cols]).reindex(
    columns=X_train_nominal_encoded.columns, fill_value=0)

# reset indices so the frames line up row-for-row with the scaled numeric frames
X_train_nominal_encoded = X_train_nominal_encoded.reset_index(drop=True)
X_test_nominal_encoded = X_test_nominal_encoded.reset_index(drop=True)
X_val_nominal_encoded = X_val_nominal_encoded.reset_index(drop=True)

print("Shape of X_train_nominal_encoded:", X_train_nominal_encoded.shape)
print("Shape of X_test_nominal_encoded:", X_test_nominal_encoded.shape)
print("Shape of X_val_nominal_encoded:", X_val_nominal_encoded.shape)


Shape of X_train_nominal_encoded: (13872, 2)
Shape of X_test_nominal_encoded: (4625, 2)
Shape of X_val_nominal_encoded: (4625, 2)


In [228]:

# standard Scaling
# numeric feature block of each split
X_train_numeric, X_test_numeric, X_val_numeric = (
    split[numeric_columns] for split in (X_train, X_test, X_val)
)

# fit the scaler on the training split only, then apply it to the other two
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled, X_val_scaled = (
    scaler.transform(part) for part in (X_test_numeric, X_val_numeric)
)

# wrap the scaled arrays in dataframes with a fresh 0..n-1 index
X_train_scaled_reset, X_test_scaled_reset, X_val_scaled_reset = (
    pd.DataFrame(scaled, columns=numeric_columns).reset_index(drop=True)
    for scaled in (X_train_scaled, X_test_scaled, X_val_scaled)
)

# place the one-hot columns in front of the scaled numeric columns
X_train_transformed = pd.concat([X_train_nominal_encoded, X_train_scaled_reset], axis='columns')
X_test_transformed = pd.concat([X_test_nominal_encoded, X_test_scaled_reset], axis='columns')
X_val_transformed = pd.concat([X_val_nominal_encoded, X_val_scaled_reset], axis='columns')

# report the final shapes
for label, frame in (
    ("Shape of X_train_transformed:", X_train_transformed),
    ("Shape of X_test_transformed:", X_test_transformed),
    ("Shape of X_val_transformed:", X_val_transformed),
):
    print(label, frame.shape)


Shape of X_train_transformed: (13872, 11)
Shape of X_test_transformed: (4625, 11)
Shape of X_val_transformed: (4625, 11)


In [229]:
# check the columns and dtypes of the transformed training features
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13872 entries, 0 to 13871
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   subject_SocSci           13872 non-null  uint8  
 1   subject_Stem             13872 non-null  uint8  
 2   num_of_prev_attempts     13872 non-null  float64
 3   studied_credits          13872 non-null  float64
 4   prop_submissions         13872 non-null  float64
 5   avg_score                13872 non-null  float64
 6   submission_distance      13872 non-null  float64
 7   stu_activity_count       13872 non-null  float64
 8   stu_activity_type_count  13872 non-null  float64
 9   stu_total_clicks         13872 non-null  float64
 10  stu_days_active          13872 non-null  float64
dtypes: float64(9), uint8(2)
memory usage: 1002.6 KB


In [230]:
# copy the data
y_train_binary = y_train.copy()

# map values to 'intervene' and 'no_intervene'
y_train_binary.replace({'Pass': 'no_intervene', 'Distinction': 'no_intervene',
                        'Withdrawn': 'intervene', 'Fail': 'intervene'}, inplace=True)

y_test_binary = y_test.copy()
y_test_binary.replace({'Pass': 'no_intervene', 'Distinction': 'no_intervene',
                        'Withdrawn': 'intervene', 'Fail': 'intervene'}, inplace=True)

In [231]:
# save the binary test labels and the transformed test features; the file names carry
# the prediction point so a run with a different cut-off cannot be mislabelled as 150
y_test_binary.to_csv(f'y_test_binary_{prediction_point}.csv', index=False)
X_test_transformed.to_csv(f'X_test_transformed_{prediction_point}.csv', index=False)

In [233]:
# confirm the exported labels are held in a pandas Series
print(type(y_test_binary))

<class 'pandas.core.series.Series'>


In [234]:
# confirm the exported features are held in a pandas DataFrame
print(type(X_test_transformed))

<class 'pandas.core.frame.DataFrame'>
