<a href="https://colab.research.google.com/github/unofficial-Jona/XAI-Group/blob/master/XAI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [66]:
from zipfile import ZipFile
from tqdm import tqdm
import requests

def download_file_from_google_drive(id, destination):
    """Fetch a Google Drive file by id and write it to ``destination``.

    A first request is sent for the file. If the response carries a
    download-warning cookie, the request is repeated with that cookie's value
    as the ``confirm`` parameter. The final response body is then saved.
    """
    endpoint = "https://docs.google.com/uc?export=download"
    http = requests.Session()

    query = {'id': id}
    reply = http.get(endpoint, params=query, stream=True)

    confirm_token = get_confirm_token(reply)
    if confirm_token:
        # Repeat the request, this time acknowledging the warning.
        query = {'id': id, 'confirm': confirm_token}
        reply = http.get(endpoint, params=query, stream=True)

    save_response_content(reply, destination)

def get_confirm_token(response):
    """Return the value of the first cookie whose name starts with
    ``download_warning``, or ``None`` when the response has no such cookie."""
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file ``destination``.

    The body is read in chunks of ``CHUNK_SIZE`` bytes and a tqdm progress
    bar is shown while writing.

    Args:
        response: streamed HTTP response exposing ``iter_content``.
        destination: path of the file to (over)write in binary mode.
    """
    CHUNK_SIZE = 32768
    # Total size of the downloaded zip file:
    total_size = 88397334

    # The context manager closes the bar even if the download fails midway.
    with tqdm(total=total_size, unit='iB', unit_scale=True,
              desc="Downloading dataset...") as pbar:
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                # Filter out keep-alive new chunks:
                if chunk:
                    f.write(chunk)
                    # Advance by the bytes actually received; a chunk may be
                    # shorter than CHUNK_SIZE, so counting CHUNK_SIZE per
                    # chunk overstates the progress.
                    pbar.update(len(chunk))
        pbar.set_description_str("Download completed...")


def download_dataset(dataset_path, destination):
    """Download the dataset archive to ``destination`` and unpack it.

    The archive is fetched from a fixed Google Drive file id and all of its
    members are extracted into the ``dataset_path`` directory.
    """
    drive_file_id = '1kLBu_CMGicN9IBI6Rrb61c_uef2SPMa2'
    download_file_from_google_drive(drive_file_id, destination)

    archive = ZipFile(destination, 'r')
    with archive:
        # Unpack every archive member below dataset_path.
        archive.extractall(dataset_path)
        print("Extraction completed...")



In [67]:
# Save the archive as the file 'full_dataset' and extract it into the
# directory 'zip_file' (first argument = extraction directory, second = archive path).
download_dataset('zip_file', 'full_dataset')

Download completed...: 88.5MiB [00:07, 11.4MiB/s]


Extraction completed...


In [68]:
# Load the seven CSV tables from the extracted archive into DataFrames.
ass = pd.read_csv("zip_file/anonymiseddata/assessments.csv")
courses = pd.read_csv("zip_file/anonymiseddata/courses.csv")
results = pd.read_csv("zip_file/anonymiseddata/studentAssessment.csv")
info = pd.read_csv("zip_file/anonymiseddata/studentInfo.csv")
# Note the naming: `vle` holds studentVle.csv, while vle.csv is loaded as `materials` below.
vle = pd.read_csv("zip_file/anonymiseddata/studentVle.csv")
reg = pd.read_csv("zip_file/anonymiseddata/studentRegistration.csv")
materials = pd.read_csv("zip_file/anonymiseddata/vle.csv")

# pre-processing, based on kaggle

In [69]:
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

# Treat assessment ids as labels rather than numbers.
ass['id_assessment'] = ass['id_assessment'].astype(object)

# Total weight of the non-exam assessments per module presentation.
# Only the last expression of a cell is rendered, so this table is shown
# explicitly instead of being computed and thrown away.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
display(
    ass[ass['assessment_type'] != 'Exam']
    .groupby(['code_module', 'code_presentation'])
    .agg(total_weight=('weight', 'sum'))
)

# TMAs that carry a weight of zero.
ass[(ass['assessment_type'] == 'TMA') & (ass['weight'] == 0)]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
48,BBB,2014J,15020,TMA,19.0,0.0
182,GGG,2013J,37415,TMA,61.0,0.0
183,GGG,2013J,37416,TMA,124.0,0.0
184,GGG,2013J,37417,TMA,173.0,0.0
192,GGG,2014B,37425,TMA,61.0,0.0
193,GGG,2014B,37426,TMA,117.0,0.0
194,GGG,2014B,37427,TMA,166.0,0.0
202,GGG,2014J,37435,TMA,61.0,0.0
203,GGG,2014J,37436,TMA,124.0,0.0
204,GGG,2014J,37437,TMA,173.0,0.0


In [70]:
# Summed weight per assessment type for module BBB, by presentation.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
(
    ass[ass['code_module'] == 'BBB']
    .groupby(['code_module', 'code_presentation', 'assessment_type'])
    .agg(weight_by_type=('weight', 'sum'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,weight_by_type
code_module,code_presentation,assessment_type,Unnamed: 3_level_1
BBB,2013B,CMA,5.0
BBB,2013B,Exam,100.0
BBB,2013B,TMA,95.0
BBB,2013J,CMA,5.0
BBB,2013J,Exam,100.0
BBB,2013J,TMA,95.0
BBB,2014B,CMA,5.0
BBB,2014B,Exam,100.0
BBB,2014B,TMA,95.0
BBB,2014J,Exam,100.0


In [71]:
# Overwrite the weights of module GGG: every TMA gets 100/3, every CMA gets 0.
is_ggg = ass['code_module'] == 'GGG'
ass.loc[is_ggg & (ass['assessment_type'] == 'TMA'), 'weight'] = 100 / 3
ass.loc[is_ggg & (ass['assessment_type'] == 'CMA'), 'weight'] = 0

In [72]:
# Store both identifier columns of `results` with object dtype.
for id_column in ('id_assessment', 'id_student'):
    results[id_column] = results[id_column].astype(object)

In [73]:
# Replace every missing value in `results` with 0, in place. No column subset
# is given, so this applies to all columns of the table.
results.fillna(0, inplace=True)

In [74]:
# Store the student ids of the registration table with object dtype.
reg['id_student'] = reg['id_student'].astype(object)

In [75]:
# Distinct student ids in the registration table and in the student-info table.
# NOTE(review): neither array is referenced again in the notebook as shown.
df1_IDs = reg['id_student'].unique()
df2_IDs = info['id_student'].unique()

In [76]:
# Store the student ids of the student-info table with object dtype.
info['id_student'] = info['id_student'].astype(object)

In [77]:
# Join each row of `vle` to its matching row in `materials`, then discard
# the 'week_from', 'week_to' and 'date' columns.
vle_keys = ['code_module', 'code_presentation', 'id_site']
VLEmaterials = (
    pd.merge(vle, materials, on=vle_keys, how='inner')
    .drop(columns=['week_from', 'week_to', 'date'])
)

In [78]:
# Preview: total clicks per student and module presentation.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
(
    VLEmaterials
    .groupby(['code_module', 'code_presentation', 'id_student'])
    .agg(total_click=('sum_click', 'sum'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_click
code_module,code_presentation,id_student,Unnamed: 3_level_1
AAA,2013J,11391,934
AAA,2013J,28400,1435
AAA,2013J,30268,281
AAA,2013J,31604,2158
AAA,2013J,32885,1034
...,...,...,...
GGG,2014J,2640965,41
GGG,2014J,2645731,893
GGG,2014J,2648187,312
GGG,2014J,2679821,275


In [79]:
# Total clicks per student and module presentation, as a flat table.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
total_click_per_student = (
    VLEmaterials
    .groupby(['code_module', 'code_presentation', 'id_student'])
    .agg(total_click=('sum_click', 'sum'))
    .reset_index()
)

In [80]:
# Attach the matching `courses` row to every registration (inner join on module and presentation).
regCourses = pd.merge(reg, courses, on=['code_module', 'code_presentation'], how='inner')

In [81]:
# Add the student-info columns, matching on module, presentation and student id (inner join).
regCoursesInfo = pd.merge(regCourses, info, on=['code_module', 'code_presentation', 'id_student'], how='inner')

In [82]:
# Join assessment metadata with the student results; the inner join keeps
# only assessment ids present in both tables.
assResults = pd.merge(ass, results, on=['id_assessment'], how='inner')
# Select the columns to keep and fix their order.
assResults = assResults[['id_student', 'code_module', 'code_presentation', 'id_assessment', 'assessment_type', 'date', 'date_submitted', 'weight', 'is_banked', 'score']]

In [83]:
### Weighted score per student and module presentation ###
# (a) Build `scores` as a new frame carrying the helper column.
#     `scores = assResults` would only create an alias, so adding the column
#     would also modify assResults and every frame derived from it later.
scores = assResults.assign(**{'weight*score': assResults['weight'] * assResults['score']})

# (b) Aggregate recorded weight*score per student
#     per module presentation.
#     The 'sum' string alias replaces the builtin `sum`, for which
#     pandas >= 2.1 emits a FutureWarning in agg().
sum_scores = (
    scores
    .groupby(['id_student', 'code_module', 'code_presentation'])
    .agg(weightByScore=('weight*score', 'sum'))
    .reset_index()
)

# (c) Calculate total recorded weight of module
# (c.i) Get total weight of modules
total_weight = (
    ass
    .groupby(['code_module', 'code_presentation'])
    .agg(total_weight=('weight', 'sum'))
    .reset_index()
)
# (c.ii) Subtract 100 to account for missing exams
# NOTE(review): any exam results that do exist in `results` still contribute
# to weightByScore above - confirm this is intended.
total_weight['total_weight'] = total_weight['total_weight'] - 100
# (c.iii) Set the total weight of module DDD to 200
total_weight.loc[(total_weight.code_module == 'DDD'), 'total_weight'] = 200

### Calculate weighted score ###
# (a) Merge sum_scores and total_weight tables
score_weights = pd.merge(sum_scores, total_weight, on=['code_module', 'code_presentation'], how='inner')
# (b) Calculate weighted score
score_weights['weighted_score'] = score_weights['weightByScore'] / score_weights['total_weight']
# (c) Drop helper columns
score_weights = score_weights.drop(columns=['weightByScore', 'total_weight'])

In [84]:
# Days between the submission date and the assessment date; a positive gap
# marks the submission as late.
submission_gap = assResults['date_submitted'] - assResults['date']
lateSubmission = assResults.assign(
    submission_days=submission_gap,
    late_submission=submission_gap > 0,
)

In [85]:
# Number of late submissions per student and module presentation.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
total_late_per_student = (
    lateSubmission
    .groupby(['id_student', 'code_module', 'code_presentation'])
    .agg(total_late_submission=('late_submission', 'sum'))
    .reset_index()
)

In [86]:
# Number of result rows per student and module presentation.
student_keys = ['id_student', 'code_module', 'code_presentation']
total_count_assessments = (
    lateSubmission
    .groupby(student_keys)
    .size()
    .reset_index(name='total_assessments')
)

In [87]:
# Late-submission rate = late submissions / all result rows of the student.
# The two helper counts are dropped once the rate has been derived.
late_rate_per_student = (
    pd.merge(total_late_per_student, total_count_assessments,
             on=['id_student', 'code_module', 'code_presentation'], how='inner')
    .assign(late_rate=lambda counts: counts['total_late_submission'] / counts['total_assessments'])
    .drop(columns=['total_late_submission', 'total_assessments'])
)

In [88]:
# Flag every result with a score below 40 as a fail.
passRate = assResults.assign(fail=assResults['score'] < 40)

In [89]:
# Number of failed assessments per student and module presentation.
# The 'sum' string alias replaces the builtin `sum`, for which pandas >= 2.1
# emits a FutureWarning in agg().
total_fails_per_student = (
    passRate
    .groupby(['id_student', 'code_module', 'code_presentation'])
    .agg(total_fails=('fail', 'sum'))
    .reset_index()
)



In [90]:
# Fail rate = failed assessments / all result rows of the student.
# The two helper counts are dropped once the rate has been derived.
fail_rate_per_student = (
    pd.merge(total_fails_per_student, total_count_assessments,
             on=['id_student', 'code_module', 'code_presentation'], how='inner')
    .assign(fail_rate=lambda counts: counts['total_fails'] / counts['total_assessments'])
    .drop(columns=['total_fails', 'total_assessments'])
)

In [91]:
# Combine weighted score, late rate and fail rate into one row per student
# and module presentation (inner joins on the shared keys).
assessments = (
    score_weights
    .merge(late_rate_per_student, on=['id_student', 'code_module', 'code_presentation'], how='inner')
    .merge(fail_rate_per_student, on=['id_student', 'code_module', 'code_presentation'], how='inner')
)

In [92]:
# Left join: every row of regCoursesInfo is kept, with total_click missing where no click data matches.
merged = pd.merge(regCoursesInfo, total_click_per_student, on=['id_student', 'code_module', 'code_presentation'], how='left')

In [93]:
# Left join again: rows without assessment features keep missing values in those columns.
merged = pd.merge(merged, assessments, on=['id_student', 'code_module', 'code_presentation'], how='left')

In [94]:
# List the distinct outcome labels found in final_result.
merged['final_result'].unique()

array(['Pass', 'Withdrawn', 'Fail', 'Distinction'], dtype=object)

In [95]:
# Write the merged table as CSV, with header and without the index.
# The file name has no extension and is the same name read back in the "data split" section.
merged.to_csv ('final_dataset', index = False, header=True)

# data split

In [96]:
# Read the file 'final_dataset' as CSV and display the resulting frame.
df = pd.read_csv('final_dataset')
df

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration,module_presentation_length,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,total_click,weighted_score,late_rate,fail_rate
0,AAA,2013J,11391,-159.0,,268,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,934.0,82.400000,0.000000,0.0
1,AAA,2013J,28400,-53.0,,268,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,1435.0,65.400000,0.400000,0.0
2,AAA,2013J,30268,-92.0,12.0,268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,281.0,,,
3,AAA,2013J,31604,-52.0,,268,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,2158.0,76.300000,0.000000,0.0
4,AAA,2013J,32885,-176.0,,268,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,1034.0,55.000000,1.000000,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,-4.0,,269,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail,41.0,,,
32589,GGG,2014J,2645731,-23.0,,269,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction,893.0,77.666667,0.222222,0.0
32590,GGG,2014J,2648187,-129.0,,269,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass,312.0,70.000000,0.000000,0.0
32591,GGG,2014J,2679821,-49.0,101.0,269,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn,275.0,27.666667,0.000000,0.0


In [97]:
from sklearn import model_selection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train, test = model_selection.train_test_split(df, test_size=0.2, random_state=42)

# data preprocessing



In [98]:
class PrepareDataset(BaseEstimator, TransformerMixin):
    """Split a student table into features and a boolean pass/fail target.

    The ``final_result`` column becomes the target: 'Pass' and 'Distinction'
    map to ``True``, every other value (e.g. 'Fail', 'Withdrawn') to
    ``False``. All remaining columns are returned as features. With
    ``prepare_nn=True`` the features are additionally min-max scaled.

    No encoding or imputation happens here: non-numeric columns and missing
    values are passed through unchanged.

    Args:
        prepare_nn: when True, return min-max scaled features (NumPy array)
            instead of the unscaled DataFrame.
    """

    # Outcomes that count as the positive ("pass") class.
    PASS_LABELS = ('Pass', 'Distinction')

    def __init__(self, prepare_nn=False):
        self.prepare_nn = prepare_nn

    def fit(self, X, y=None):
        """Learn the scaling range from ``X`` when ``prepare_nn`` is set.

        ``y`` is accepted (and ignored) so the estimator can also be called
        as ``fit(X, y)``; ``fit(X)`` keeps working. Returns ``self``.
        """
        if self.prepare_nn:
            features = X.drop(columns='final_result', errors='ignore')
            self.scaler_ = MinMaxScaler().fit(features)
        return self

    def transform(self, X):
        """Return ``(features, target)`` for the table ``X``.

        ``X`` must contain a ``final_result`` column.
        """
        if self.prepare_nn and not hasattr(self, 'scaler_'):
            # Keep supporting transform() without a prior fit(): the first
            # table seen defines the scaling range, and later tables (e.g. the
            # test set) reuse it instead of being rescaled on their own.
            self.fit(X)

        # isin() yields a genuine boolean mask. The previous np.where chain
        # left 'Withdrawn' as a string, which astype('bool') turned into
        # True - withdrawn students were labelled as passing.
        y = X['final_result'].isin(self.PASS_LABELS).to_numpy()
        features = X.drop('final_result', axis=1)
        if self.prepare_nn:
            features = self.scaler_.transform(features)
        return features, y



In [99]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed.
sm = SMOTE(random_state=42)



In [102]:
# Split the training frame into features and target. prepare_nn keeps its
# default (False), so no scaling is applied. The last line shows both shapes.
data_prep = PrepareDataset()
X_train, y_train = data_prep.transform(train)
X_train.shape, y_train.shape

((26074, 18), (26074,))

In [103]:
# Oversample the training data with SMOTE and show the resulting shapes.
# NOTE(review): the recorded output of this cell is a ValueError. X_train still
# contains the text columns visible in `df` - presumably the cause; confirm.
X_train, y_train = sm.fit_resample(X_train, y_train)
X_train.shape, y_train.shape

ValueError: ignored

In [105]:
# Inspect the training target vector.
y_train

array([ True,  True,  True, ...,  True,  True,  True])

# model set up

## SVC

In [106]:
from sklearn.svm import SVC

# Linear-kernel SVM. probability=True makes predict_proba available, which the
# LIME section passes as its prediction function.
clf_SVC = SVC(kernel='linear', probability=True)
clf_SVC.fit(X_train, y_train)

ValueError: ignored

## random forest

In [107]:
from sklearn.ensemble import RandomForestClassifier


# Random forest with default hyper-parameters. The seed is fixed so that the
# fitted model and every metric derived from it are reproducible on re-run.
clf_RanFor = RandomForestClassifier(random_state=42)
clf_RanFor.fit(X_train,y_train)

ValueError: ignored

## MLP

In [108]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score


# Multi-layer perceptron with a fixed seed and at most 300 training iterations.
clf_MLP = MLPClassifier(random_state=1, max_iter=300)
clf_MLP.fit(X_train, y_train)

ValueError: ignored

# assess accuracy (train data)


## SVC

In [109]:
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold class labels; kept in y_scores for the metric cells that follow.
y_scores = cross_val_predict(clf_SVC, X_train, y_train, cv=3)

# A ROC curve needs a continuous score. Hard labels give a single operating
# point, so the curve is drawn from the out-of-fold probability of the
# positive class instead. (This runs the cross-validation a second time.)
y_proba = cross_val_predict(clf_SVC, X_train, y_train, cv=3, method='predict_proba')[:, 1]

fpr, tpr, threshold = roc_curve(y_train, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - SVC (3-fold CV, train data)');

ValueError: ignored

In [110]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_train, y_scores)

NameError: ignored

In [None]:
# Confusion matrix of the true training labels against the cross-validated SVC predictions.
confusion_matrix(y_train, y_scores)

## random forest

In [None]:
# Out-of-fold class labels; kept in y_scores for the metric cells that follow.
y_scores = cross_val_predict(clf_RanFor, X_train, y_train, cv=3)

# A ROC curve needs a continuous score. Hard labels give a single operating
# point, so the curve is drawn from the out-of-fold probability of the
# positive class instead. (This runs the cross-validation a second time.)
y_proba = cross_val_predict(clf_RanFor, X_train, y_train, cv=3, method='predict_proba')[:, 1]

fpr, tpr, threshold = roc_curve(y_train, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - random forest (3-fold CV, train data)');

In [None]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_train, y_scores)

In [None]:
# Confusion matrix of the true training labels against the cross-validated random-forest predictions.
confusion_matrix(y_train, y_scores)

## MLP

In [None]:
# Out-of-fold class labels; kept in y_scores for the metric cells that follow.
y_scores = cross_val_predict(clf_MLP, X_train, y_train, cv=3)

# A ROC curve needs a continuous score. Hard labels give a single operating
# point, so the curve is drawn from the out-of-fold probability of the
# positive class instead. (This runs the cross-validation a second time.)
y_proba = cross_val_predict(clf_MLP, X_train, y_train, cv=3, method='predict_proba')[:, 1]

fpr, tpr, threshold = roc_curve(y_train, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - MLP (3-fold CV, train data)');

In [None]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_train, y_scores)

In [None]:
# Confusion matrix of the true training labels against the cross-validated MLP predictions.
confusion_matrix(y_train, y_scores)

# assess accuracy (test data)

In [None]:
# Split the held-out frame into features and target.
# The test set is deliberately NOT passed through SMOTE. Oversampling belongs
# to training only; evaluating on synthetic, re-balanced rows would no longer
# measure performance on the real class distribution.
X_test, y_test = data_prep.transform(test)

## SVC

In [None]:
# Predicted class labels; kept in y_scores for the metric cells that follow.
y_scores = clf_SVC.predict(X_test)

# The ROC curve is drawn from the predicted probability of the positive
# class, because hard labels give only a single operating point.
y_proba = clf_SVC.predict_proba(X_test)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - SVC (test data)');

In [None]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_test, y_scores)

In [None]:
# Confusion matrix of the true test labels against the SVC predictions.
confusion_matrix(y_test, y_scores)

## random forest


In [None]:
# Predicted class labels; kept in y_scores for the metric cells that follow.
y_scores = clf_RanFor.predict(X_test)

# The ROC curve is drawn from the predicted probability of the positive
# class, because hard labels give only a single operating point.
y_proba = clf_RanFor.predict_proba(X_test)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - random forest (test data)');

In [None]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_test, y_scores)

In [None]:
# Confusion matrix of the true test labels against the random-forest predictions.
confusion_matrix(y_test, y_scores)

## MLP

In [None]:
# Predicted class labels; kept in y_scores for the metric cells that follow.
y_scores = clf_MLP.predict(X_test)

# The ROC curve is drawn from the predicted probability of the positive
# class, because hard labels give only a single operating point.
y_proba = clf_MLP.predict_proba(X_test)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - MLP (test data)');

In [None]:
# NOTE(review): y_scores holds predicted class labels, not probabilities, so
# this AUC reflects a single operating point rather than a ranking - confirm intent.
roc_auc_score(y_test, y_scores)

In [None]:
# Confusion matrix of the true test labels against the MLP predictions.
confusion_matrix(y_test, y_scores)

# XAI methods

In [None]:
! pip install lime
! pip install shap
! pip install pdpbox

## LIME

In [None]:
from lime import lime_tabular

def run_lime(classifier, sample):
  """Explain the prediction of ``classifier`` for one ``sample`` with LIME.

  A tabular explainer is built from the global ``X_train`` on every call,
  with feature names taken from the columns of the global ``df`` (minus
  'final_result'). The explanation is rendered in the notebook, and the
  result of that rendering call is returned.
  """
  column_names = df.drop('final_result', axis=1).columns
  lime_explainer = lime_tabular.LimeTabularExplainer(
      training_data=np.array(X_train),
      feature_names=column_names,
      class_names=['Fail', 'Pass'],
      mode='classification',
  )
  explanation = lime_explainer.explain_instance(
      data_row=sample,
      predict_fn=classifier.predict_proba,
  )
  return explanation.show_in_notebook(show_table=True)
  

In [None]:
# Explain the SVC prediction for one test sample with LIME.
# NOTE(review): X_test[15] selects row 15 only if X_test is a NumPy array;
# on a DataFrame it would be a column lookup - confirm the type of X_test.
instance = X_test[15]
run_lime(clf_SVC, instance)

## SHAP

In [None]:
import shap

# Background sample for the explainer and the test rows to be explained.
# Every plot below uses X_explained, so the SHAP values and the feature
# matrix always describe the same rows. (Previously the 50-row SHAP values
# were paired with the full X_test in the multi-instance and summary plots.)
X_background = X_train[0:100, :]
X_explained = X_test[0:50, :]

#We will use SHAP KernelExplainer to explain the model.
explainer = shap.KernelExplainer(model=clf_MLP.predict_proba, data=X_background)

#Next, we compute the SHAP values
shap_values = explainer.shap_values(X=X_explained)

#Binary classification: one entry per class is expected, so len == 2.
# NOTE(review): the indexing below assumes a list with one array per class;
# confirm that the installed shap version returns this layout.
print(len(shap_values))
#Shape of the values for one class: (rows explained, number of features)
print(shap_values[0].shape)

#Explaining a single prediction for passing
shap.initjs()
plot = shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_explained[0,:])
shap.save_html('plot_1_instances.html', plot)

#Explaining a single prediction for failing
plot = shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_explained[0,:])
shap.save_html('plot_2_instances.html', plot)

#Explaining predictions for passing for the 50 explained instances of X_test
plot = shap.force_plot(explainer.expected_value[1], shap_values[1], X_explained)
shap.save_html('plot_X_test_instances.html', plot)

#Shap summary plot (the call draws the figure itself; wrapping it in print() only printed "None")
shap.summary_plot(shap_values, X_explained)


## PDP

In [None]:
from pdpbox import pdp, get_dataset, info_plots


# Partial dependence of the MLP prediction on a single feature.
# NOTE(review): apart from 'num_of_prev_attempts', none of these names appear
# among the columns of `df` displayed in the "data split" section (there is no
# 'exam_score', 'weighted_grade', 'pass_rate', 'date' or 'sum_click' column).
# This list looks like it belongs to an earlier feature set - confirm and
# align it with the columns the model was actually trained on.
features = ['num_of_prev_attempts','weighted_grade','pass_rate','exam_score','date','sum_click']
pdp_goals = pdp.pdp_isolate(model=clf_MLP, dataset=df.drop('final_result', axis=1), model_features= features, feature='exam_score')

# Plot the isolated partial-dependence result for 'exam_score'.
pdp.pdp_plot(pdp_goals, 'exam_score')
plt.show()