In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [4]:
stu_vle = pd.read_csv('data/stvl_ccc14b.csv')
stu_as = pd.read_csv('data/stas_ccc14b.csv')
ass = pd.read_csv('data/ass_ccc14b.csv')
vle = pd.read_csv('data/vle_ccc14b.csv')
stu_info = pd.read_csv('data/stuinfo_ccc14b.csv')
stu_reg = pd.read_csv('data/stureg_ccc14b.csv')

# Preprocessing

### Drop early withdrawal students

Or rather, determine which students to keep.

In [5]:
stu_reg.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,CCC,2014B,28418,-37.0,
1,CCC,2014B,29764,-34.0,
2,CCC,2014B,29820,-57.0,
3,CCC,2014B,40333,-30.0,17.0
4,CCC,2014B,40604,-17.0,


In [6]:
pop_of_interest = stu_reg.drop(stu_reg[stu_reg.date_unregistration <= 67].index)['id_student'].unique()

In [7]:
len(stu_vle.drop(stu_vle[~stu_vle.id_student.isin(pop_of_interest)].index)['id_student'].unique()) # num students left in course who clicked

1300

In [12]:
clkd_pop_of_interest = stu_vle.drop(stu_vle[~stu_vle.id_student.isin(pop_of_interest)].index)['id_student'].unique()

### Make y (targets) column

In [10]:
stu_info.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,CCC,2014B,28418,F,West Midlands Region,A Level or Equivalent,20-30%,0-35,0,30,N,Fail
1,CCC,2014B,29764,M,East Anglian Region,A Level or Equivalent,50-60%,0-35,0,90,N,Distinction
2,CCC,2014B,29820,M,East Anglian Region,HE Qualification,40-50%,0-35,0,60,N,Pass
3,CCC,2014B,40333,M,North Region,HE Qualification,0-10%,35-55,0,30,N,Withdrawn
4,CCC,2014B,40604,M,Ireland,A Level or Equivalent,,35-55,0,30,N,Pass


In [13]:
y_col = stu_info.drop(stu_info[~stu_info.id_student.isin(clkd_pop_of_interest)].index)

In [14]:
y_col = y_col.drop(['code_module', 'code_presentation', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability'], axis=1)
y_col.final_result.replace(to_replace=dict(Pass=1, Distinction=1, Fail=0, Withdrawn=0), inplace=True)
y_col = y_col.sort_values(by=['id_student'])

In [15]:
y_col.head()

Unnamed: 0,id_student,final_result
0,28418,0
1,29764,1
2,29820,1
4,40604,1
5,42638,1


### Train-test split BEFORE feature-extraction

Very important or else this would perform significantly worse in production.

In [16]:
train, test = train_test_split(y_col, test_size=0.2)

In [19]:
len(train), len(test)

(1040, 260)

In [66]:
y_train = train['final_result'].values
y_test = test['final_result'].values

### Get students click stream data (from which to generate features)

In [28]:
train_students = train['id_student'].values #To-do: assert .values == .unique

In [29]:
test_students = test['id_student'].values

In [53]:
stvl_train = stu_vle[stu_vle.id_student.isin(train_students)]
stvl_test = stu_vle[stu_vle.id_student.isin(test_students)]

In [54]:
stvl_train = stvl_train.drop(['code_module', 'code_presentation', 'id_site'], axis=1)

In [55]:
stvl_train = stvl_train.groupby(['id_student', 'date']).sum() #sum daily clicks per student

In [56]:
stvl_train = stvl_train.reset_index()

In [57]:
stvl_test = stvl_test.drop(['code_module', 'code_presentation', 'id_site'], axis=1)
stvl_test = stvl_test.groupby(['id_student', 'date']).sum()
stvl_test = stvl_test.reset_index()

In [70]:
stvl_train.head()

Unnamed: 0,id_student,date,sum_click
0,28418,-5,1
1,28418,4,19
2,28418,11,52
3,28418,14,2
4,28418,15,5


# Time Series Feature Extraction

#### Note that this first run is without imputing the data first

In [60]:
extracted_features = extract_features(stvl_ts, column_id="id_student", column_sort="date")

Feature Extraction: 100%|██████████| 20/20 [00:46<00:00,  2.04s/it]


In [90]:
impute(extracted_features)
features_filtered = select_features(extracted_features, y)





In [92]:
features_filtered.columns

Index(['sum_click__count_below_mean',
       'sum_click__fft_aggregated__aggtype_"centroid"',
       'sum_click__range_count__max_1000000000000.0__min_0',
       'sum_click__length', 'sum_click__fft_aggregated__aggtype_"variance"',
       'sum_click__number_peaks__n_1', 'sum_click__number_peaks__n_3',
       'sum_click__cid_ce__normalize_True', 'sum_click__number_peaks__n_5',
       'sum_click__number_peaks__n_10',
       ...
       'sum_click__agg_linear_trend__f_agg_"min"__chunk_len_10__attr_"intercept"',
       'sum_click__fft_coefficient__coeff_4__attr_"angle"',
       'sum_click__fft_coefficient__coeff_25__attr_"abs"',
       'sum_click__index_mass_quantile__q_0.9',
       'sum_click__fft_coefficient__coeff_26__attr_"abs"',
       'sum_click__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_8__w_10',
       'sum_click__fft_coefficient__coeff_3__attr_"imag"',
       'sum_click__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"slope"',
       'sum_click__agg_linear_trend__f_agg_"mea

In [93]:
features_filtered.head()

variable,sum_click__count_below_mean,"sum_click__fft_aggregated__aggtype_""centroid""",sum_click__range_count__max_1000000000000.0__min_0,sum_click__length,"sum_click__fft_aggregated__aggtype_""variance""",sum_click__number_peaks__n_1,sum_click__number_peaks__n_3,sum_click__cid_ce__normalize_True,sum_click__number_peaks__n_5,sum_click__number_peaks__n_10,...,"sum_click__agg_linear_trend__f_agg_""min""__chunk_len_10__attr_""intercept""","sum_click__fft_coefficient__coeff_4__attr_""angle""","sum_click__fft_coefficient__coeff_25__attr_""abs""",sum_click__index_mass_quantile__q_0.9,"sum_click__fft_coefficient__coeff_26__attr_""abs""","sum_click__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_8__w_10","sum_click__fft_coefficient__coeff_3__attr_""imag""","sum_click__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""slope""","sum_click__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""rvalue""","sum_click__fft_coefficient__coeff_20__attr_""abs"""
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28418,14.0,4.841238,22.0,22.0,14.731253,6.0,3.0,7.169995,1.0,0.0,...,1.0,-74.332052,188.102563,0.818182,200.753009,45.67616,-27.308763,-19.5,-0.409204,180.925846
29764,65.0,18.875191,89.0,89.0,195.902422,27.0,13.0,12.631683,5.0,2.0,...,1.533333,151.464895,432.905438,0.955056,55.950577,20.014289,289.893523,175.0,1.0,107.198076
29820,51.0,15.210297,71.0,71.0,136.989712,24.0,9.0,12.255684,8.0,3.0,...,1.166667,45.42007,272.938454,0.901408,183.493686,3.314806,78.30733,-66.0,-1.0,48.151843
40604,39.0,10.010927,53.0,53.0,62.300896,15.0,6.0,8.495901,3.0,2.0,...,1.380952,-111.471014,34.401666,0.716981,72.829687,11.046074,80.002907,-82.0,-1.0,88.076735
42638,132.0,37.899295,193.0,193.0,1009.906465,62.0,26.0,16.321515,10.0,8.0,...,3.728571,168.883418,564.108429,0.906736,249.45827,-5.925546,813.365024,67.5,0.891386,356.311709


# Random Forest run

In [94]:
X_train, X_test, y_train, y_test = train_test_split(features_filtered, y, random_state=21)

In [95]:
rf_clf = RandomForestClassifier(n_estimators = 50, 
                                max_depth    =  3,
                                random_state = 21)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=21, verbose=0,
                       warm_start=False)

In [96]:
rf_clf.score(X_test, y_test)

0.7876923076923077

In [97]:
xgb_clf = xgb.XGBClassifier(n_estimators    = 100, 
                              max_depth    =  5,
                              random_state = 21)
xgb_clf.fit(X_train, y_train)
xgb_clf.score(X_test, y_test)

0.7753846153846153