In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import LinearSVC



In [2]:
# Load the wide-form dataset from the Excel workbook in the working directory.
df = pd.read_excel('wideform.xlsx')

# Preview the first rows (head() defaults to 5) to sanity-check the columns.
df.head()

Unnamed: 0,code_module,code_presentation,id_student,age_band,num_of_prev_attempts,studied_credits,final_result,date_registration,date_unregistration,days_from_due_A1,...,subpage_pre-0,subpage_1-15,subpage_16-30,subpage_31-45,subpage_46-60,url_pre-0,url_1-15,url_16-30,url_31-45,url_46-60
0,AAA,2013J,11391,55<=,0,240,Pass,-159.0,,-1.0,...,11.0,9.0,2.0,1.0,,,1.0,,,
1,AAA,2013J,28400,35-55,0,60,Pass,-53.0,,3.0,...,26.0,17.0,16.0,1.0,1.0,10.0,9.0,11.0,1.0,1.0
2,AAA,2013J,31604,35-55,0,60,Pass,-52.0,,-2.0,...,19.0,19.0,17.0,17.0,3.0,3.0,10.0,8.0,12.0,3.0
3,AAA,2013J,32885,0-35,0,60,Pass,-176.0,,7.0,...,17.0,3.0,8.0,,1.0,3.0,,3.0,,
4,AAA,2013J,38053,35-55,0,60,Pass,-110.0,,0.0,...,15.0,6.0,11.0,3.0,3.0,1.0,4.0,6.0,2.0,3.0


In [3]:
# Columns that identify a row rather than describe it.
index_cols = ['code_module', 'code_presentation', 'id_student']
# Target column, kept as a list so that df[label] yields a one-column DataFrame.
label = ['final_result']

# Every column that is not explicitly excluded here is used as a feature.
# Excluded on purpose:
#   * the identifier columns and the label itself;
#   * 'age_band' - removed for now so that every feature is a number;
#   * 'date_unregistration' and the 'days_from_due_*' columns - turning their
#     NaNs into 0 does not make sense, so they need more thought first;
#   * 'Unnamed: 0' - the row index that came along with the spreadsheet; it is
#     a row counter, not a property of the student, so it must not be a predictor.
not_features = index_cols + label + [
    'age_band',
    'date_unregistration',
    'days_from_due_A1',
    'days_from_due_A2',
    'Unnamed: 0',
]
features = [col for col in df.columns if col not in not_features]

# Setting NaNs as 0, see caveats above.
# Rebinding instead of inplace=True keeps the step explicit and chain-friendly.
df = df.fillna(0)

# Binary target: fail = 1, pass = 0.
#   Distinction is currently counted as pass.
#   Withdrawn is counted as fail.
result_to_target = {'Fail': 1, 'Withdrawn': 1, 'Pass': 0, 'Distinction': 0}
# Values that are not in the mapping are left untouched, so re-running this
# cell on an already-encoded column is a no-op.
df['final_result'] = df['final_result'].map(
    lambda value: result_to_target.get(value, value)
)

In [4]:
# Hold out 20% of the rows for testing.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same scaler statistics and selected features downstream).
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[label], test_size=0.20, random_state=42
)

In [5]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
pipe = Pipeline(steps=[('scaler', StandardScaler())])
X_train = pipe.fit_transform(X_train, y_train)
X_test = pipe.transform(X_test)


In [6]:
# Inspect the scaled training matrix produced by pipe.transform above.
X_train

array([[ 1.74277414,  1.05422092, -0.00274749, ..., -0.18566096,
        -0.41712222, -0.34491373],
       [-0.34316504, -0.55489186, -1.28297281, ..., -0.34850482,
        -0.41712222, -0.34491373],
       [-0.34316504,  1.05422092, -0.8632268 , ..., -0.18566096,
        -0.41712222, -0.34491373],
       ...,
       [-0.34316504, -0.55489186,  0.35403661, ..., -0.51134868,
        -0.41712222, -0.34491373],
       [-0.34316504, -0.55489186, -0.25459509, ...,  0.30287062,
        -0.41712222, -0.1336548 ],
       [ 1.74277414,  1.05422092, -0.8842141 , ..., -0.34850482,
         0.51409016, -0.34491373]])

In [7]:
# Cast the training labels to integers before they are passed to the
# feature selector and classifier.
y_train = y_train.astype(int)

In [8]:
# Univariate ANOVA F-test filter that keeps the k best-scoring features,
# followed by a linear SVM trained on the kept features.
anova_filter = SelectKBest(f_classif, k=30)  # Adjust k to see choice cutoffs
# Seeded so that repeated runs fit the same model; LinearSVC draws random
# numbers when it solves the dual problem.
clf = LinearSVC(random_state=42)
anova_svm = make_pipeline(anova_filter, clf)
# y_train is a one-column DataFrame; pass the column itself as the 1-D target.
anova_svm.fit(X_train, y_train['final_result'])



Pipeline(steps=[('selectkbest', SelectKBest(k=30)), ('linearsvc', LinearSVC())])

In [9]:
# Boolean array with one entry per input feature: True where SelectKBest kept it.
# The step is looked up by the name make_pipeline assigned to it.
boolean_mask = anova_svm.named_steps['selectkbest'].get_support()

In [10]:
# Pair each feature name with its selection flag and keep the selected names.
chosen_features = [
    feature_name
    for is_selected, feature_name in zip(boolean_mask, features)
    if is_selected
]

In [11]:
# Show the names of the features kept by the SelectKBest step.
chosen_features

['num_of_prev_attempts',
 'studied_credits',
 'score_A1',
 'score_A2',
 'forumng_pre-0',
 'forumng_1-15',
 'forumng_16-30',
 'forumng_31-45',
 'forumng_46-60',
 'homepage_pre-0',
 'homepage_1-15',
 'homepage_16-30',
 'homepage_31-45',
 'homepage_46-60',
 'oucontent_pre-0',
 'oucontent_1-15',
 'oucontent_16-30',
 'oucontent_31-45',
 'oucontent_46-60',
 'ouwiki_16-30',
 'questionnaire_16-30',
 'questionnaire_31-45',
 'quiz_1-15',
 'quiz_31-45',
 'quiz_46-60',
 'subpage_pre-0',
 'subpage_1-15',
 'subpage_46-60',
 'url_31-45',
 'url_46-60']