In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import datetime
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import math
from numpy.linalg import svd

from sklearn.ensemble import RandomForestClassifier

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


### Loading the initial dataset

In [2]:
path = os.getcwd() + '/../dataset/anonymized_dataset_for_ADM2017/'
print(path + 'student_log_{}.csv'.format(1))
log1 = pd.read_csv(path + 'student_log_{}.csv'.format(1))
initial_col_order = [col.lower() for col in list(log1.columns)]

data = pd.DataFrame()
for i in range(1, 10):
    student_log = pd.read_csv(path + 'student_log_{}.csv'.format(i))
    student_log.columns = [col.lower() for col in student_log]
    data = pd.concat([data, student_log], ignore_index=True)

# reorder everything with the original order, where student id is in the first column
data = data[initial_col_order]
data.shape
data.head()

/Users/yeldosbalgabekov/Desktop/Books and Schools/UoSouthampton/Data Mining/DataMining/Yeldos_Richa/../dataset/anonymized_dataset_for_ADM2017/student_log_1.csv


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,confidence(off task),confidence(gaming),res_bored,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln
0,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.83871,0.008522,0.376427,0.320317,0.0,0.0,0.785585,0.000264,0.13,0.0611904
1,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.0611904,0.21351
2,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.116,0.0333058
3,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.116,0.0333058
4,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.0333058,0.118386


### Transforming the initial dataset and loading Training Set

In [3]:
def to_float(x):
    try:
        if x.isdigit():
            return float(x)
    except:
        if x == '.':
            return 0
        return x

def series_types(series):
    types = []
    for i in set(series):
        if not type(i) in types:
            types.append(type(i))
    return(types)


data['sy assistments usage'] = data['sy assistments usage'].astype('category')
data['skill'] = data['skill'].astype('category')
data['problemtype'] = data['problemtype'].astype('category')
data['ln-1'] = data['ln-1'].apply(lambda x: to_float(x))
data['ln'] = data['ln'].apply(lambda x: to_float(x))
# data.memory_usage(deep=True)
# data.info(memory_usage='deep')

training = pd.read_csv(path + 'training_label.csv').drop(['AveCorrect'], axis=1)
training.columns = [col.lower() for col in training.columns]
training.shape

# merged data set with isSTEM
df = data.merge(training, on='itest_id', how="left")
#df.info()


### Filtering for the actions of the "training" students

In [8]:
actions = df[df.isstem.notnull()]
actions['skill'] = actions.skill.astype('category')
print("training set size: ", training.shape[0], "actions for # of students: ", len(actions.itest_id.unique()))
print("sample size useless for training: ", training.shape[0] - len(actions.itest_id.unique()))

# merged training data set
stem = actions[actions.isstem == 1]
nonstem = actions[actions.isstem == 0]

actions.head()

training set size:  514 actions for # of students:  326
sample size useless for training:  188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln,schoolid,mcas,isstem
1056,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.262675,0.060808,0.0,0.889219,0.005797,0.271,0.077899,2.0,34.0,0.0
1057,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.696486,0.0,0.009561,0.108417,0.001483,0.077899,0.225856,2.0,34.0,0.0
1058,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.225856,0.483008,2.0,34.0,0.0
1059,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.483008,0.74529,2.0,34.0,0.0
1060,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.898073,0.0,0.009561,0.468252,0.001483,0.74529,0.900994,2.0,34.0,0.0


### Testing the cases
Analysing other features

In [53]:
stem, non_stem = actions[actions['isstem']==1], actions[actions['isstem']==0]
balanced = pd.concat([stem, non_stem.sample(stem.shape[0])])
balanced['sy assistments usage'] = balanced['sy assistments usage'].cat.codes
balanced['skill'] = balanced['skill'].cat.codes
balanced['problemtype'] = balanced['problemtype'].cat.codes
c1 = pd.get_dummies(balanced['sy assistments usage'], prefix=['yos'])
c2 = pd.get_dummies(balanced['skill'], prefix=['skill'])
c3 = pd.get_dummies(balanced['problemtype'], prefix=['problemtype'])
print(balanced.shape, c1.shape, c2.shape, c3.shape)
cats = pd.concat([c1, c2, c3], axis=1)
cats.shape, balanced.shape
table = pd.concat([balanced.iloc[:, -1], balanced.iloc[:, 0], balanced.iloc[:, 1:-1], cats], axis=1)
table.drop(columns=['sy assistments usage', 'skill', 'problemtype'])
print(table.shape)
table.head()

(93340, 79) (93340, 2) (93340, 93) (93340, 16)
(93340, 190)


Unnamed: 0,isstem,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,...,['problemtype']_6,['problemtype']_7,['problemtype']_8,['problemtype']_9,['problemtype']_10,['problemtype']_11,['problemtype']_12,['problemtype']_13,['problemtype']_14,['problemtype']_15
72563,1.0,1047,0,0.102428,0.062206,0.304348,506,0.205797,0.689716,0.08071,...,0,0,0,0,0,0,0,0,0,1
72564,1.0,1047,0,0.102428,0.062206,0.304348,506,0.205797,0.689716,0.08071,...,0,0,0,0,0,0,0,0,0,1
72565,1.0,1047,0,0.102428,0.062206,0.304348,506,0.205797,0.689716,0.08071,...,0,0,0,0,0,0,0,0,1,0
72566,1.0,1047,0,0.102428,0.062206,0.304348,506,0.205797,0.689716,0.08071,...,0,0,0,0,0,0,0,0,1,0
72567,1.0,1047,0,0.102428,0.062206,0.304348,506,0.205797,0.689716,0.08071,...,0,0,0,0,0,0,0,0,0,1


In [90]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

#TODO: normalize
table = table.fillna(0)
voc_size = 200
kmeans = KMeans(voc_size, 'k-means++', random_state=0).fit(table.iloc[:, 2:])
kmeans.labels_


array([  7,   7,   7, ...,  11,  71, 121], dtype=int32)

In [91]:
words = kmeans.labels_.reshape(table.shape[0], 1)

In [121]:
tmp = table.iloc[:, :2]
tmp = tmp.reset_index().iloc[:, 1:]
tmp2 = pd.DataFrame(words).reset_index().iloc[:,-1]
print(tmp.shape, tmp2.shape)


(93340, 2) (93340,)


In [137]:
with_words = pd.concat([tmp, tmp2], axis=1)
with_words.columns = ['isstem', 'itest_id', 'words']
with_words = with_words.groupby(['itest_id', 'words'], as_index=False).count()
his = with_words.pivot(index='itest_id', columns='words', values='isstem').fillna(0)

In [138]:
his.head()

words,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
itest_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35,0.0,0.0,36.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,0.0,0.0,18.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205,0.0,3.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,0.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
