In [2]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import datetime

### Loading the initial dataset

In [3]:
path = os.getcwd() + '/../dataset/anonymized_dataset_for_ADM2017/'
print(path + 'student_log_{}.csv'.format(1))
log1 = pd.read_csv(path + 'student_log_{}.csv'.format(1))
initial_col_order = [col.lower() for col in list(log1.columns)]

data = pd.DataFrame()
for i in range(1, 10):
    student_log = pd.read_csv(path + 'student_log_{}.csv'.format(i))
    student_log.columns = [col.lower() for col in student_log]
    data = pd.concat([data, student_log], ignore_index=True)

# reorder everything with the original order, where student id is in the first column
data = data[initial_col_order]
data.shape
data.head()

/Users/yeldosbalgabekov/Desktop/Books and Schools/UoSouthampton/Data Mining/DataMining/Yeldos_Richa/../dataset/anonymized_dataset_for_ADM2017/student_log_1.csv


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,confidence(off task),confidence(gaming),res_bored,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln
0,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.83871,0.008522,0.376427,0.320317,0.0,0.0,0.785585,0.000264,0.13,0.0611904
1,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.0611904,0.21351
2,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.116,0.0333058
3,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.116,0.0333058
4,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.0333058,0.118386


### Transforming the initial dataset and loading Training Set

In [4]:
def to_float(x):
    try:
        if x.isdigit():
            return float(x)
    except:
        if x == '.':
            return 0
        return x

def series_types(series):
    types = []
    for i in set(series):
        if not type(i) in types:
            types.append(type(i))
    return(types)


data['sy assistments usage'] = data['sy assistments usage'].astype('category')
data['skill'] = data['skill'].astype('category')
data['problemtype'] = data['problemtype'].astype('category')
data['ln-1'] = data['ln-1'].apply(lambda x: to_float(x))
data['ln'] = data['ln'].apply(lambda x: to_float(x))
# data.memory_usage(deep=True)
# data.info(memory_usage='deep')

training = pd.read_csv(path + 'training_label.csv').drop(['AveCorrect'], axis=1)
training.columns = [col.lower() for col in training.columns]
training.shape

# merged data set with isSTEM
df = data.merge(training, on='itest_id', how="left")
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 639898 entries, 0 to 639897
Data columns (total 79 columns):
itest_id                                  639898 non-null int64
sy assistments usage                      639898 non-null object
aveknow                                   639898 non-null float64
avecarelessness                           639898 non-null float64
avecorrect                                639898 non-null float64
numactions                                639898 non-null int64
averesbored                               639898 non-null float64
averesengcon                              639898 non-null float64
averesconf                                639898 non-null float64
averesfrust                               639898 non-null float64
averesofftask                             639898 non-null float64
averesgaming                              639898 non-null float64
actionid                                  639898 non-null int64
skill                                 

### Filtering for the actions of the "training" students

In [5]:
actions = df[df.isstem.notnull()]
actions['skill'] = actions.skill.astype('category')
print("training set size: ", training.shape[0], "actions for # of students: ", len(actions.itest_id.unique()))
print("sample size useless for training: ", training.shape[0] - len(actions.itest_id.unique()))

# merged training data set
stem = actions[actions.isstem == 1]
nonstem = actions[actions.isstem == 0]

actions.head()

training set size:  514 actions for # of students:  326
sample size useless for training:  188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln,schoolid,mcas,isstem
1056,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.262675,0.060808,0.0,0.889219,0.005797,0.271,0.077899,2.0,34.0,0.0
1057,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.696486,0.0,0.009561,0.108417,0.001483,0.077899,0.225856,2.0,34.0,0.0
1058,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.225856,0.483008,2.0,34.0,0.0
1059,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.483008,0.74529,2.0,34.0,0.0
1060,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.898073,0.0,0.009561,0.468252,0.001483,0.74529,0.900994,2.0,34.0,0.0


### Testing the cases
Top Skill Accuracy

#### Creating temporary subset removing unnecessary data

In [39]:
actions.columns

Index(['itest_id', 'sy assistments usage', 'aveknow', 'avecarelessness',
       'avecorrect', 'numactions', 'averesbored', 'averesengcon', 'averesconf',
       'averesfrust', 'averesofftask', 'averesgaming', 'actionid', 'skill',
       'problemid', 'assignmentid', 'assistmentid', 'starttime', 'endtime',
       'timetaken', 'correct', 'original', 'hint', 'hintcount', 'hinttotal',
       'scaffold', 'bottomhint', 'attemptcount', 'problemtype',
       'frishelprequest', 'frpast5helprequest', 'frpast8helprequest',
       'stlhintused', 'past8bottomout', 'totalfrpercentpastwrong',
       'totalfrpastwrongcount', 'frpast5wrongcount', 'frpast8wrongcount',
       'totalfrtimeonskill', 'timesinceskill', 'frworkinginschool',
       'totalfrattempted', 'totalfrskillopportunities', 'responseisfillin',
       'responseischosen', 'endswithscaffolding', 'endswithautoscaffolding',
       'frtimetakenonscaffolding', 'frtotalskillopportunitiesscaffolding',
       'totalfrskillopportunitiesbyscaffolding'

In [40]:
cols = ['itest_id', 'aveknow', 'avecarelessness',
       'avecorrect', 'numactions', 'averesbored', 'averesengcon', 'averesconf',
       'averesfrust', 'averesofftask', 'averesgaming', 'correct', 'original', 'hint', 'hintcount', 'hinttotal',
       'scaffold', 'bottomhint', 'attemptcount',
       'frishelprequest', 'frpast5helprequest', 'frpast8helprequest',
       'stlhintused', 'past8bottomout', 'totalfrpercentpastwrong',
       'totalfrpastwrongcount', 'frpast5wrongcount', 'frpast8wrongcount',
       'totalfrtimeonskill', 'timesinceskill', 'frworkinginschool',
       'totalfrattempted', 'totalfrskillopportunities', 'responseisfillin',
       'responseischosen', 'endswithscaffolding', 'endswithautoscaffolding',
       'frtimetakenonscaffolding', 'frtotalskillopportunitiesscaffolding',
       'totalfrskillopportunitiesbyscaffolding', 'frishelprequestscaffolding',
       'timegreater5secprev2wrong', 'sumright', 'helpaccessunder2sec',
       'timegreater10secandnextactionright', 'consecutiveerrorsinrow',
       'sumtime3sdwhen3rowright', 'sumtimeperskill',
       'totaltimebypercentcorrectforskill', 'prev5count', 'timeover80',
       'manywrong', 'isstem']
hotTable = actions[cols]
#sns.heatmap(hotTable.corr()), plt.show()

d = {}
for c in cols:
    d[c] = hotTable.isstem.corr(hotTable[c])
d = pd.DataFrame(d, index=[0]).T.reset_index()
d.sort_values(by=0, ascending=False)

Unnamed: 0,index,0
28,isstem,1.0
3,aveknow,0.310649
2,avecorrect,0.302631
1,avecarelessness,0.300856
42,timegreater10secandnextactionright,0.113927
9,averesofftask,0.112886
12,correct,0.083227
4,averesbored,0.079366
32,original,0.060055
44,timeover80,0.042695


In [220]:
# filtering for the necessary col
tmp = actions[['itest_id', 'skill', 'problemid','correct']]
# covnverting sk# create a table of correctness by question (from actions_table to problems_table)ills into cat type
tmp['skill'] = tmp.skill.astype('category').cat.codes
tmp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,itest_id,skill,problemid,correct
1056,35,56,104051249,0
1057,35,56,104051250,1
1058,35,56,104051251,1
1059,35,56,104051252,1
1060,35,56,104051253,1


In [41]:
# compare the prev. model vs. the one without merging actions to problems
# use Knowledge, Carelessness, Correctness
