In [1]:
import pandas as pd

# Load datasets into DataFrames
df1 = pd.read_csv('ca1-dataset.csv')
df2 = pd.read_csv('ca2-dataset.csv')

# Check column names
print(df1.columns)
print(df2.columns)

# Check a few rows
print(df1.head())
print(df2.head())

df1.columns = df1.columns.str.strip()
df2.columns = df2.columns.str.strip()

# Merge datasets based on UniqueID
df_merged = pd.merge(df1, df2, on='Unique-id', how='inner')

df_merged.to_csv('merged_dataset.csv', index=False)

Index(['Unique-id', 'namea', 'OffTask', 'Avgright', 'Avgbug', 'Avghelp',
       'Avgchoice', 'Avgstring', 'Avgnumber', 'Avgpoint', 'Avgpchange',
       'Avgtime', 'AvgtimeSDnormed', 'Avgtimelast3SDnormed',
       'Avgtimelast5SDnormed', 'Avgnotright', 'Avghowmanywrong-up',
       'Avghelppct-up', 'Avgwrongpct-up', 'Avgtimeperact-up',
       'AvgPrev3Count-up', 'AvgPrev5Count-up', 'Avgrecent8help',
       'Avg recent5wrong', 'Avgmanywrong-up', 'AvgasymptoteA-up',
       'AvgasymptoteB-up'],
      dtype='object')
Index(['Row', 'lesson', 'namea', 'prod', 'cell', 'right', 'bug', 'help',
       'choice', 'string', 'number', 'point', 'pknow-1', 'Pknow-2', 'pchange',
       'time', 'timeSDnormed', 'timelast3SDnormed', 'timelast5SDnormed',
       'notright', 'howmanywrong-up', 'helppct-up', 'wrongpct-up',
       'timeperact-up', 'Prev3Count-up', 'Prev5Count-up', 'recent8help',
       ' recent5wrong', 'manywrong-up', 'asymptoteA-up', 'asymptoteB-up',
       'Behaviour', 'Coder', 'Unique-id'],
 

In [2]:
#Feature 1
#Average Time taken to complete the actions that the students got correct
#this is a great way to understand how long it took a student to understand and execute the questions they are getting correct
df_merged['avgTimeCorrectAction'] = df_merged['Avgtime'] / df_merged['Avgright']
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,Prev3Count-up,Prev5Count-up,recent8help,recent5wrong,manywrong-up,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,ON TASK,awagner,12.0
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,ON TASK,awagner,12.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,ON TASK,awagner,7.5
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,ON TASK,awagner,7.5
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,ON TASK,awagner,25.333333


In [3]:
#Feature 2
#The percent of recent actions that students have gotten wrong
#Can help us understand how much students are answering recent questions incorrectly to understand better how to help them going forward
df_merged['percentageRecent5Wrong'] = df_merged['Avg recent5wrong'] / 5
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,Prev5Count-up,recent8help,recent5wrong,manywrong-up,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,ON TASK,awagner,12.0,0.2
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,ON TASK,awagner,12.0,0.2
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,ON TASK,awagner,7.5,0.0
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,ON TASK,awagner,7.5,0.0
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,ON TASK,awagner,25.333333,0.2


In [4]:
#Feature 3
#Change in average time per action.
#helps understand how long the time the student is spending on these actions is changing to help us better understand if students are getting faster or slower
df_merged['timePerActionChange'] = df_merged['Avgtime'] - df_merged['Avgtimeperact-up']
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,recent8help,recent5wrong,manywrong-up,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,1,0,0,0,ON TASK,awagner,12.0,0.2,0.0
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,1,0,0,0,ON TASK,awagner,12.0,0.2,0.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,ON TASK,awagner,7.5,0.0,-1.25
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,ON TASK,awagner,7.5,0.0,-1.25
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,1,0,0,0,ON TASK,awagner,25.333333,0.2,-0.75


In [5]:
#Feature 4
#score based on correctness, time, and standard deviations. 
#Helps us better understand if the actions have been efficient
df_merged['efficiencyScore'] = (df_merged['Avgright'] * df_merged['AvgtimeSDnormed']) / (df_merged['Avgbug'] + df_merged['Avgpchange'])
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,recent5wrong,manywrong-up,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,1,0,0,0,ON TASK,awagner,12.0,0.2,0.0,inf
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,1,0,0,0,ON TASK,awagner,12.0,0.2,0.0,inf
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,1,0,0,0,ON TASK,awagner,25.333333,0.2,-0.75,inf


In [6]:
#Feature 5
#Change in knowledge or proficiency from the previous time point.
#Can help us understand if students are growing, stagnating, or having more difficulty in the actions
df_merged['changeInKnowledge'] = df_merged['Pknow-2'] - df_merged['pknow-1'].shift(1)
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,manywrong-up,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,0,ON TASK,awagner,12.0,0.2,0.0,inf,
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,0,ON TASK,awagner,12.0,0.2,0.0,inf,0.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,0.100178
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,-0.100178
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,0,0,ON TASK,awagner,25.333333,0.2,-0.75,inf,0.091419


In [7]:
#Feature 6
#Difference between the average correctness of bug-related actions and right actions.
#can show patterns in students' abilities to fix their actions versus getting them correctly on the first try
df_merged['avgBugVsAvgRight'] = df_merged['Avgbug'] - df_merged['Avgright']
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,asymptoteA-up,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge,avgBugVsAvgRight
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,ON TASK,awagner,12.0,0.2,0.0,inf,,-1.0
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,0,ON TASK,awagner,12.0,0.2,0.0,inf,0.0,-1.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,0.100178,-1.0
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,-0.100178,-1.0
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,0,ON TASK,awagner,25.333333,0.2,-0.75,inf,0.091419,-1.0


In [8]:
#Feature 7
#rate at which knowledge or proficiency is changing over time.
#this can help us evaluate if students' knowledge is moving in the positive direction to indicate growth in understanding
df_merged['knowledgeGrowthRate'] = df_merged['Pknow-2'].diff()
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,asymptoteB-up,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge,avgBugVsAvgRight,knowledgeGrowthRate
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,ON TASK,awagner,12.0,0.2,0.0,inf,,-1.0,
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,0,ON TASK,awagner,12.0,0.2,0.0,inf,0.0,-1.0,0.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,0.100178,-1.0,0.100178
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,0,ON TASK,awagner,7.5,0.0,-1.25,-inf,-0.100178,-1.0,-0.100178
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,0,ON TASK,awagner,25.333333,0.2,-0.75,inf,0.091419,-1.0,0.091419


In [9]:
#Feature 8
#average change in time per action compared to the previous action
#helps us understand if students are getting faster or slower during the actions
df_merged['totalWrongActions'] = df_merged['Avgwrongpct-up'] * df_merged['Avgmanywrong-up']
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,Behaviour,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge,avgBugVsAvgRight,knowledgeGrowthRate,totalWrongActions
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,ON TASK,awagner,12.0,0.2,0.0,inf,,-1.0,,0.0
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,ON TASK,awagner,12.0,0.2,0.0,inf,0.0,-1.0,0.0,0.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,ON TASK,awagner,7.5,0.0,-1.25,-inf,0.100178,-1.0,0.100178,0.0
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,ON TASK,awagner,7.5,0.0,-1.25,-inf,-0.100178,-1.0,-0.100178,0.0
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,ON TASK,awagner,25.333333,0.2,-0.75,inf,0.091419,-1.0,0.091419,0.0


In [10]:
#Feature 9
#average duration of all actions, including right and wrong actions
#can help with understanding if the amount of time a student takes to complete an action is related to their understanding of the task and if they need more guidance. Alos helps with understanding the efficiency of their solution processes.
df_merged['avgActionDuration'] = df_merged[['Avgtime', 'AvgtimeSDnormed']].mean(axis=1)
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,Coder,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge,avgBugVsAvgRight,knowledgeGrowthRate,totalWrongActions,avgActionDuration
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,awagner,12.0,0.2,0.0,inf,,-1.0,,0.0,6.223049
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,awagner,12.0,0.2,0.0,inf,0.0,-1.0,0.0,0.0,6.223049
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,awagner,7.5,0.0,-1.25,-inf,0.100178,-1.0,0.100178,0.0,3.619522
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,awagner,7.5,0.0,-1.25,-inf,-0.100178,-1.0,-0.100178,0.0,3.619522
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,awagner,25.333333,0.2,-0.75,inf,0.091419,-1.0,0.091419,0.0,12.817454


In [11]:
#Feature 10
#what is the change in recent correctness to better understand if students are gradually getting more questions correctly based on the help
df_merged['recentCorrectnessChange'] = (df_merged['Avgrecent8help'] - df_merged['Avg recent5wrong']).diff()
df_merged.head()

Unnamed: 0,Unique-id,namea_x,OffTask,Avgright,Avgbug,Avghelp,Avgchoice,Avgstring,Avgnumber,Avgpoint,...,avgTimeCorrectAction,percentageRecent5Wrong,timePerActionChange,efficiencyScore,changeInKnowledge,avgBugVsAvgRight,knowledgeGrowthRate,totalWrongActions,avgActionDuration,recentCorrectnessChange
0,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,12.0,0.2,0.0,inf,,-1.0,,0.0,6.223049,
1,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZgy46jl,N,1.0,0.0,0,0,0,0,0,...,12.0,0.2,0.0,inf,0.0,-1.0,0.0,0.0,6.223049,0.0
2,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,7.5,0.0,-1.25,-inf,0.100178,-1.0,0.100178,0.0,3.619522,1.0
3,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ77be0l,N,1.0,0.0,0,0,0,0,0,...,7.5,0.0,-1.25,-inf,-0.100178,-1.0,-0.100178,0.0,3.619522,0.0
4,awagner-closeloop-ins_h1zaz4-03.30.2011_at_13:...,stuZ5lp7k7,N,1.0,0.0,0,0,0,0,0,...,25.333333,0.2,-0.75,inf,0.091419,-1.0,0.091419,0.0,12.817454,-1.0


In [12]:
df_merged.to_csv('merged_dataset_with_new_features.csv', index=False)

In [13]:
#Correlation Matrix
import pandas as pd

# Load your data
data = pd.read_csv("merged_dataset_with_new_features.csv")

selected_columns = ['avgTimeCorrectAction', 'percentageRecent5Wrong', 'timePerActionChange', 'efficiencyScore', 'changeInKnowledge', 
                    'avgBugVsAvgRight','knowledgeGrowthRate', 'totalWrongActions', 'avgActionDuration', 'recentCorrectnessChange']

# Creating a correlation matrix
correlation_matrix = df_merged[selected_columns].corr()


# Displaying the correlation matrix
print(correlation_matrix)

                         avgTimeCorrectAction  percentageRecent5Wrong  \
avgTimeCorrectAction                 1.000000                0.183175   
percentageRecent5Wrong               0.183175                1.000000   
timePerActionChange                  0.501297                0.047954   
efficiencyScore                      0.657994                0.086458   
changeInKnowledge                    0.025678                0.246711   
avgBugVsAvgRight                     0.401175                0.478361   
knowledgeGrowthRate                 -0.057908               -0.042008   
totalWrongActions                    0.148279                0.464405   
avgActionDuration                    0.841789                0.056887   
recentCorrectnessChange             -0.071465               -0.470678   

                         timePerActionChange  efficiencyScore  \
avgTimeCorrectAction                0.501297         0.657994   
percentageRecent5Wrong              0.047954         0.086458   
t

In [14]:
#Merging Original Dataset with New Features
import pandas as pd

# Selecting the features to be moved to a new spreadsheet
selected_columns_to_move = ["Unique-id", "namea_x", "OffTask", "Avgright", "Avgbug", "Avghelp", "Avgchoice", "Avgstring", 
                    "Avgnumber", "Avgpoint", "Avgpchange", "Avgtime", "AvgtimeSDnormed", 
                    "Avgtimelast3SDnormed", "Avgtimelast5SDnormed", "Avgnotright", "Avghowmanywrong-up", 
                    "Avghelppct-up", "Avgwrongpct-up", "Avgtimeperact-up", "AvgPrev3Count-up", "AvgPrev5Count-up", 
                    "Avgrecent8help", "Avg recent5wrong", "Avgmanywrong-up", "AvgasymptoteA-up", "AvgasymptoteB-up",
                    'avgTimeCorrectAction', 'percentageRecent5Wrong', 'timePerActionChange', 'efficiencyScore', 'changeInKnowledge', 
                    'avgBugVsAvgRight', 'totalWrongActions', 'avgActionDuration', 'recentCorrectnessChange',
                    'help', 'Pknow-2', 'pknow-1', 'time']

# Creating a new DataFrame with the selected columns
selected_features_df = df_merged[selected_columns_to_move]

# Specifying the path for the new spreadsheet
selected_features_df.to_csv('original_dataset_with_new_features.csv', index=False)


In [15]:
#Baseline for Random Forest Classifier w/student-level cross-validation
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold

# Load the data
data = pd.read_csv("ca1-dataset.csv")

# Encode the target variable
le = LabelEncoder()
data['OffTask'] = data['OffTask'].replace({'Y': 1, 'N': 0})
X = data.drop(columns=['OffTask', 'Unique-id', 'namea'], axis=1)
y = data.OffTask

X[np.isinf(X)] = np.nan

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define student groups for cross-validation
group_dict = {}
groups = np.array([])
for index, row in data.iterrows():
    student_id = row['namea']
    if student_id not in group_dict:
        group_dict[student_id] = index
    groups = np.append(groups, group_dict[student_id])

# Initialize the GroupKFold splitter
gkf = GroupKFold(n_splits=10)

# Initialize and train the classifier within the cross-validation loop
kappa_values = []
for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Initialize and train the classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_values.append(kappa)
    
accuracy = accuracy_score(y_test, y_pred)
classification_report_values = classification_report(y_test, y_pred)
kappa_score = cohen_kappa_score(y_test, y_pred)
f1_value = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
    
average_kappa = sum(kappa_values) / len(kappa_values)
print("Average RFC Cohen's Kappa:", average_kappa)
print("RFC Accuracy:", accuracy)
print("RFC F1 Score", f1_value)
print("RFC Precision", precision)

Average RFC Cohen's Kappa: 0.16832338133707997
RFC Accuracy: 0.9342105263157895
RFC F1 Score 0.2857142857142857
RFC Precision 1.0


In [16]:
#Cross-validated Random Tree Classifier w/ student-level cross-validation
import pandas as pd
import numpy as np 
from sklearn.model_selection import cross_validate, GroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, cohen_kappa_score, f1_score, precision_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# Load the data
data = pd.read_csv("original_dataset_with_new_features.csv")

# Encode the target variable
le = LabelEncoder()
data['OffTask'] = data['OffTask'].replace({'Y': 1, 'N': 0})
X = data.drop(columns=['OffTask', 'Unique-id', 'namea_x'], axis=1)
y = data.OffTask

X[np.isinf(X)] = np.nan

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Define student groups for cross-validation
group_list = []
for index, row in data.iterrows():
    student_id = row['namea_x']
    if student_id not in group_dict:
        group_dict[student_id] = index
    group_list.append(group_dict[student_id])

groups = np.array(group_list)

# Initialize the GroupKFold splitter
gkf = GroupKFold(n_splits=5) 

# Initialize the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define a custom scorer for cross-validation based on cohen_kappa_score
kappa_scorer = make_scorer(cohen_kappa_score)

# Perform Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(clf, param_grid, cv=gkf.split(X, y, groups=groups), scoring=kappa_scorer)
grid_search.fit(X, y)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator from the grid search for cross-validation
best_clf = grid_search.best_estimator_

# Perform cross-validation using cross_validate
cv_results = cross_validate(best_clf, X, y, cv=gkf.split(X, y, groups=groups),
                            scoring={'kappa': kappa_scorer, 'accuracy': 'accuracy', 'f1': 'f1', 'precision': 'precision'},
                            return_train_score=False)

# Print cross-validation results
print("RFC Cross-validated Cohen's Kappa:", np.mean(cv_results['test_kappa']))
print("RFC Cross-validated Accuracy:", np.mean(cv_results['test_accuracy']))
print("RFC Cross-validated F1 Score:", np.mean(cv_results['test_f1']))
print("RFC Cross-validated Precision:", np.mean(cv_results['test_precision']))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
RFC Cross-validated Cohen's Kappa: 0.24667528609645437
RFC Cross-validated Accuracy: 0.980514748788855
RFC Cross-validated F1 Score: 0.2524808524808525
RFC Cross-validated Precision: 0.7


In [17]:
#Baseline for Decision Forest Classifier w/student-level cross-validation
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv("ca1-dataset.csv")

# Encode the target variable
le = LabelEncoder()
data['OffTask'] = data['OffTask'].replace({'Y': 1, 'N': 0})
X = data.drop(columns=['OffTask', 'Unique-id', 'namea'], axis=1)
y = data.OffTask

X[np.isinf(X)] = np.nan

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Define student groups for cross-validation
group_dict = {}
groups = np.array([])
for index, row in data.iterrows():
    student_id = row['namea']
    if student_id not in group_dict:
        group_dict[student_id] = index
    groups = np.append(groups, group_dict[student_id])

# Initialize the GroupKFold splitter
gkf = GroupKFold(n_splits=10)

# Initialize and train the classifier within the cross-validation loop
kappa_values = []
for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Initialize and train the classifier
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_values.append(kappa)

# Evaluate the model
average_kappa = sum(kappa_values) / len(kappa_values)
accuracy = accuracy_score(y_test, y_pred)
classification_report_values = classification_report(y_test, y_pred)
f1_value = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print("DTC Cohen's Kappa:", average_kappa)
print("DTC Accuracy:", accuracy)
print("DTC F1 Score", f1_value)
print("DTC Precision", precision)

DTC Cohen's Kappa: 0.17948479177734683
DTC Accuracy: 0.8552631578947368
DTC F1 Score 0.15384615384615383
DTC Precision 0.14285714285714285


In [18]:
#Baseline for Cross-validated Decision Forest Classifier w/student-level cross-validation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, cohen_kappa_score, f1_score, precision_score
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv("original_dataset_with_new_features.csv")

# Encode the target variable
le = LabelEncoder()
data['OffTask'] = le.fit_transform(data['OffTask'])

X = data.drop(columns=['OffTask', 'Unique-id', 'namea_x'], axis=1)
y = data['OffTask']

# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define student groups for cross-validation
group_dict = {}
group_list = []
for index, row in data.iterrows():
    student_id = row['namea_x']
    if student_id not in group_dict:
        group_dict[student_id] = index
    group_list.append(group_dict[student_id])

groups = np.array(group_list)

# Initialize the GroupKFold splitter
gkf = GroupKFold(n_splits=5)

# Initialize the classifier
clf = DecisionTreeClassifier(random_state=42)

# Perform Grid Search
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the kappa scorer
kappa_scorer = make_scorer(cohen_kappa_score)

grid_search = GridSearchCV(clf, param_grid, cv=gkf.split(X, y, groups=groups), scoring=kappa_scorer)
grid_search.fit(X, y)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator from the grid search for cross-validation
best_clf = grid_search.best_estimator_

# Perform cross-validation using cross_validate
cv_results = cross_validate(best_clf, X, y, cv=gkf.split(X, y, groups=groups),
                            scoring={'kappa': kappa_scorer, 'accuracy': 'accuracy', 'f1': 'f1', 'precision': 'precision'},
                            return_train_score=False)

# Print cross-validation results
print("DTC Cross-validated Cohen's Kappa:", np.mean(cv_results['test_kappa']))
print("DTC Cross-validated Accuracy:", np.mean(cv_results['test_accuracy']))
print("DTC Cross-validated F1 Score:", np.mean(cv_results['test_f1']))
print("DTC Cross-validated Precision:", np.mean(cv_results['test_precision']))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
DTC Cross-validated Cohen's Kappa: 0.2376223471127772
DTC Cross-validated Accuracy: 0.9656132081551604
DTC Cross-validated F1 Score: 0.2515262515262515
DTC Cross-validated Precision: 0.463859649122807
