In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import datetime
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import math
from numpy.linalg import svd

from sklearn.ensemble import RandomForestClassifier

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


### Loading the initial dataset

In [2]:
path = os.getcwd() + '/../dataset/anonymized_dataset_for_ADM2017/'
print(path + 'student_log_{}.csv'.format(1))
log1 = pd.read_csv(path + 'student_log_{}.csv'.format(1))
initial_col_order = [col.lower() for col in list(log1.columns)]

data = pd.DataFrame()
for i in range(1, 10):
    student_log = pd.read_csv(path + 'student_log_{}.csv'.format(i))
    student_log.columns = [col.lower() for col in student_log]
    data = pd.concat([data, student_log], ignore_index=True)

# reorder everything with the original order, where student id is in the first column
data = data[initial_col_order]
data.shape
data.head()

/Users/yeldosbalgabekov/Desktop/Books and Schools/UoSouthampton/Data Mining/DataMining/Yeldos_Richa/../dataset/anonymized_dataset_for_ADM2017/student_log_1.csv


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,confidence(off task),confidence(gaming),res_bored,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln
0,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.83871,0.008522,0.376427,0.320317,0.0,0.0,0.785585,0.000264,0.13,0.0611904
1,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.0611904,0.21351
2,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.6,0.047821,0.156027,0.995053,0.887452,0.0,0.468252,0.001483,0.116,0.0333058
3,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.116,0.0333058
4,8,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,...,0.204082,0.343996,0.156027,0.74452,0.0,0.0,0.108417,0.010665,0.0333058,0.118386


### Transforming the initial dataset and loading Training Set

In [3]:
def to_float(x):
    try:
        if x.isdigit():
            return float(x)
    except:
        if x == '.':
            return 0
        return x

def series_types(series):
    types = []
    for i in set(series):
        if not type(i) in types:
            types.append(type(i))
    return(types)


data['sy assistments usage'] = data['sy assistments usage'].astype('category')
data['skill'] = data['skill'].astype('category')
data['problemtype'] = data['problemtype'].astype('category')
data['ln-1'] = data['ln-1'].apply(lambda x: to_float(x))
data['ln'] = data['ln'].apply(lambda x: to_float(x))
# data.memory_usage(deep=True)
# data.info(memory_usage='deep')

training = pd.read_csv(path + 'training_label.csv').drop(['AveCorrect'], axis=1)
training.columns = [col.lower() for col in training.columns]
training.shape

# merged data set with isSTEM
df = data.merge(training, on='itest_id', how="left")
#df.info()


### Filtering for the actions of the "training" students

In [4]:
actions = df[df.isstem.notnull()]
actions['skill'] = actions.skill.astype('category')
print("training set size: ", training.shape[0], "actions for # of students: ", len(actions.itest_id.unique()))
print("sample size useless for training: ", training.shape[0] - len(actions.itest_id.unique()))

# merged training data set
stem = actions[actions.isstem == 1]
nonstem = actions[actions.isstem == 0]

actions.head()

training set size:  514 actions for # of students:  326
sample size useless for training:  188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln,schoolid,mcas,isstem
1056,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.262675,0.060808,0.0,0.889219,0.005797,0.271,0.077899,2.0,34.0,0.0
1057,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.696486,0.0,0.009561,0.108417,0.001483,0.077899,0.225856,2.0,34.0,0.0
1058,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.225856,0.483008,2.0,34.0,0.0
1059,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.483008,0.74529,2.0,34.0,0.0
1060,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.898073,0.0,0.009561,0.468252,0.001483,0.74529,0.900994,2.0,34.0,0.0


### Testing the cases
Analysing other features

In [5]:
actions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177960 entries, 1056 to 639897
Data columns (total 79 columns):
itest_id                                  177960 non-null int64
sy assistments usage                      177960 non-null category
aveknow                                   177960 non-null float64
avecarelessness                           177960 non-null float64
avecorrect                                177960 non-null float64
numactions                                177960 non-null int64
averesbored                               177960 non-null float64
averesengcon                              177960 non-null float64
averesconf                                177960 non-null float64
averesfrust                               177960 non-null float64
averesofftask                             177960 non-null float64
averesgaming                              177960 non-null float64
actionid                                  177960 non-null int64
skill                            

In [6]:
actions.head()

Unnamed: 0,itest_id,sy assistments usage,aveknow,avecarelessness,avecorrect,numactions,averesbored,averesengcon,averesconf,averesfrust,...,res_concentrating,res_confused,res_frustrated,res_offtask,res_gaming,ln-1,ln,schoolid,mcas,isstem
1056,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.262675,0.060808,0.0,0.889219,0.005797,0.271,0.077899,2.0,34.0,0.0
1057,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.696486,0.0,0.009561,0.108417,0.001483,0.077899,0.225856,2.0,34.0,0.0
1058,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.225856,0.483008,2.0,34.0,0.0
1059,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.580763,0.0,0.009561,0.108417,0.00394,0.483008,0.74529,2.0,34.0,0.0
1060,35,2004-2005,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,...,0.898073,0.0,0.009561,0.468252,0.001483,0.74529,0.900994,2.0,34.0,0.0


In [47]:
def threshold5(row):
    if row['total'] < 5:
        row['accuracy'] = 0
        return row
    row['accuracy'] = row['correct'] / row['total']
    return row

cols = ['itest_id', 'skill', 'correct']
truncated = actions[cols]
mapped_skills = dict(zip(truncated['skill'].astype('category').cat.codes.unique(), truncated['skill'].unique()))
mapped_skills
truncated['skill'] = truncated['skill'].astype('category').cat.codes
tmp = truncated
truncated.head()
truncated = truncated.groupby(['itest_id','skill'], as_index=False).count()
truncated.columns = ['itest_id', 'skill', 'total']
truncated['correct'] = tmp.groupby(['itest_id','skill'], as_index=False).sum().iloc[:, -1]
truncated = truncated.apply(lambda row: threshold5(row), axis=1)
truncated = truncated[['itest_id', 'skill', 'accuracy']]
truncated.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,itest_id,skill,accuracy
0,35.0,1.0,0.684211
1,35.0,3.0,0.0
2,35.0,7.0,0.2
3,35.0,13.0,0.315789
4,35.0,16.0,0.0


In [342]:
top_no = 5
def get_top5(row, top_no):
    sorted_row = row[1:-1].sort_values()
    return list(sorted_row.index[-top_no:])
    
pivoted = truncated.pivot(index='itest_id', columns='skill', values='accuracy')
pivoted = pivoted.fillna(0)
pivoted = pd.DataFrame(pivoted.to_records())
tmp = actions[['itest_id', 'isstem']].drop_duplicates()
pivoted = pivoted.merge(tmp, how="left", on="itest_id")
# pivoted = pivoted[pivoted.isstem==1]
pivoted['top5'] = pivoted.apply(lambda row: get_top5(row, top_no), axis=1)
pivoted = pivoted[['itest_id', 'top5', 'isstem']]
pivoted.head()

Unnamed: 0,itest_id,top5,isstem
0,35,"[1.0, 71.0, 54.0, 62.0, 24.0]",0.0
1,77,"[85.0, 26.0, 67.0, 95.0, 59.0]",0.0
2,126,"[3.0, 73.0, 38.0, 61.0, 54.0]",0.0
3,205,"[85.0, 73.0, 68.0, 64.0, 24.0]",0.0
4,283,"[62.0, 54.0, 18.0, 24.0, 95.0]",0.0


In [343]:
from mlxtend.frequent_patterns import association_rules

def add_label(row):
    el = 'stem' if row.isstem==1 else 'non_stem'
    res = list(row['top5'])
    res.append(el)
    return res

feed = pivoted
feed['with_label'] = pivoted.apply(lambda row: add_label(row), axis=1)
feed
g = feed.groupby('isstem', as_index=False)
g.size()
feed = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))

feed = list(feed['with_label'])
te = TransactionEncoder()
te_ary = te.fit(feed).transform(feed)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)

association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1.0),(non_stem),0.184932,0.5,0.116438,0.62963,1.259259,0.023973,1.35
1,(27.0),(non_stem),0.157534,0.5,0.10274,0.652174,1.304348,0.023973,1.4375
2,(54.0),(stem),0.178082,0.5,0.109589,0.615385,1.230769,0.020548,1.3
3,(58.0),(non_stem),0.205479,0.5,0.123288,0.6,1.2,0.020548,1.25


### For the team

In [413]:
pivoted2 = truncated.pivot(index='itest_id', columns='skill', values='accuracy')
pivoted2 = pivoted2.fillna(0)
pivoted2 = pd.DataFrame(pivoted2.to_records())
pivoted2 = pivoted2

for idx, table in pivoted2.iterrows():
    sorted_row = list(table)[1:]
    sorted_row.sort()
    threshold = sorted_row[-top_no]
    lst = [1 if (item > threshold and idx != 0) else 0 for idx, item in enumerate(list(table)[1:])]
    res = [list(table)[0]]
    res.extend(lst)
    print(res)
    break

[35.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
