In [71]:
import pandas as pd
import numpy as np
import math
import matplotlib as mlp
import matplotlib.pyplot as plt
import sklearn.linear_model as skm
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV
from sklearn.feature_selection import SelectFromModel

First, we read the training and test set.

In [72]:
X_test=pd.read_csv("sets/X_test.csv")
X_train=pd.read_csv("sets/X_train.csv")
y_test=pd.read_csv("sets/y_test.csv")['score']
y_train=pd.read_csv("sets/y_train.csv")['score']
print(X_test.columns)

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'date_registration', 'forumng_clicks', 'homepage_clicks',
       'oucontent_clicks', 'resource_clicks', 'subpage_clicks', 'url_clicks',
       'dataplus_clicks', 'glossary_clicks', 'oucollaborate_clicks',
       'quiz_clicks', 'ouelluminate_clicks', 'sharedsubpage_clicks',
       'questionnaire_clicks', 'page_clicks', 'externalquiz_clicks',
       'ouwiki_clicks', 'dualpane_clicks', 'folder_clicks',
       'repeatactivity_clicks', 'htmlactivity_clicks', 'assessment_type',
       'weight', 'submission_delay'],
      dtype='object')


We choose the amount of classes we wount to divide the target values into and reassigne the values in the target sets.

Let $c$ be the number of classes, each value $v$, belonging to the set $V$ that we want to modify, will be reassigned as follows:

$$
v=\left\lceil\frac{v-\min{V}}{\frac{\max{V}-\min{V}}{c}}\right\rceil
$$

In [73]:
classes=5

def classesAssignment(nClasses, set, maximum, minimum):
    """
    Modifies the set in input such that values will be rounded off to the corresponding class.
     Args:
     nClasses: the expected quantity of values
     set: the set that has to be modified
     maximum: the maximum possible value in the set
     minimum: the minimum possible value in the set
    """

    retSet=np.ceil((set-minimum)/((maximum-minimum)/nClasses))
    
    for i in range(len(retSet)) :
        if retSet[i]==0 :
            retSet[i]=1

    return retSet


maximum=max(y_test.max(),y_train.max())
minimum=min(y_test.min(),y_train.min())

y_test1=classesAssignment(classes,y_test,maximum,minimum)
y_train1=classesAssignment(classes,y_train,maximum,minimum)

Now we set up the model.

In [74]:
lrm = skm.LogisticRegression(multi_class='ovr',max_iter=10000,random_state=2)

print(X_train.columns)

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'date_registration', 'forumng_clicks', 'homepage_clicks',
       'oucontent_clicks', 'resource_clicks', 'subpage_clicks', 'url_clicks',
       'dataplus_clicks', 'glossary_clicks', 'oucollaborate_clicks',
       'quiz_clicks', 'ouelluminate_clicks', 'sharedsubpage_clicks',
       'questionnaire_clicks', 'page_clicks', 'externalquiz_clicks',
       'ouwiki_clicks', 'dualpane_clicks', 'folder_clicks',
       'repeatactivity_clicks', 'htmlactivity_clicks', 'assessment_type',
       'weight', 'submission_delay'],
      dtype='object')


And train it.

In [75]:
lrm.fit(X_train,y_train1)

We can then check the accuracy.

In [82]:
preds=lrm.predict(X_test)

check=preds==y_test1
for i in range(len(check)):
    if not check[i]:
        continue
        print (preds[i],y_test1[i])

def perf_measure(y_actual, y_hat):
   TP = 0
   FP = 0
   TN = 0
   FN = 0

   for i in range(len(y_hat)): 
      if y_actual[i]==y_hat[i]==1:
         TP += 1
      if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
         FP += 1
      if y_actual[i]==y_hat[i]==0:
        TN += 1
      if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
        FN += 1
   print(i)
   return(TP, FP, TN, FN)

accuracy=(sum(preds==y_test1))/len(preds)
print("Accuracy:", accuracy)
print(classification_report(y_test1, preds))

preds_train=lrm.predict(X_train)
check=preds_train==y_train1
accuracy = accuracy_score(y_train1, preds_train)
print("Accuracy training:", accuracy)
print(classification_report(y_train1, preds_train))

Accuracy: 0.5065487483934146
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.00       506
         2.0       0.00      0.00      0.00      1228
         3.0       0.30      0.01      0.02      4477
         4.0       0.45      0.54      0.49     12358
         5.0       0.56      0.69      0.62     14109

    accuracy                           0.51     32678
   macro avg       0.46      0.25      0.23     32678
weighted avg       0.47      0.51      0.46     32678

Accuracy training: 0.5055275459226221
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2033
         2.0       0.06      0.00      0.00      5107
         3.0       0.35      0.01      0.03     18013
         4.0       0.45      0.54      0.49     49182
         5.0       0.56      0.69      0.62     56374

    accuracy                           0.51    130709
   macro avg       0.28      0.25      0.23    130709
weighted a

In [98]:
print(confusion_matrix(y_test1, preds))
#coefficients=lrm.coef_
sfm = SelectFromModel(lrm, threshold=1e-0)
sfm.fit(X_train, y_train1)

# Selected features
selected_features = X_train.columns[sfm.get_support()]

# Display selected features
print("Selected Features:", selected_features)

[[   1    0    2  365  138]
 [   0    0   27  768  433]
 [   0    0   53 2765 1659]
 [   0    3   74 6694 5587]
 [   0    1   19 4284 9805]]
Selected Features: Index(['forumng_clicks', 'dataplus_clicks', 'questionnaire_clicks',
       'ouwiki_clicks', 'assessment_type', 'weight', 'submission_delay'],
      dtype='object')


In [68]:
features=[]

for feature in X_test.columns:
    features.append(feature)

featuresC=features.copy()

accMax=0
accMin=200

for feature in features:
    #print("E' IL TURNO DELLA FEATURE ",feature)
    featuresC.remove(feature)

    Xtest=X_test[featuresC]
    Xtrain=X_train[featuresC]
    lrm = skm.LogisticRegression(multi_class='multinomial',max_iter=10000,random_state=2)
    lrm.fit(Xtrain,y_train1)

    preds=lrm.predict(Xtest)

    check=preds==y_test1
    accuracy = accuracy_score(y_test1, preds)
   # print("Accuracy:", accuracy)
    #print(classification_report(y_test, preds))

    if accuracy>accMax:
        accMax=accuracy
        featureMax=feature
    if accuracy<accMin:
        accMin=accuracy
        featureMin=feature

    featuresC.append(feature)    

print("FeatureMax è: ",featureMax," con ",accMax)
print("FeatureMin è: ",featureMin," con ",accMin)

KeyboardInterrupt: 

Log for search of best set of features:

1:
    FeatureMax è:  forumng_clicks  con  0.31868535406083603  
    FeatureMax è:  assessment_type  con  0.30681192239427135

remove forumng_clicks

2:
    FeatureMax è:  studied_credits  con  0.3192667849929616  
    FeatureMax è:  assessment_type  con  0.30616928820613254    

remove studied_credits

3:
    FeatureMax è:  page_clicks  con  0.319787012669074   
    FeatureMax è:  assessment_type  con  0.3072709468143705

remove page_clicks

4:
    FeatureMax è:  dualpane_clicks  con  0.3201236305771467    
    FeatureMax è:  assessment_type  con  0.3078523777464961

remove dualpane_clicks

5:
    FeatureMax è:  num_of_prev_attempts  con  0.32021543546116654  
    FeatureMax è:  assessment_type  con  0.30806658914254237

remove num_of_prev_attempts

6:  
    FeatureMax è:  url_clicks  con  0.32021543546116654  
    FeatureMax è:  assessment_type  con  0.30837260542260847




ROUND 2

1:
    FeatureMax è:  dataplus_clicks  con  0.3179509149886774  
    FeatureMax è:  assessment_type  con  0.3084644103066283

2:
    FeatureMax è:  region  con  0.31807332150070383  
FeatureMin è:  assessment_type  con  0.30803598751453576

3:
    FeatureMax è:  dualpane_clicks  con  0.3181957280127303  
FeatureMin è:  assessment_type  con  0.30803598751453576

4:
    FeatureMax è:  disability  con  0.3182263296407369  
FeatureMin è:  assessment_type  con  0.30806658914254237

ROUND 3

1:FeatureMax è:  studied_credits  con  0.3115245731072893  
FeatureMin è:  assessment_type  con  0.3017014505171675

2:FeatureMax è:  submission_delay  con  0.3123508170634678  
FeatureMin è:  assessment_type  con  0.3034151416855377




Most of the runs give a result such as the following:

Accuracy is:  0.32027663871717976

              precision    recall  f1-score   support

         1.0       0.43      0.08      0.14       287
         2.0       0.00      0.00      0.00       219
         3.0       0.00      0.00      0.00       343
         4.0       0.14      0.00      0.00       885
         5.0       0.10      0.01      0.01      1368
         6.0       0.21      0.01      0.02      3109
         7.0       0.18      0.04      0.06      4508
         8.0       0.27      0.43      0.33      7850
         9.0       0.31      0.37      0.34      6808
        10.0       0.41      0.59      0.48      7301

    accuracy                           0.32     32678
   macro avg       0.21      0.15      0.14     32678
weighted avg       0.28      0.32      0.27     32678


This approach didn't work really well. We can see that received an evaluation below 70 almost never get identifies. This is probably due to the fact that these student have less representation, compared to student that have recevied a score of more than 70. 

Let's try another approach: divide the students in classe, not solely based on their scores, but also based on how many students received a similar score.



In [58]:

def fairClassesAssignment(nClasses, set, maximum, minimum):
    """
    Modifies the set in input such that values will be rounded off and divided in classes in a more
    distributed (and approcimately fair) fashion.
     Args:
     nClasses: the expected quantity of values
     set: the set that has to be modified
     maximum: the maximum possible value in the set
     minimum: the minimum possible value in the set
    """
    set += np.abs(minimum)
    retSet = np.zeros(len(set))

    setIndex = sorted(list(zip(set,range(len(set)))))

    binSize=len(set)/nClasses
    
    for i in range(len(set)):
        retSet[setIndex[i][1]]=min((i//binSize)+1,nClasses)

    return retSet


maximum=max(y_test.max(),y_train.max())
minimum=min(y_test.min(),y_train.min())

y_test2=fairClassesAssignment(classes,y_test,maximum,minimum)
y_train2=fairClassesAssignment(classes,y_train,maximum,minimum)

stuff1=np.array([y_test,y_test2])
stuff2=np.array([y_test,y_test2])
for i in range(1,classes+1):
    fstf1=stuff1[0, stuff1[1, :] == i]
    fstf2=stuff2[0, stuff2[1, :] == i]


Now that classes are balanced, let's try again logistic regression.

In [59]:
lrm = skm.LogisticRegression(multi_class='multinomial',max_iter=10000,random_state=2)

lrm.fit(X_train,y_train2)

preds=lrm.predict(X_test)

check=preds==y_test2
for i in range(len(check)):
    if not check[i]:
        continue
        print (preds[i],y_test2[i])
accuracy = accuracy_score(y_test2, preds)
print("Accuracy:", accuracy)
print(classification_report(y_test2, preds))

preds_train=lrm.predict(X_train)
check=preds_train==y_train2
accuracy = accuracy_score(y_train2, preds_train)
print("Accuracy training:", accuracy)
print(classification_report(y_train2, preds_train))

Accuracy: 0.3408409327376216
              precision    recall  f1-score   support

         1.0       0.34      0.46      0.39      6536
         2.0       0.28      0.26      0.27      6536
         3.0       0.24      0.06      0.10      6535
         4.0       0.34      0.36      0.35      6536
         5.0       0.40      0.56      0.47      6535

    accuracy                           0.34     32678
   macro avg       0.32      0.34      0.32     32678
weighted avg       0.32      0.34      0.32     32678

Accuracy training: 0.34232531807297123
              precision    recall  f1-score   support

         1.0       0.35      0.47      0.40     26142
         2.0       0.28      0.26      0.27     26142
         3.0       0.25      0.06      0.10     26142
         4.0       0.34      0.35      0.34     26142
         5.0       0.40      0.56      0.47     26141

    accuracy                           0.34    130709
   macro avg       0.32      0.34      0.32    130709
weighted 

TRAIN WITH INITIAL DISTRIBUTION GRID SEARCH
result: useless

In [61]:
solvers = ['newton-cg', 'lbfgs', 'saga','sag','liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# summarize results
for solver in solvers:
    for pen in penalty:
        for value in c_values:
            lrm = skm.LogisticRegression(multi_class='multinomial',max_iter=10000,random_state=2,solver=solver,
                                         penalty=pen,C=value)
            print("-------------------------------------------")
            lrm.fit(X_train,y_train1)
            preds=lrm.predict(X_test)
            check=preds==y_test1
            print("solver=",solver,"penalty=",pen,"value=",value)
            accuracy = accuracy_score(y_test1, preds)
            print("Accuracy:", accuracy)
            print(classification_report(y_test1, preds))

            preds_train=lrm.predict(X_train)
            check=preds_train==y_train1
            accuracy = accuracy_score(y_train1, preds_train)
            print("Accuracy training:", accuracy)
            print(classification_report(y_train1, preds_train))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    
    print("%f (%f) with: %r" % (mean, stdev, param))

-------------------------------------------
solver= newton-cg penalty= l2 value= 100
Accuracy: 0.5051716751331171
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.00       506
         2.0       0.11      0.00      0.00      1228
         3.0       0.33      0.02      0.03      4477
         4.0       0.45      0.56      0.50     12358
         5.0       0.56      0.67      0.61     14109

    accuracy                           0.51     32678
   macro avg       0.49      0.25      0.23     32678
weighted avg       0.48      0.51      0.46     32678

Accuracy training: 0.5042116457168213
              precision    recall  f1-score   support

         1.0       0.17      0.00      0.00      2033
         2.0       0.05      0.00      0.00      5107
         3.0       0.35      0.02      0.04     18013
         4.0       0.45      0.56      0.50     49182
         5.0       0.56      0.67      0.61     56374

    accuracy                     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2033
         2.0       0.06      0.00      0.00      5107
         3.0       0.35      0.02      0.03     18013
         4.0       0.45      0.56      0.50     49182
         5.0       0.56      0.67      0.61     56374

    accuracy                           0.50    130709
   macro avg       0.28      0.25      0.23    130709
weighted avg       0.46      0.50      0.46    130709

-------------------------------------------
solver= lbfgs penalty= l2 value= 100
Accuracy: 0.5051410735051105
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.00       506
         2.0       0.11      0.00      0.00      1228
         3.0       0.33      0.02      0.03      4477
         4.0       0.45      0.56      0.50     12358
         5.0       0.56      0.67      0.61     14109

    accuracy                           0.51     32678
   macro avg       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2033
         2.0       0.06      0.00      0.00      5107
         3.0       0.35      0.02      0.03     18013
         4.0       0.45      0.56      0.50     49182
         5.0       0.56      0.67      0.61     56374

    accuracy                           0.50    130709
   macro avg       0.28      0.25      0.23    130709
weighted avg       0.46      0.50      0.46    130709

-------------------------------------------
solver= saga penalty= l2 value= 100
Accuracy: 0.5051716751331171
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.00       506
         2.0       0.11      0.00      0.00      1228
         3.0       0.33      0.02      0.03      4477
         4.0       0.45      0.56      0.50     12358
         5.0       0.56      0.67      0.61     14109

    accuracy                           0.51     32678
   macro avg       0.4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2033
         2.0       0.06      0.00      0.00      5107
         3.0       0.35      0.02      0.03     18013
         4.0       0.45      0.56      0.50     49182
         5.0       0.56      0.67      0.61     56374

    accuracy                           0.50    130709
   macro avg       0.28      0.25      0.23    130709
weighted avg       0.46      0.50      0.46    130709

-------------------------------------------
solver= sag penalty= l2 value= 100
Accuracy: 0.5051716751331171
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.00       506
         2.0       0.11      0.00      0.00      1228
         3.0       0.33      0.02      0.03      4477
         4.0       0.45      0.56      0.50     12358
         5.0       0.56      0.67      0.61     14109

    accuracy                           0.51     32678
   macro avg       0.49

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2033
         2.0       0.06      0.00      0.00      5107
         3.0       0.35      0.02      0.03     18013
         4.0       0.45      0.56      0.50     49182
         5.0       0.56      0.67      0.61     56374

    accuracy                           0.50    130709
   macro avg       0.28      0.25      0.23    130709
weighted avg       0.46      0.50      0.46    130709

-------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Solver liblinear does not support a multinomial backend.

HYPOTHESIS FOR USEFUL FEATURES:

-assessment_type

-highest education

-weight













IDEAS

show a map of most influential features compared to grades