In [611]:
import pandas as pd
import numpy as np
import math
import matplotlib as mlp
import matplotlib.pyplot as plt
import sklearn.linear_model as skm
from sklearn.metrics import accuracy_score, classification_report

First, we read the training and test sets.

In [612]:
# Load the pre-split feature matrices and target vectors from the sets/ folder.
# The "Unnamed: 0" column is dropped from the feature files (presumably a saved
# row index -- TODO confirm); only the 'score' column is kept from the target files.
X_test = pd.read_csv("sets/X_test.csv").drop(columns="Unnamed: 0")
X_train = pd.read_csv("sets/X_train.csv").drop(columns="Unnamed: 0")
y_test = pd.read_csv("sets/y_test.csv")['score']
y_train = pd.read_csv("sets/y_train.csv")['score']

# Show which feature columns are available.
print(X_test.columns)

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'date_registration', 'forumng_clicks', 'homepage_clicks',
       'oucontent_clicks', 'resource_clicks', 'subpage_clicks', 'url_clicks',
       'dataplus_clicks', 'glossary_clicks', 'oucollaborate_clicks',
       'quiz_clicks', 'ouelluminate_clicks', 'sharedsubpage_clicks',
       'questionnaire_clicks', 'page_clicks', 'externalquiz_clicks',
       'ouwiki_clicks', 'dualpane_clicks', 'folder_clicks',
       'repeatactivity_clicks', 'htmlactivity_clicks', 'assessment_type',
       'weight', 'submission_delay'],
      dtype='object')


We choose the number of classes we want to divide the target values into and reassign the values in the target sets accordingly.

Let $c$ be the number of classes. Each value $v$ in the set $V$ that we want to modify is reassigned as follows (a result of 0, which occurs when $v$ equals the minimum, is replaced by 1 so that the classes run from 1 to $c$):

$$
v=\left\lceil\frac{v-\min{V}}{\frac{\max{V}-\min{V}}{c}}\right\rceil
$$

In [613]:
# Number of classes the target scores are divided into.
classes = 10

def classesAssignment(nClasses, set, maximum, minimum):
    """
    Bin continuous values into ``nClasses`` equal-width classes.

    Each value ``v`` is mapped to
    ``ceil((v - minimum) / ((maximum - minimum) / nClasses))``.
    A result of 0 (which happens when ``v == minimum``) is replaced by 1, so
    values inside ``[minimum, maximum]`` end up labelled ``1 .. nClasses``.

    Args:
        nClasses: the number of classes the range is divided into
        set: the values to bin (pandas Series or NumPy array); it is not
            modified. The name shadows the builtin ``set`` and is kept only
            for backward compatibility with existing callers.
        maximum: the maximum possible value in the set
        minimum: the minimum possible value in the set

    Returns:
        A new object of the same kind as ``set`` (Series in, Series out with
        the same index; array in, array out) holding the class labels as floats.

    Raises:
        ValueError: if ``nClasses`` is not positive or if ``maximum`` equals
            ``minimum`` (the class width would be zero).
    """
    if nClasses <= 0:
        raise ValueError("nClasses must be a positive number")
    if maximum == minimum:
        raise ValueError("maximum and minimum must differ to define class widths")

    classWidth = (maximum - minimum) / nClasses
    retSet = np.ceil((set - minimum) / classWidth)

    # Boolean-mask assignment instead of an index loop: `retSet[i]` is a
    # label lookup on a Series and fails whenever the index is not 0..n-1
    # (e.g. after filtering or a shuffled train/test split).
    retSet[retSet == 0] = 1

    return retSet


# Use one score range shared by both splits, so a given raw score is mapped to
# the same class whether it appears in the training or in the test targets.
maximum = max(y_test.max(), y_train.max())
minimum = min(y_test.min(), y_train.min())

# Replace the raw scores by their class labels.
# NOTE: y_test / y_train are overwritten, so re-running this cell bins the
# already-binned labels again; reload the targets before re-running it.
y_test = classesAssignment(
    nClasses=classes, set=y_test, maximum=maximum, minimum=minimum
)
y_train = classesAssignment(
    nClasses=classes, set=y_train, maximum=maximum, minimum=minimum
)

Now we set up the model.

In [614]:
# Multinomial logistic regression; random_state is fixed so runs are repeatable.
lrm = skm.LogisticRegression(multi_class='multinomial',max_iter=10000,random_state=2)

print(X_train.columns)

# Columns fed to the model, defined once so the train and test frames cannot
# drift apart. Edit this list to experiment with feature subsets.
selectedFeatures = [
    'gender', 'region', 'highest_education', 'imd_band', 'age_band',
    'num_of_prev_attempts', 'studied_credits', 'disability',
    'date_registration', 'forumng_clicks', 'homepage_clicks',
    'oucontent_clicks', 'resource_clicks', 'subpage_clicks', 'url_clicks',
    'dataplus_clicks', 'glossary_clicks', 'oucollaborate_clicks',
    'quiz_clicks', 'ouelluminate_clicks', 'sharedsubpage_clicks',
    'questionnaire_clicks', 'page_clicks', 'externalquiz_clicks',
    'ouwiki_clicks', 'dualpane_clicks', 'folder_clicks',
    'repeatactivity_clicks', 'htmlactivity_clicks', 'assessment_type',
    'weight', 'submission_delay',
]

X_train = X_train[selectedFeatures]
X_test = X_test[selectedFeatures]

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'date_registration', 'forumng_clicks', 'homepage_clicks',
       'oucontent_clicks', 'resource_clicks', 'subpage_clicks', 'url_clicks',
       'dataplus_clicks', 'glossary_clicks', 'oucollaborate_clicks',
       'quiz_clicks', 'ouelluminate_clicks', 'sharedsubpage_clicks',
       'questionnaire_clicks', 'page_clicks', 'externalquiz_clicks',
       'ouwiki_clicks', 'dualpane_clicks', 'folder_clicks',
       'repeatactivity_clicks', 'htmlactivity_clicks', 'assessment_type',
       'weight', 'submission_delay'],
      dtype='object')


And train it.

In [615]:
# Fit the logistic regression on the training features and training targets.
lrm.fit(X_train,y_train)

We can then check the accuracy.

In [616]:
preds = lrm.predict(X_test)

# Hand-computed share of correct predictions, kept as a sanity check for the
# scikit-learn value printed right after it.
accuracy = (preds == y_test).mean()
print("Accuracy is: ",accuracy)

# Use the `accuracy_score` name imported at the top of the notebook: the bare
# name `sklearn` is never bound by those imports, so `sklearn.metrics...`
# raises NameError on a fresh kernel.
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1.
print(classification_report(y_test, preds))

Accuracy is:  0.29233735234714486
Accuracy: 0.29233735234714486
              precision    recall  f1-score   support

         1.0       0.40      0.01      0.01       287
         2.0       0.00      0.00      0.00       219
         3.0       0.00      0.00      0.00       343
         4.0       0.00      0.00      0.00       885
         5.0       0.00      0.00      0.00      1368
         6.0       0.11      0.00      0.00      3109
         7.0       0.17      0.00      0.01      4508
         8.0       0.25      0.56      0.35      7850
         9.0       0.32      0.33      0.32      6808
        10.0       0.35      0.40      0.37      7301

    accuracy                           0.29     32678
   macro avg       0.16      0.13      0.11     32678
weighted avg       0.24      0.29      0.24     32678



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [617]:
# Leave-one-feature-out ablation: for every feature, retrain the model without
# it and record the test accuracy.
#   featureMax -> feature whose removal gives the HIGHEST accuracy
#   featureMin -> feature whose removal gives the LOWEST accuracy
features = list(X_test.columns)

accMax = float("-inf")
accMin = float("inf")
featureMax = None
featureMin = None

for feature in features:
    print("E' IL TURNO DELLA FEATURE ",feature)

    # Keep the remaining columns in their original order on every iteration.
    remaining = [name for name in features if name != feature]
    Xtest = X_test[remaining]
    Xtrain = X_train[remaining]

    # Separate names so the model and predictions of the evaluation cell are
    # not silently overwritten by the last ablation run.
    ablationModel = skm.LogisticRegression(multi_class='multinomial',max_iter=10000,random_state=2)
    ablationModel.fit(Xtrain, y_train)
    ablationPreds = ablationModel.predict(Xtest)

    # Hand-computed accuracy as a sanity check for the scikit-learn value.
    accuracy = (ablationPreds == y_test).mean()
    print("Accuracy is: ",accuracy)

    # `accuracy_score` is the name imported at the top of the notebook; the
    # bare name `sklearn` is never bound there.
    accuracy = accuracy_score(y_test, ablationPreds)
    print("Accuracy:", accuracy)

    # Strict comparisons: on ties the first feature encountered is kept.
    if accuracy > accMax:
        accMax = accuracy
        featureMax = feature
    if accuracy < accMin:
        accMin = accuracy
        featureMin = feature

print("FeatureMax è: ",featureMax," con ",accMax)
print("FeatureMin è: ",featureMin," con ",accMin)

E' IL TURNO DELLA FEATURE  gender
Accuracy is:  0.29258216537119774
Accuracy: 0.29258216537119774
E' IL TURNO DELLA FEATURE  region
Accuracy is:  0.2930717914193035
Accuracy: 0.2930717914193035
E' IL TURNO DELLA FEATURE  highest_education
Accuracy is:  0.2939286370034886
Accuracy: 0.2939286370034886
E' IL TURNO DELLA FEATURE  imd_band
Accuracy is:  0.29291878327927046
Accuracy: 0.29291878327927046
E' IL TURNO DELLA FEATURE  age_band
Accuracy is:  0.2912968969949201
Accuracy: 0.2912968969949201
E' IL TURNO DELLA FEATURE  num_of_prev_attempts
Accuracy is:  0.29169471815900605
Accuracy: 0.29169471815900605
E' IL TURNO DELLA FEATURE  studied_credits
Accuracy is:  0.2918171246710325
Accuracy: 0.2918171246710325
E' IL TURNO DELLA FEATURE  disability
Accuracy is:  0.29264336862721096
Accuracy: 0.29264336862721096
E' IL TURNO DELLA FEATURE  date_registration
Accuracy is:  0.29175592141501927
Accuracy: 0.29175592141501927
E' IL TURNO DELLA FEATURE  forumng_clicks
Accuracy is:  0.292031336067078

Log for search of best set of features:

1:
    FeatureMax è:  forumng_clicks  con  0.31868535406083603  
    FeatureMin è:  assessment_type  con  0.30681192239427135

remove forumng_clicks

2:
    FeatureMax è:  studied_credits  con  0.3192667849929616  
    FeatureMin è:  assessment_type  con  0.30616928820613254

remove studied_credits

3:
    FeatureMax è:  page_clicks  con  0.319787012669074   
    FeatureMin è:  assessment_type  con  0.3072709468143705

remove page_clicks

4:
    FeatureMax è:  dualpane_clicks  con  0.3201236305771467    
    FeatureMin è:  assessment_type  con  0.3078523777464961

remove dualpane_clicks

5:
    FeatureMax è:  num_of_prev_attempts  con  0.32021543546116654  
    FeatureMin è:  assessment_type  con  0.30806658914254237

remove num_of_prev_attempts

6:  
    FeatureMax è:  url_clicks  con  0.32021543546116654  
    FeatureMin è:  assessment_type  con  0.30837260542260847




ROUND 2

1:
    FeatureMax è:  dataplus_clicks  con  0.3179509149886774  
    FeatureMin è:  assessment_type  con  0.3084644103066283

2:
    FeatureMax è:  region  con  0.31807332150070383  
FeatureMin è:  assessment_type  con  0.30803598751453576

3:
    FeatureMax è:  dualpane_clicks  con  0.3181957280127303  
FeatureMin è:  assessment_type  con  0.30803598751453576

4:
    FeatureMax è:  disability  con  0.3182263296407369  
FeatureMin è:  assessment_type  con  0.30806658914254237

ROUND 3

1:FeatureMax è:  studied_credits  con  0.3115245731072893  
FeatureMin è:  assessment_type  con  0.3017014505171675

2:FeatureMax è:  submission_delay  con  0.3123508170634678  
FeatureMin è:  assessment_type  con  0.3034151416855377



HYPOTHESIS FOR USEFUL FEATURES:

-assessment_type

-highest_education

-weight













IDEAS

show a map of most influential features compared to grades