# Preprocessing of the skill sets
After applying K-means (in skill_kmean.ipynb), here we prerocess the clustering even further and construct an id_map dataframe to be used later by our pipeline in the  model fitting and evaluation phases.

In [53]:
import numpy as np
from copy import deepcopy

def merge_skills(skill_sets):
    """Mereges skills from Number_repr and Number_line into one array without duplicates

    Args:
        skill_sets (dict): dictionary with skill names as keys and values of arrays of skill 

    Returns:
        _type_: _description_
    """
    skill_merge = deepcopy(skill_sets)
    # merge arrays of Number_repr and Number_line into one array without duplicates
    skill_merge['Number_line'] = np.unique(np.concatenate((skill_merge['Number_repr'], skill_merge['Number_line'])))
    del skill_merge['Number_repr']
    return skill_merge


We perform a merge on the Numner_line and Number_repre clusters to keep only one global skill under the name Number_line

In [54]:
import pickle
import pprint


#Read and pretty print content of pickle file skill_sets.pkl
with open('skill_sets.pkl', 'rb') as f:
    skill_sets = pickle.load(f)
    skill_sets = merge_skills(skill_sets)


pprint.pprint(skill_sets)

{'Addition': array(['0-1000 addition 3,1 tc', '0-1000 support addition 3,1 tc',
       '0-1000 support addition 3,2',
       '0-10 support addition/subtraction structured 1',
       '0-100 addition 2,2', '0-1000 support addition 3,3 tc',
       '0-20 support addition tc', '0-20 addition',
       '0-100 support addition 2,2 tc', '0-1000 addition 3,3',
       '0-20 addition tc', '0-1000 support addition 3,3',
       '0-100 addition 2,1 tc', '0-1000 addition 3,2 tc, hc',
       '0-100 support addition 2,2', '0-1000 addition 3,3 hc',
       '0-10 addition 1,1', '0-1000 support addition 3,2 tc',
       '0-100 support addition 2,1 tc', '0-100 addition 2,2 tc',
       '0-1000 addition 3,1', '0-1000 support addition 3,1 hc',
       '0-1000 addition 3,2 hc',
       '0-10 support addition/subtraction structured 2',
       '0-1000 support addition 3,2 hc', '0-20 addition 2,1',
       '0-1000 addition 3,1 hc', '0-1000 support addition 3,3 tc, hc',
       '0-1000 addition 3,3 tc', '0-1000 support a

In [55]:
#Read and pretty print content of pickle file skill_sets_ids.pkl
with open('skill_sets_ids.pkl', 'rb') as f:
    skill_sets_ids = pickle.load(f)
    skill_sets_ids = merge_skills(skill_sets_ids)

pprint.pprint(skill_sets_ids, compact=True)


{'Addition': [186, 184, 191, 21, 125, 213, 57, 63, 130, 216, 58, 212, 121, 205,
              123, 218, 26, 193, 119, 132, 185, 192, 204, 22, 194, 54, 201, 215,
              217, 195, 202, 23, 183, 59, 214, 116, 219, 53, 114, 203],
 'Division': [162, 242, 249, 156, 87, 85, 241, 88, 245, 158, 247, 157, 155, 90,
              246, 243, 248, 91, 83, 160, 240, 251, 163, 159, 244, 86, 84, 161,
              82, 89],
 'Multiplication': [148, 141, 149, 236, 140, 78, 76, 238, 73, 77, 144, 237, 153,
                    150, 75, 139, 143, 142, 152, 80, 235, 74, 154, 239, 147, 79,
                    146, 233, 151, 250, 232, 138, 234, 81, 145],
 'Number_line': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  25,  28,  29,  30,  31,
        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
        45,  46,  47,  48,  49,  50,  51,  52,  67,  68,  69,  70,  71,
        72,  92,  93,  94,  95,  96,  97,  98,  99, 1

After manual checking, We also note that the clustering of skills is not perfect (hence the wish for the word embedding as a further extension),
Multiplicative skills were misclassified as Number_line, we migrate those to the adequate Multiplication skilset.
The misclassified skills are as portrayed by the next cell.

| Range | Name                      | ID   |
|-------|---------------------------|------|
| 0-100 | Multiplicative Operation  | 44   |
| 0-1000| Multiplicative Operation  | 100  |
| 0-20  | Multiplicative Operation  | 172  |


In [56]:
# Check if string values of key 'Number_line' of  skill_sets contain "multiplicative"
d_multi = {}
for i, skill in enumerate(skill_sets['Number_line']):
    if "multiplicative" in skill:
        print(skill, skill_sets_ids['Number_line'][i]) 
        idx = skill_sets_ids['Number_line'][i]
        d_multi[idx] = skill

# Add values from d_multi to skill_sets['Multiplication'] and Add keys from d_multi to skill_sets_ids['Multiplication']
skill_sets['Multiplication'] = np.unique(np.append(skill_sets['Multiplication'], list(d_multi.values())))
skill_sets_ids['Multiplication'] += list(d_multi.keys())



0-100 multiplicative operation 44
0-1000 multiplicative operation 100
0-20 multiplicative operation 172


In [57]:
#Print skill_sets['Multiplication'] and skill_sets_ids['Multiplication']
pprint.pprint(skill_sets['Multiplication'])
pprint.pprint(skill_sets_ids['Multiplication'], compact=True)

array(['0-100 multiplication calculate',
       '0-100 multiplication calculate 10x10',
       '0-100 multiplication table 1', '0-100 multiplication table 10',
       '0-100 multiplication table 2', '0-100 multiplication table 3',
       '0-100 multiplication table 3,4,6,7,8,9',
       '0-100 multiplication table 4', '0-100 multiplication table 5',
       '0-100 multiplication table 6', '0-100 multiplication table 7',
       '0-100 multiplication table 8', '0-100 multiplication table 9',
       '0-100 multiplication tables 1,2,5,10',
       '0-100 multiplicative operation', '0-100 notation multiplication',
       '0-100 presupport multiplication', '0-100 support multiplication',
       '0-1000 free multiplication calculation',
       '0-1000 multiplication calculate',
       '0-1000 multiplication calculate 15x15',
       '0-1000 multiplication table 10',
       '0-1000 multiplication table 10,11,12,15,20',
       '0-1000 multiplication table 11', '0-1000 multiplication table 12',
    

In [58]:
# Remove values from d_multi to skill_sets['Number_line'] and remove keys from d_multi to skill_sets_ids['Number_line']
for id, skill in d_multi.items():
    skill_sets['Number_line'] = np.delete(skill_sets['Number_line'], np.where(skill_sets['Number_line'] == skill))

    # remove id from list skill_sets_ids['Number_line']
    skill_sets_ids['Number_line'] = list(filter(lambda a: a != id, skill_sets_ids['Number_line']))

    

pprint.pprint(skill_sets['Number_line'], compact=True)
pprint.pprint(skill_sets_ids['Number_line'], compact=True)

array(['0-10 arabic', '0-10 arabic -> concrete',
       '0-10 arabic -> numberline 1', '0-10 arabic -> numberline 2',
       '0-10 arabic -> structured', '0-10 complete to 10 arabic',
       '0-10 complete to 10 structured', '0-10 concrete',
       '0-10 concrete -> arabic', '0-10 counting', '0-10 larger a',
       '0-10 larger cv', '0-10 larger cva', '0-10 math facts',
       '0-10 numberline', '0-10 operation 1,1', '0-10 ordinal 1',
       '0-10 ordinal 2', '0-10 ordinal 3', '0-10 relative', '0-10 sets',
       '0-10 structured sets', '0-10 subitizing', '0-10 verbal',
       '0-10 verbal -> arabic', '0-10 verbal -> numberline',
       '0-100 arabic', '0-100 arabic -> concrete',
       '0-100 arabic -> numberline 1', '0-100 arabic -> numberline 2',
       '0-100 complete to 100 arabic', '0-100 concrete',
       '0-100 concrete -> arabic', '0-100 concrete -> numberline',
       '0-100 counting', '0-100 estimation', '0-100 larger a',
       '0-100 larger estimate', '0-100 numberline',
 

In [59]:
#Save skill_sets and skill_sets_ids as pickle files
with open('skill_sets_merged.pkl', 'wb') as f:
    pickle.dump(skill_sets, f)
    
#Save skill_sets and skill_sets_ids as pickle files
with open('skill_sets_ids_merged.pkl', 'wb') as f:
    pickle.dump(skill_sets_ids, f)

### In the folowing section we construct the skills data frame: id_map

In [28]:
import numpy as np
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=4, compact=True) 

In [47]:
skill_ids_map: dict = pd.read_pickle('skill_sets_ids_merged.pkl')
id_map = pd.read_json('data/04_calcularis_skill_map_labels.json')
len_ids = len(id_map.index)

#rename index of id_map as skill_id
id_map.index.names = ['skill_id']

#rename column 'labels' of id_map as 'label'
id_map.rename(columns={'labels': 'label'}, inplace=True)

# Add a new column to id_map with the values of keys from skill_ids_map for which skill_id of id_map is any values of skill_ids_map
for id in id_map.index:
    for skill_name, ids in skill_ids_map.items():
        if id in ids:
            range = id_map.loc[id, 'label'].split(' ')[0].split('-')[-1]
            number_range = 'R' + range
            id_map.loc[id, 'skill_name'] = skill_name
            id_map.loc[id, 'number_range'] = number_range
            break
    
    
pp.pprint(id_map)

# Assert no null names nd no missing ids
assert np.all(~id_map['skill_name'].isnull()) 
assert len(id_map) == len_ids


                                          label      skill_name number_range
skill_id                                                                    
0                                 0-10 Concrete     Number_line          R10
1                               0-10 Subitizing     Number_line          R10
10                               0-10 Larger CV     Number_line          R10
100       0-100 Numberline -> Estimation/Arabic  Multiplication         R100
101                                0-100 Arabic     Number_line         R100
...                                         ...             ...          ...
95                             0-100 Estimation     Number_line         R100
96                                 0-100 Verbal     Number_line         R100
97                     0-100 Concrete -> Arabic     Number_line         R100
98                       0-100 Verbal -> Arabic     Number_line         R100
99                     0-100 Arabic -> Concrete     Number_line         R100

In [48]:
# Sort id_map by skill_id
id_map.sort_index(inplace=True)

In [50]:
#save id_map as pickle file
id_map.to_pickle('data/id_map.pkl')

In [51]:
id_map

Unnamed: 0_level_0,label,skill_name,number_range
skill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0-10 Concrete,Number_line,R10
1,0-10 Subitizing,Number_line,R10
2,0-10 Verbal,Number_line,R10
3,0-10 Concrete -> Arabic,Number_line,R10
4,0-10 Verbal -> Arabic,Number_line,R10
...,...,...,...
248,0-1000 Free Division Calculation,Division,R1000
249,0-1000 Free Division by Multiplication Calcula...,Division,R1000
250,0-1000 Free Multiplication Calculation,Multiplication,R1000
251,0-1000 Free Multiplication/Division,Division,R1000
