In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [5]:
# Load the raw student-performance table and show it as the cell output.
raw_csv_path = 'dataset/students_performance.csv'
df = pd.read_csv(raw_csv_path)
df


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [6]:
# Print every column label of the loaded frame, one per line.
for column_label in df.columns:
    print(column_label)


gender
race/ethnicity
parental level of education
lunch
test preparation course
math score
reading score
writing score


In [7]:
# Mean of each score column, keyed by column name (same key order as before).
score_columns = ('reading score', 'writing score', 'math score')
scores_avg = {column: df[column].mean() for column in score_columns}

# Report the computed means, one line per score column.
for column, mean_value in scores_avg.items():
    print(f'Average {column}: {mean_value}')


Average reading score: 69.169
Average writing score: 68.054
Average math score: 66.089


In [8]:
def above_below(num, avg):
    """Return 1 when ``num`` is strictly greater than ``avg``, otherwise 0."""
    return 1 if num > avg else 0

# Work on an independent copy: pd.DataFrame(df) does not deep-copy its input,
# so the binarised frame could otherwise share data with the raw df.
classed_avg = df.copy()
for score_name, avg in scores_avg.items():
    # 1 = strictly above the column mean, 0 = at or below it.
    classed_avg[score_name] = df[score_name].apply(above_below, args=(avg,))
classed_avg


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,1,1,1
1,female,group C,some college,standard,completed,1,1,1
2,female,group B,master's degree,standard,none,1,1,1
3,male,group A,associate's degree,free/reduced,none,0,0,0
4,male,group C,some college,standard,none,1,1,1
5,female,group B,associate's degree,standard,none,1,1,1
6,female,group B,some college,standard,completed,1,1,1
7,male,group B,some college,free/reduced,none,0,0,0
8,male,group D,high school,free/reduced,completed,0,0,0
9,female,group B,high school,free/reduced,none,0,0,0


In [9]:
# Categorical columns that are label-encoded below.
nominal_attributes = ['gender',
                      'race/ethnicity',
                      'lunch',
                      'test preparation course']

# nominal_numer_map: column -> {category label: integer code}
# numer_nominal_map: column -> {integer code: category label} (the inverse)
nominal_numer_map = {}
numer_nominal_map = {}
for attribute in nominal_attributes:
    # One encoder per column; only its fitted classes_ are needed here.
    encoder = LabelEncoder()
    encoder.fit(classed_avg[attribute])

    # Position of a label in classes_ is used as its integer code.
    nominal_numer_map[attribute] = {
        label: code for code, label in enumerate(encoder.classes_)}
    # Inverse lookup (code -> label) so encoded values can be decoded later.
    numer_nominal_map[attribute] = {
        code: label for code, label in enumerate(encoder.classes_)}

for att, mapping in nominal_numer_map.items():
    print(f'{att}:\n {mapping}\n')


gender:
 {'female': 0, 'male': 1}

race/ethnicity:
 {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4}

lunch:
 {'free/reduced': 0, 'standard': 1}

test preparation course:
 {'completed': 0, 'none': 1}



In [10]:

def map_nominals(cat, mapping):
    """Look up ``cat`` in ``mapping`` and return the stored value.

    With a plain dict, an unknown ``cat`` raises KeyError.
    """
    encoded = mapping[cat]
    return encoded
    
# Independent copy so classed_avg keeps its original string labels;
# pd.DataFrame(classed_avg) would not deep-copy the data.
nominal_mapped = classed_avg.copy()
for att, mapping in nominal_numer_map.items():
    # Replace each category label with its integer code.
    nominal_mapped[att] = classed_avg[att].apply(map_nominals, args=(mapping,))
nominal_mapped
    

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,bachelor's degree,1,1,1,1,1
1,0,2,some college,1,0,1,1,1
2,0,1,master's degree,1,1,1,1,1
3,1,0,associate's degree,0,1,0,0,0
4,1,2,some college,1,1,1,1,1
5,0,1,associate's degree,1,1,1,1,1
6,0,1,some college,1,0,1,1,1
7,1,1,some college,0,1,0,0,0
8,1,3,high school,0,0,0,0,0
9,0,1,high school,0,1,0,0,0


In [14]:
# Integer codes for parental education, listed from code 0 upwards.
education_map = {'some high school': 0,
                 "high school": 1,
                 'some college': 2,
                 "associate's degree": 3,
                 "bachelor's degree": 4,
                 "master's degree": 5,}


def map_ordinals(cat, mapping):
    """Return the ordinal code stored for ``cat`` in ``mapping``."""
    return mapping[cat]


# NOTE(review): pd.DataFrame(frame) is not a deep copy, so whether
# nominal_mapped also changes here depends on the pandas version; treat
# ordinal_mapped as the result of this step.
ordinal_mapped = pd.DataFrame(nominal_mapped)
# A single pass encodes the column; no loop over education_map is needed
# because the whole mapping is handed to map_ordinals at once.
ordinal_mapped['parental level of education'] = (
    nominal_mapped['parental level of education']
    .apply(map_ordinals, args=(education_map,)))
ordinal_mapped



Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,4,1,1,1,1,1
1,0,2,2,1,0,1,1,1
2,0,1,5,1,1,1,1,1
3,1,0,3,0,1,0,0,0
4,1,2,2,1,1,1,1,1
5,0,1,3,1,1,1,1,1
6,0,1,2,1,0,1,1,1
7,1,1,2,0,1,0,0,0
8,1,3,1,0,0,0,0,0
9,0,1,1,0,1,0,0,0


In [21]:

# Export the fully encoded frame. ordinal_mapped is the frame that received the
# education encoding; nominal_mapped only carries it if the two share data.
ordinal_mapped.to_csv('dataset/processed.csv', index=False)

In [22]:
# Reload the exported CSV and display it.
processed_csv_path = 'dataset/processed.csv'
df = pd.read_csv(processed_csv_path)
df


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,4,1,1,1,1,1
1,0,2,2,1,0,1,1,1
2,0,1,5,1,1,1,1,1
3,1,0,3,0,1,0,0,0
4,1,2,2,1,1,1,1,1
5,0,1,3,1,1,1,1,1
6,0,1,2,1,0,1,1,1
7,1,1,2,0,1,0,0,0
8,1,3,1,0,0,0,0,0
9,0,1,1,0,1,0,0,0
