In [93]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [94]:

#read csv file
df = pd.read_csv('exams.csv')
 
#replacing values in gender category
df['gender'].replace(['male', 'female'],
                        [0, 1], inplace=True)

#replacing values in race/ethnicity category
df['race/ethnicity'].replace(['group A', 'group B', 'group C', 'group D', 'group E'],
                        [0, 1, 2, 3, 4], inplace=True)

#replacing values in parental level of education category
df['parental level of education'].replace(['some high school', 'high school', 'some college', "associate's degree", 
                                  "bachelor's degree", "master's degree"],
                        [0, 1, 2, 3, 4, 5], inplace=True)

#replacing values in lunch category
df['lunch'].replace(['standard', 'free/reduced'],
                        [0, 1], inplace=True)

#replacing values in test preparation course category
df['test preparation course'].replace(['completed', 'none'],
                        [0, 1], inplace=True)
Y = df[['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course'] ]
X = df[['reading score', 'writing score', 'math score']]
m=np.size(Y, axis=0)

In [95]:
mu = X.mean()
std = X.std()

In [96]:
X=(X-mu)/std

In [97]:
#X.insert(0, "Intercepts", np.ones(m))
print(X)

     reading score  writing score  math score
0        -0.135846      -0.303699    0.039213
1        -0.678687      -0.816487   -1.713706
2        -0.610832      -1.136979   -0.480170
3         0.610561       0.016794    0.688443
4         0.271285       0.016794    0.753366
..             ...            ...         ...
995       0.067719      -0.175502    0.428751
996       1.492678       1.555158    1.207827
997      -2.307211      -1.713866   -2.233090
998       0.339140       0.914173    0.428751
999      -0.610832      -0.367797   -0.090632

[1000 rows x 3 columns]


In [98]:
theta_best = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y)
theta_best

array([[ 0.07215698, -0.15814861, -0.60453375,  0.19224924,  0.23738655],
       [ 0.52150988,  0.01035824,  0.86296016, -0.18490721, -0.45958361],
       [-0.57985704,  0.36261952,  0.14325263, -0.18694805,  0.10424608]])

In [99]:
grades = np.dot(theta_best,[1,1,1,0,0])# x values needed
print(grades)

[-0.69052538  1.39482828 -0.07398488]
