In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-processed Boston housing CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = r"/Users/lejohn/Documents/APU/Deep_Learning/Data/boston_pre.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# preview the first 5 rows (DataFrame.head defaults to n=5)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PRICE2
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,


In [4]:
# basic information on the data: column names, non-null counts, dtypes
# and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     503 non-null    float64
 1   ZN       505 non-null    float64
 2   INDUS    492 non-null    float64
 3   CHAS     504 non-null    float64
 4   NOX      501 non-null    float64
 5   RM       500 non-null    float64
 6   AGE      498 non-null    float64
 7   DIS      502 non-null    float64
 8   RAD      497 non-null    float64
 9   TAX      500 non-null    float64
 10  PTRATIO  502 non-null    float64
 11  B        501 non-null    float64
 12  LSTAT    504 non-null    float64
 13  PRICE    505 non-null    float64
 14  PRICE2   0 non-null      float64
dtypes: float64(15)
memory usage: 59.4 KB


In [5]:
# size of the data as a (rows, columns) tuple
data.shape

(506, 15)

In [6]:
# statistical summary of the data (count, mean, std, min, quartiles, max
# for each numeric column)
data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PRICE2
count,503.0,505.0,492.0,504.0,501.0,500.0,498.0,502.0,497.0,500.0,502.0,501.0,504.0,505.0,0.0
mean,3.612897,11.361386,11.034858,0.069444,0.555109,6.280806,68.281928,3.793816,9.61167,409.164,18.468327,356.340938,12.623254,22.544752,
std,8.618816,23.345524,6.869158,0.254461,0.116275,0.699794,28.156459,2.109926,8.771293,169.12958,2.165685,91.684008,7.112884,9.202293,
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0,
25%,0.08193,0.0,5.13,0.0,0.449,5.8865,44.55,2.09115,4.0,279.0,17.4,375.33,6.9275,17.1,
50%,0.25915,0.0,9.125,0.0,0.538,6.2085,76.8,3.20745,5.0,330.0,19.1,391.43,11.36,21.2,
75%,3.675945,12.5,18.1,0.0,0.624,6.61825,93.9,5.188425,24.0,666.0,20.2,396.21,16.945,25.0,
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0,


##### Here we can see that some columns have a maximum far above their 75th percentile (e.g. CRIM: 88.98 vs. 3.68), which indicates outliers. Outliers pull the mean towards them and would therefore distort mean-imputed values. Hence, imputing with the median is the better choice, as it is less sensitive to extreme values.

In [7]:
# Drop unnecessary columns and rows: remove PRICE2 (it has no non-null
# values) and then any row in which every remaining column is missing.
# Reassigning the result instead of mutating with inplace=True, together
# with errors="ignore", keeps this cell idempotent: re-running it no longer
# raises a KeyError because PRICE2 has already been removed.
data = data.drop(columns=["PRICE2"], errors="ignore").dropna(how="all")

In [8]:
# count the missing values that remain in each column
data.isna().sum()

CRIM        2
ZN          0
INDUS      13
CHAS        1
NOX         4
RM          5
AGE         7
DIS         3
RAD         8
TAX         5
PTRATIO     3
B           4
LSTAT       1
PRICE       0
dtype: int64

In [9]:
# Fill the remaining missing values with each column's median.
# NOTE(review): the imputer is fitted on every row, before the train/test
# split made in a later cell, so test rows influence the medians. Consider
# fitting on the training rows only.
imputer = SimpleImputer(strategy="median")
imputer.fit(data)
imputed_data = imputer.transform(data)

# The imputer output is rebuilt into a DataFrame with the original column
# labels.
data_cleaned = pd.DataFrame(data=imputed_data, columns=data.columns)

In [10]:
# separate the target (PRICE) from the feature columns (everything else)
y = data_cleaned["PRICE"]
X = data_cleaned.drop(columns=["PRICE"])

In [11]:
# Standardise the feature variables with StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split made in a later cell, so test-set statistics leak into the scaling.
# Consider splitting first and fitting the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [12]:
# hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=1,
)

In [13]:
# report the shape of every piece produced by the train/test split
split_data_name = ["X_train", "X_test", "y_train", "y_test"]
split_data = [X_train, X_test, y_train, y_test]

# walk the labels and their arrays in lockstep rather than by index
for name, part in zip(split_data_name, split_data):
    print(f"Shape of {name}: {part.shape}")

Shape of X_train: (404, 13)
Shape of X_test: (101, 13)
Shape of y_train: (404,)
Shape of y_test: (101,)
