# HOW IT WORKS: Values will have a mean of 0 and standard deviation of 1
# WHEN TO USE: When features are on different scales; important for algorithms like linear models, KNN, k-means, and anything trained with gradient descent
# WHEN NOT TO USE: tree models (not scale sensitive), when interpretability is important, for sparse data (could be computationally expensive)
# SENSITIVE TO: Distribution of data

# - Ensures that all features contribute equally

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Auto MPG data from a CSV file in the current working directory.
df = pd.read_csv('autompg.csv')

In [3]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Inspect the last rows of the frame as well.
df.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,chevy s-10


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(398, 9)

In [9]:
# Check the dtype of every column. Note that horsepower comes back as object
# rather than a numeric dtype, so it would likely need cleaning/conversion
# before it could be used as a numeric feature.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [10]:
# Features: displacement and weight, which sit on very different numeric
# scales (hundreds vs. thousands), making them a good scaling example.
# Target: acceleration.
# Columns are selected by label instead of by position so the selection is
# self-describing and stays correct if the CSV's column order ever changes.
X = df[['displacement', 'weight']]
y = df['acceleration']

In [11]:
# Confirm that X holds only the two selected feature columns.
X.head()

Unnamed: 0,displacement,weight
0,307.0,3504
1,350.0,3693
2,318.0,3436
3,304.0,3433
4,302.0,3449


In [13]:
# Split features and target into train and test partitions using
# train_test_split's default proportions; the fixed seed keeps the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
)

In [14]:
# The split shuffles the rows: X_train keeps the original index labels and
# still holds the raw, unscaled values at this point.
X_train.head()

Unnamed: 0,displacement,weight
378,105.0,2125
12,400.0,3761
302,105.0,2150
377,91.0,1970
211,168.0,3820


In [15]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits. The test split
# is never used for fitting, so none of its statistics leak into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Sanity check on the scaled training data. NOTE: called without an axis,
# this is the standard deviation over ALL elements of the array (a single
# scalar), not one value per feature; X_train.std(axis=0) would report the
# standard deviation of each feature column separately.
X_train.std()

1.0

In [16]:
scaler.mean_ # per-feature mean learned from the training split (displacement, weight)

array([ 189.83724832, 2951.84228188])

In [23]:
scaler.scale_ # per-feature standard deviation used for scaling (not the variance; that is scaler.var_)

array([102.26778174, 842.96466601])