# Normalize Data

Based on the DigitalOcean community tutorial: <https://www.digitalocean.com/community/tutorials/normalize-data-in-python>


In [1]:
from sklearn import preprocessing
import numpy as np

# Small 1-D sample vector to rescale.
x_array = np.array([2, 3, 5, 6, 7, 4, 8, 7, 6])

# normalize() works on 2-D input, so present the vector as a single row;
# with the default settings that row is scaled to unit L2 norm.
normalized_arr = preprocessing.normalize(x_array.reshape(1, -1))
print(normalized_arr)


[[0.11785113 0.1767767  0.29462783 0.35355339 0.41247896 0.23570226
  0.47140452 0.41247896 0.35355339]]


# Normalizing Columns from a DataFrame Using the normalize() Function

In [8]:
import numpy as np
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing

# Fetch the dataset; as_frame=True makes the feature table in `.data`
# a pandas DataFrame.
california_housing = fetch_california_housing(as_frame=True)

# Show the feature table first, then the dataset's own description text,
# each starting on its own line.
print(california_housing.data, california_housing.DESCR, sep="\n")


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [9]:
# Pull the HouseAge column out as a standalone NumPy array
# (copy=True keeps it independent of the DataFrame's own storage).
x_array = california_housing.data["HouseAge"].to_numpy(copy=True)
print("HouseAge array: ", x_array)


HouseAge array:  [41. 21. 52. ... 17. 18. 16.]


In [10]:
# Present the column as one row of shape (1, n) so normalize() scales
# the whole HouseAge vector at once.
normalized_arr = preprocessing.normalize(x_array.reshape(1, -1))
print("Normalized HouseAge array: ", normalized_arr)


Normalized HouseAge array:  [[0.00912272 0.00467261 0.01157028 ... 0.00378259 0.0040051  0.00356009]]


# Normalizing Datasets by Row or by Column Using the normalize() Function


In [11]:
from sklearn import preprocessing
import pandas as pd

from sklearn.datasets import fetch_california_housing
# Reload the dataset so this cell can run on its own.
california_housing = fetch_california_housing(as_frame=True)

# axis=1 and norm="l2" are normalize()'s defaults, spelled out here:
# every row (sample) is rescaled to unit L2 norm.
d = preprocessing.normalize(california_housing.data, norm="l2", axis=1)

# Wrap the result in a DataFrame again, re-attaching the feature names.
scaled_df = pd.DataFrame(data=d, columns=california_housing.data.columns)
print(scaled_df)


         MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  \
0      0.023848  0.117447  0.020007   0.002933    0.922391  0.007321   
1      0.003452  0.008734  0.002594   0.000404    0.998535  0.000877   
2      0.014092  0.100971  0.016093   0.002084    0.963106  0.005441   
3      0.009816  0.090449  0.010119   0.001866    0.970590  0.004432   
4      0.006612  0.089394  0.010799   0.001859    0.971303  0.003750   
...         ...       ...       ...        ...         ...       ...   
20635  0.001825  0.029242  0.005902   0.001326    0.988384  0.002995   
20636  0.006753  0.047539  0.016147   0.003475    0.940212  0.008247   
20637  0.001675  0.016746  0.005128   0.001103    0.991926  0.002291   
20638  0.002483  0.023932  0.007086   0.001558    0.985188  0.002823   
20639  0.001715  0.011486  0.003772   0.000834    0.995727  0.001879   

       Latitude  Longitude  
0      0.108510  -0.350136  
1      0.015745  -0.050829  
2      0.073495  -0.237359  
3      0.065837  -0

In [12]:
from sklearn import preprocessing
import pandas as pd

from sklearn.datasets import fetch_california_housing
# Reload the dataset so this cell can run on its own.
california_housing = fetch_california_housing(as_frame=True)

# axis=0 switches normalize() from rows to columns: every feature is
# rescaled to unit L2 norm (norm="l2" is the default, spelled out here).
d = preprocessing.normalize(california_housing.data, norm="l2", axis=0)

# Wrap the result in a DataFrame again, re-attaching the feature names.
scaled_df = pd.DataFrame(data=d, columns=california_housing.data.columns)
print(scaled_df)


         MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  \
0      0.013440  0.009123  0.008148   0.005965    0.001231  0.001642   
1      0.013401  0.004673  0.007278   0.005662    0.009180  0.001356   
2      0.011716  0.011570  0.009670   0.006254    0.001896  0.001801   
3      0.009110  0.011570  0.006787   0.006252    0.002133  0.001638   
4      0.006209  0.011570  0.007329   0.006299    0.002160  0.001402   
...         ...       ...       ...        ...         ...       ...   
20635  0.002519  0.005563  0.005886   0.006603    0.003231  0.001646   
20636  0.004128  0.004005  0.007133   0.007666    0.001361  0.002007   
20637  0.002744  0.003783  0.006073   0.006526    0.003850  0.001495   
20638  0.003014  0.004005  0.006218   0.006828    0.002833  0.001365   
20639  0.003856  0.003560  0.006131   0.006772    0.005303  0.001682   

       Latitude  Longitude  
0      0.007386  -0.007114  
1      0.007383  -0.007114  
2      0.007381  -0.007115  
3      0.007381  -0

# Using the scikit-learn preprocessing.MinMaxScaler() Function to Normalize Data

You can use the scikit-learn preprocessing.MinMaxScaler() function to normalize each feature by scaling the data to a range.

The MinMaxScaler() function scales each feature individually so that the values have a given minimum and maximum value, with a default of 0 and 1.
The formula to scale feature values to between 0 and 1 is:

$$x_{\text{scaled}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$

Subtract the minimum value from each entry and then divide the result by the range, where the range is the difference between the maximum value and the minimum value.

The following example demonstrates how to use the MinMaxScaler() function to normalize the California Housing dataset:


In [13]:
from sklearn import preprocessing
import pandas as pd

from sklearn.datasets import fetch_california_housing
# Reload the dataset so this cell can run on its own.
california_housing = fetch_california_housing(as_frame=True)

# (0, 1) is MinMaxScaler's default target range, written out explicitly.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min and max, then rescale the same data with them.
scaler.fit(california_housing.data)
d = scaler.transform(california_housing.data)

# Wrap the result in a DataFrame again, re-attaching the feature names.
scaled_df = pd.DataFrame(data=d, columns=california_housing.data.columns)
print(scaled_df)


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
         MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  \
0      0.539668  0.784314  0.043512   0.020469    0.008941  0.001499   
1      0.538027  0.392157  0.038224   0.018929    0.067210  0.001141   
2      0.466028  1.000000  0.052756   0.021940    0.013818  0.001698   
3      0.354699  1.000000  0.035241   0.021929    0.015555  0.001493   
4      0.230776  1.000000  0.038534   0.022166    0.015752  0.001198   
...         ...       ...       ...        ...         ...       ...   
20635  0.073130  0.470588  0.029769   0.023715    0.023599  0.001503   
20636  0.141853  0.333333  0.037344   0.029124    0.009894  0.001956   
20637  0.082764  0.313725  0.030904   0.023323    0.028140  0.001314   
20638  0.094295  0.333333  0.031783   0.024859    0.020684  0.001152   
20639  0.130253  0.294118  0.031252   0.024573    0.03

In [14]:
from sklearn import preprocessing
import pandas as pd

from sklearn.datasets import fetch_california_housing
# Reload the dataset so this cell can run on its own.
california_housing = fetch_california_housing(as_frame=True)

# Same scaler as before, but targeting the range [0, 2] instead of the default.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 2))

# Learn each feature's min and max, then rescale the same data with them.
scaler.fit(california_housing.data)
d = scaler.transform(california_housing.data)

# Wrap the result in a DataFrame again, re-attaching the feature names.
scaled_df = pd.DataFrame(data=d, columns=california_housing.data.columns)
print(scaled_df)


         MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  \
0      1.079337  1.568627  0.087025   0.040937    0.017882  0.002999   
1      1.076054  0.784314  0.076448   0.037859    0.134421  0.002281   
2      0.932056  2.000000  0.105513   0.043880    0.027635  0.003396   
3      0.709397  2.000000  0.070482   0.043857    0.031111  0.002987   
4      0.461552  2.000000  0.077068   0.044333    0.031503  0.002397   
...         ...       ...       ...        ...         ...       ...   
20635  0.146260  0.941176  0.059538   0.047431    0.047199  0.003007   
20636  0.283706  0.666667  0.074688   0.058248    0.019788  0.003912   
20637  0.165529  0.627451  0.061808   0.046646    0.056280  0.002629   
20638  0.188591  0.666667  0.063565   0.049719    0.041369  0.002303   
20639  0.260507  0.588235  0.062505   0.049146    0.077581  0.003098   

       Latitude  Longitude  
0      1.134963   0.422311  
1      1.130712   0.424303  
2      1.128587   0.420319  
3      1.128587   0