# Repeat data preprocessing steps : Using NumPy arrays for Vers 1



## Importing the libraries

In [109]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

To make things simple, we usually structure the dataset so that the target variable column is the last column in the table

*  **X typically denotes the feature variables**, which are all the columns in the table except the last one
* **y typically denotes the single target variable**, which is the last column in the table

Here we additionally use the values property on X and y (which are originally Dataframes) to convert the data to Numpy arrays. 

Working with Numpy arrays instead of Pandas Dataframes is done for a variety of reasons:

* Performance considerations

* Avoiding Pandas Overhead

* Integration with libraries that require NumPy


In [110]:
dataset = pd.read_csv('sample-data-proprocessing-v1.csv')
X = dataset.iloc[  :  , :-1].values
y = dataset.iloc[  :  , -1].values

In [111]:
print("The feature variable columns in X are")
print (X)
print (type(X))

The feature variable columns in X are
[['Spain' 21.0 11000.0 120 3]
 ['France' 45.0 32000.0 330 7]
 ['Spain' 43.0 60000.0 510 15]
 ['France' 40.0 80000.0 910 8]
 ['Germany' 74.0 59000.0 520 5]
 ['Germany' nan 92000.0 800 500]
 ['France' 51.0 43000.0 420 6]
 ['France' 74.0 nan 720 8]
 ['France' 73.0 25000.0 930 15]
 ['Spain' 65.0 85000.0 410 13]
 ['Spain' 44.0 94000.0 620 12]
 ['Germany' 25.0 22000.0 -200 9]
 ['Germany' 75.0 52000.0 740 4]
 ['Spain' 34.0 15000.0 870 19]
 ['Germany' nan 54000.0 370 6]
 ['France' 48.0 31000.0 610 7]
 ['France' 58.0 80000.0 280 11]
 ['Germany' 32.0 56000.0 200000 8]
 ['Spain' 34.0 51000.0 330 900]
 ['France' 55.0 59000.0 630 5]
 ['Spain' 50.0 54000.0 340 10]
 ['Germany' 62.0 nan 680 7]
 ['France' 44.0 45000.0 900 5]
 ['France' 39.0 18000.0 480 14]
 ['Spain' 38.0 33000.0 600 20]
 ['France' 51.0 95000.0 250 14]
 ['Spain' 46.0 80000.0 250 13]
 ['France' 74.0 82000.0 320 14]
 ['Germany' 72.0 37000.0 450 11]
 ['Spain' 80.0 98000.0 1000 6]]
<class 'numpy.ndarray

In [112]:
print ("The target variable column values are : ")
print(y)
print (type(y))

The target variable column values are : 
['Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No'
 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No'
 'No' 'Yes' 'Yes' 'No']
<class 'numpy.ndarray'>


## Checking for missing data and perform imputation if necessary

We can impute the missing cells with either the **mean, median or mode of all the other values in that column**

In this case, since we are working with NumPy arrays rather than Pandas dataframes, we will use specific methods from sklearn that are designed for these array


In [113]:
from sklearn.impute import SimpleImputer

# There is no direct code or method to explicitly handle NaN detection in a NumPy array
# equivalent to isNull method of dataframe, so we just determine through visual inspection which 
# arrays to perform imputation on

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1: ]) # From the 2nd column until the end 
X[:, 1: ] = imputer.transform(X[:, 1: ])
print("Feature variables X after imputation:\n")
print (X)

Feature variables X after imputation:

[['Spain' 21.0 11000.0 120.0 3.0]
 ['France' 45.0 32000.0 330.0 7.0]
 ['Spain' 43.0 60000.0 510.0 15.0]
 ['France' 40.0 80000.0 910.0 8.0]
 ['Germany' 74.0 59000.0 520.0 5.0]
 ['Germany' 51.67857142857143 92000.0 800.0 500.0]
 ['France' 51.0 43000.0 420.0 6.0]
 ['France' 74.0 55107.142857142855 720.0 8.0]
 ['France' 73.0 25000.0 930.0 15.0]
 ['Spain' 65.0 85000.0 410.0 13.0]
 ['Spain' 44.0 94000.0 620.0 12.0]
 ['Germany' 25.0 22000.0 -200.0 9.0]
 ['Germany' 75.0 52000.0 740.0 4.0]
 ['Spain' 34.0 15000.0 870.0 19.0]
 ['Germany' 51.67857142857143 54000.0 370.0 6.0]
 ['France' 48.0 31000.0 610.0 7.0]
 ['France' 58.0 80000.0 280.0 11.0]
 ['Germany' 32.0 56000.0 200000.0 8.0]
 ['Spain' 34.0 51000.0 330.0 900.0]
 ['France' 55.0 59000.0 630.0 5.0]
 ['Spain' 50.0 54000.0 340.0 10.0]
 ['Germany' 62.0 55107.142857142855 680.0 7.0]
 ['France' 44.0 45000.0 900.0 5.0]
 ['France' 39.0 18000.0 480.0 14.0]
 ['Spain' 38.0 33000.0 600.0 20.0]
 ['France' 51.0 95000.

In [114]:
# At this point we will need to convert the 2D numpy array to a dataframe 
# to perform the function to detect outliers and impute them
# as the functionality is too complex to implement completely using Numpy methods
X_df = pd.DataFrame(X, columns=['Country', 'Age', 'Salary', 'Cost', 'Days'])
print (X_df)

    Country        Age        Salary      Cost   Days
0     Spain       21.0       11000.0     120.0    3.0
1    France       45.0       32000.0     330.0    7.0
2     Spain       43.0       60000.0     510.0   15.0
3    France       40.0       80000.0     910.0    8.0
4   Germany       74.0       59000.0     520.0    5.0
5   Germany  51.678571       92000.0     800.0  500.0
6    France       51.0       43000.0     420.0    6.0
7    France       74.0  55107.142857     720.0    8.0
8    France       73.0       25000.0     930.0   15.0
9     Spain       65.0       85000.0     410.0   13.0
10    Spain       44.0       94000.0     620.0   12.0
11  Germany       25.0       22000.0    -200.0    9.0
12  Germany       75.0       52000.0     740.0    4.0
13    Spain       34.0       15000.0     870.0   19.0
14  Germany  51.678571       54000.0     370.0    6.0
15   France       48.0       31000.0     610.0    7.0
16   France       58.0       80000.0     280.0   11.0
17  Germany       32.0      

## Function to detect outliers and impute them with mean of non-outliers

We will reuse the same **custom algorithms** that we used previously

In [115]:
def impute_outliers(df, column):
    # Calculate the first and third quartile
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Check if there are any outliers
    outliers = ((df[column] < lower_bound) | (df[column] > upper_bound))
    outlier_count = outliers.sum()
    
    if outlier_count > 0:

        print (f"Number of outliers detecting for column '{column}' is {outlier_count}")
        
        # Calculate mean of non-outlier values
        non_outlier_mean = int(df[~outliers][column].mean())
        
        # Replace outliers with the mean of non-outlier values
        df.loc[outliers, column] = non_outlier_mean
        print(f"Outliers detected and imputed in column '{column}' with mean value {non_outlier_mean}")
    else:
        print(f"No outliers detected in column '{column}'")



In [116]:
# Check and impute outliers for columns 'Cost' and 'Days'
impute_outliers(X_df, 'Cost')
impute_outliers(X_df, 'Days')

Number of outliers detecting for column 'Cost' is 1
Outliers detected and imputed in column 'Cost' with mean value 523
Number of outliers detecting for column 'Days' is 2
Outliers detected and imputed in column 'Days' with mean value 9


In [117]:
print ("Feature variables after detection and imputation of outliers")
print (X_df)

Feature variables after detection and imputation of outliers
    Country        Age        Salary    Cost  Days
0     Spain       21.0       11000.0   120.0   3.0
1    France       45.0       32000.0   330.0   7.0
2     Spain       43.0       60000.0   510.0  15.0
3    France       40.0       80000.0   910.0   8.0
4   Germany       74.0       59000.0   520.0   5.0
5   Germany  51.678571       92000.0   800.0     9
6    France       51.0       43000.0   420.0   6.0
7    France       74.0  55107.142857   720.0   8.0
8    France       73.0       25000.0   930.0  15.0
9     Spain       65.0       85000.0   410.0  13.0
10    Spain       44.0       94000.0   620.0  12.0
11  Germany       25.0       22000.0  -200.0   9.0
12  Germany       75.0       52000.0   740.0   4.0
13    Spain       34.0       15000.0   870.0  19.0
14  Germany  51.678571       54000.0   370.0   6.0
15   France       48.0       31000.0   610.0   7.0
16   France       58.0       80000.0   280.0  11.0
17  Germany       32.

In [118]:
# Now we can convert the Dataframe back to a 2d Numpy array for further processing
X = X_df.to_numpy()
print ("Feature variables in a 2d Numpy array")
print (X)


Feature variables in a 2d Numpy array
[['Spain' 21.0 11000.0 120.0 3.0]
 ['France' 45.0 32000.0 330.0 7.0]
 ['Spain' 43.0 60000.0 510.0 15.0]
 ['France' 40.0 80000.0 910.0 8.0]
 ['Germany' 74.0 59000.0 520.0 5.0]
 ['Germany' 51.67857142857143 92000.0 800.0 9]
 ['France' 51.0 43000.0 420.0 6.0]
 ['France' 74.0 55107.142857142855 720.0 8.0]
 ['France' 73.0 25000.0 930.0 15.0]
 ['Spain' 65.0 85000.0 410.0 13.0]
 ['Spain' 44.0 94000.0 620.0 12.0]
 ['Germany' 25.0 22000.0 -200.0 9.0]
 ['Germany' 75.0 52000.0 740.0 4.0]
 ['Spain' 34.0 15000.0 870.0 19.0]
 ['Germany' 51.67857142857143 54000.0 370.0 6.0]
 ['France' 48.0 31000.0 610.0 7.0]
 ['France' 58.0 80000.0 280.0 11.0]
 ['Germany' 32.0 56000.0 523 8.0]
 ['Spain' 34.0 51000.0 330.0 9]
 ['France' 55.0 59000.0 630.0 5.0]
 ['Spain' 50.0 54000.0 340.0 10.0]
 ['Germany' 62.0 55107.142857142855 680.0 7.0]
 ['France' 44.0 45000.0 900.0 5.0]
 ['France' 39.0 18000.0 480.0 14.0]
 ['Spain' 38.0 33000.0 600.0 20.0]
 ['France' 51.0 95000.0 250.0 14.0]


## Perform one hot encoding on categorical variables in dataset

Here, we will again use methods from the sklearn module specific for this purpose 
as we will not be using the Pandas get_dummies method

In [119]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [120]:
print ("Feature variables after one hot encoding on the Country column")
print(X)

Feature variables after one hot encoding on the Country column
[[0.0 0.0 1.0 21.0 11000.0 120.0 3.0]
 [1.0 0.0 0.0 45.0 32000.0 330.0 7.0]
 [0.0 0.0 1.0 43.0 60000.0 510.0 15.0]
 [1.0 0.0 0.0 40.0 80000.0 910.0 8.0]
 [0.0 1.0 0.0 74.0 59000.0 520.0 5.0]
 [0.0 1.0 0.0 51.67857142857143 92000.0 800.0 9]
 [1.0 0.0 0.0 51.0 43000.0 420.0 6.0]
 [1.0 0.0 0.0 74.0 55107.142857142855 720.0 8.0]
 [1.0 0.0 0.0 73.0 25000.0 930.0 15.0]
 [0.0 0.0 1.0 65.0 85000.0 410.0 13.0]
 [0.0 0.0 1.0 44.0 94000.0 620.0 12.0]
 [0.0 1.0 0.0 25.0 22000.0 -200.0 9.0]
 [0.0 1.0 0.0 75.0 52000.0 740.0 4.0]
 [0.0 0.0 1.0 34.0 15000.0 870.0 19.0]
 [0.0 1.0 0.0 51.67857142857143 54000.0 370.0 6.0]
 [1.0 0.0 0.0 48.0 31000.0 610.0 7.0]
 [1.0 0.0 0.0 58.0 80000.0 280.0 11.0]
 [0.0 1.0 0.0 32.0 56000.0 523 8.0]
 [0.0 0.0 1.0 34.0 51000.0 330.0 9]
 [1.0 0.0 0.0 55.0 59000.0 630.0 5.0]
 [0.0 0.0 1.0 50.0 54000.0 340.0 10.0]
 [0.0 1.0 0.0 62.0 55107.142857142855 680.0 7.0]
 [1.0 0.0 0.0 44.0 45000.0 900.0 5.0]
 [1.0 0.0 0.0

## Perform label encoding on the target variable

For this case, we will use the LabelEncoder from sklearn which is also what we did for the case when y was a Pandas series object

In [121]:
print ("Target variable column original values")
print (y)

Target variable column original values
['Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No'
 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No'
 'No' 'Yes' 'Yes' 'No']


In [122]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print ("After encoding the Yes and No as 1 and 0")
print (y)

After encoding the Yes and No as 1 and 0
[1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0]


## Splitting original dataset into the Training set and Test set

The code here is exactly identical as for the case when X and y are Dataframes and Series objects

The train_test_split method is designed to accept Numpy arrays as well as DataFrames and Series objects

In [123]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3333, random_state = 1)

In [124]:
print (f"There are {len(X_train)} rows in the training dataset\n")
print ("The feature variable values are")
print (X_train)

print ("\nThe target variable values are")
print (y_train)

There are 20 rows in the training dataset

The feature variable values are
[[1.0 0.0 0.0 39.0 18000.0 480.0 14.0]
 [0.0 1.0 0.0 74.0 59000.0 520.0 5.0]
 [0.0 0.0 1.0 43.0 60000.0 510.0 15.0]
 [1.0 0.0 0.0 51.0 95000.0 250.0 14.0]
 [1.0 0.0 0.0 51.0 43000.0 420.0 6.0]
 [0.0 0.0 1.0 34.0 51000.0 330.0 9]
 [0.0 0.0 1.0 34.0 15000.0 870.0 19.0]
 [1.0 0.0 0.0 74.0 55107.142857142855 720.0 8.0]
 [1.0 0.0 0.0 74.0 82000.0 320.0 14.0]
 [1.0 0.0 0.0 45.0 32000.0 330.0 7.0]
 [1.0 0.0 0.0 58.0 80000.0 280.0 11.0]
 [0.0 0.0 1.0 21.0 11000.0 120.0 3.0]
 [1.0 0.0 0.0 48.0 31000.0 610.0 7.0]
 [0.0 0.0 1.0 80.0 98000.0 1000.0 6.0]
 [0.0 1.0 0.0 72.0 37000.0 450.0 11.0]
 [0.0 0.0 1.0 65.0 85000.0 410.0 13.0]
 [1.0 0.0 0.0 73.0 25000.0 930.0 15.0]
 [0.0 1.0 0.0 75.0 52000.0 740.0 4.0]
 [0.0 1.0 0.0 25.0 22000.0 -200.0 9.0]
 [0.0 1.0 0.0 51.67857142857143 92000.0 800.0 9]]

The target variable values are
[0 1 1 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 1]


In [125]:
print (f"There are {len(X_test)} rows in the test dataset\n")
print ("The feature variable values are")
print (X_test)

print ("\nThe target variable values are")
print (y_test)

There are 10 rows in the test dataset

The feature variable values are
[[0.0 1.0 0.0 32.0 56000.0 523 8.0]
 [0.0 1.0 0.0 62.0 55107.142857142855 680.0 7.0]
 [0.0 0.0 1.0 44.0 94000.0 620.0 12.0]
 [1.0 0.0 0.0 55.0 59000.0 630.0 5.0]
 [0.0 1.0 0.0 51.67857142857143 54000.0 370.0 6.0]
 [0.0 0.0 1.0 50.0 54000.0 340.0 10.0]
 [0.0 0.0 1.0 46.0 80000.0 250.0 13.0]
 [1.0 0.0 0.0 40.0 80000.0 910.0 8.0]
 [0.0 0.0 1.0 38.0 33000.0 600.0 20.0]
 [1.0 0.0 0.0 44.0 45000.0 900.0 5.0]]

The target variable values are
[1 1 1 0 0 0 0 0 1 1]


## Feature Scaling

Here we also use the same objects StandardScaler from sklearn, and the code is nearly identical



In [126]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Fit and transform the training data
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# Transform the test data
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [127]:
print ("Feature variables in training dataset after standardization")
print (X_train)

Feature variables in training dataset after standardization
[[1.0 0.0 0.0 -0.8632009950207814 -1.2291519650612563
  -0.05074934819224496 0.9586637990406123]
 [0.0 1.0 0.0 1.100668941416516 0.24631878925496908 0.08924885371739631
  -1.1717001988274145]
 [0.0 0.0 1.0 -0.6387587165708045 0.2823058808236575 0.054249303239985995
  1.1953709099148375]
 [1.0 0.0 0.0 -0.18987415967085083 1.5418540857277523 -0.8557390091726823
  0.9586637990406123]
 [1.0 0.0 0.0 -0.18987415967085083 -0.3294746758440457
  -0.26074665105670686 -0.9349930879531894]
 [0.0 0.0 1.0 -1.1437538430832523 -0.04157794329453831
  -0.5757426053533997 -0.2248717553305138]
 [0.0 0.0 1.0 -1.1437538430832523 -1.3371132397673215 1.3142331204267574
  2.1421993534117383]
 [1.0 0.0 0.0 1.100668941416516 0.10622618279114622 0.7892398632656027
  -0.461578866204739]
 [1.0 0.0 0.0 1.100668941416516 1.0740218953348029 -0.61074215583081
  0.9586637990406123]
 [1.0 0.0 0.0 -0.5265375773458161 -0.7253326830996183 -0.5757426053533997
  -0.6

In [128]:
print ("Feature variables in test dataset after standardization")
print (X_test)

Feature variables in test dataset after standardization
[[0.0 1.0 0.0 -1.255974982308241 0.1383575145489038 0.09974871886061941
  -0.461578866204739]
 [0.0 1.0 0.0 0.4273421060665855 0.10622618279114622 0.6492416613559614
  -0.6982859770789642]
 [0.0 0.0 1.0 -0.5826481469583104 1.5058669941590639 0.4392443584914995
  0.48524957729216184]
 [1.0 0.0 0.0 0.034568118779126016 0.24631878925496908 0.4742439089689098
  -1.1717001988274145]
 [0.0 1.0 0.0 -0.15179913029094394 0.06638333141152697
  -0.4357444034437585 -0.9349930879531894]
 [0.0 0.0 1.0 -0.24598472928334505 0.06638333141152697
  -0.5407430548759894 0.011835355543711429]
 [0.0 0.0 1.0 -0.4704270077333219 1.002047712197426 -0.8557390091726823
  0.721956688166387]
 [1.0 0.0 0.0 -0.8070904254082871 1.002047712197426 1.4542313223363987
  -0.461578866204739]
 [0.0 0.0 1.0 -0.9193115646332756 -0.6893455915309299 0.36924525753667886
  2.3789064642859636]
 [1.0 0.0 0.0 -0.5826481469583104 -0.25750049270666886 1.4192317718589884
  -1.17170