<a href="https://colab.research.google.com/github/uday-routhu/week4/blob/master/Abalone_Pre_Preprocessing_Core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abalone Pre-Preprocessing (Core)

* Author: Udayakumar Routhu

# Tasks:

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Allow up to 100 columns to be shown when a wide DataFrame is displayed
pd.set_option('display.max_columns',100)
import missingno
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
# Set pandas as the default output for sklearn
from sklearn import set_config

#Run the set_config function from sklearn to set the default transformation output to pandas.

In [2]:
# Make sklearn transformers return pandas DataFrames from transform()
set_config(transform_output='pandas')

In [3]:
# Load the abalone data from a Google Sheets CSV export URL
df = pd.read_csv('https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users')

# Perform basic EDA on the entire dataframe:

1. Check the data types and convert dtypes, if needed.

In [4]:
# Inspect column dtypes and non-null counts, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


*   Data types look good

2. Check for duplicate rows and address them, if needed.

In [6]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

* Looks like there are no duplicates to drop.

3. Check for null values and impute them if needed. (Impute them in a way that prevents data leakage!)

In [7]:
# Count missing values in each column
df.isna().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

* There are no null values to impute.

4. Check for inconsistent categories and fix them if needed.

In [8]:
# Find the string (object-dtype) columns
data_types = df.dtypes
str_cols = data_types.loc[data_types == 'object'].index
# Report each one: category frequencies (NaN counted too), a spacer, then the raw values
for col in str_cols:
    values = df[col]
    print(f'- {col}:', values.value_counts(dropna=False), "\n\n", values, sep='\n')

- sex:
M    1528
I    1342
F    1307
Name: sex, dtype: int64



0       M
1       M
2       F
3       M
4       I
       ..
4172    F
4173    M
4174    M
4175    F
4176    M
Name: sex, Length: 4177, dtype: object


* There are no inconsistent categories

5. Check for impossible numeric values and fix them, if needed.

In [9]:
# Summary statistics for the numeric columns, rounded to 2 decimals for readability
df.describe().round(2)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.52,0.41,0.14,0.83,0.36,0.18,0.24,9.93
std,0.12,0.1,0.04,0.49,0.22,0.11,0.14,3.22
min,0.08,0.06,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.45,0.35,0.12,0.44,0.19,0.09,0.13,8.0
50%,0.55,0.42,0.14,0.8,0.34,0.17,0.23,9.0
75%,0.62,0.48,0.16,1.15,0.5,0.25,0.33,11.0
max,0.82,0.65,1.13,2.83,1.49,0.76,1.0,29.0


In [10]:
# Keep the unrounded summary and show only the mean and the extremes,
# which are the rows needed to spot impossible values
stats =  df.describe()
stats.loc[['mean','min','max']]

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


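* There are no negative values. Note that `height` has a minimum of 0.0 (see the `min` row above), which is not physically possible for an abalone and may need to be investigated.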

#Separate your data into the feature matrix (X) and the target vector (y)

In [12]:
# Target vector (y) is the ring count; the feature matrix (X) is every other column.
y = df['rings']
X = df.drop(columns=['rings'])

#Train/test split the data. Please use the random number 42 for consistency.


In [14]:
# Split into train and test sets; random_state=42 keeps the split reproducible.
# No test_size is passed, so sklearn's default split proportion is used.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
# Preview the training features
X_train.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
3823,F,0.615,0.455,0.135,1.059,0.4735,0.263,0.274
3956,F,0.515,0.395,0.14,0.686,0.281,0.1255,0.22
3623,M,0.66,0.53,0.175,1.583,0.7395,0.3505,0.405
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
2183,M,0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35


#Create a ColumnTransformer to preprocess the data.

#Create lists of column names for numeric and categorical columns.

In [15]:
# Names of the numeric feature columns, taken from the training set
numeric_cols = X_train.select_dtypes('number').columns
numeric_cols

Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
       'viscera_weight', 'shell_weight'],
      dtype='object')

In [17]:
# Names of the string (object-dtype) feature columns, taken from the training set
categorical_cols = X_train.select_dtypes('object').columns
categorical_cols

Index(['sex'], dtype='object')

Create a StandardScaler for scaling numeric columns.

In [19]:
# PREPROCESSING PIPELINE FOR NUMERIC DATA
# Show which columns this pipeline will be applied to
print("Numeric Columns:", numeric_cols)
# Instantiate the preprocessors: median imputation and standardization
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
# Chain them so that imputation runs before scaling
num_pipe = make_pipeline(impute_median, scaler)
num_pipe

Numeric Columns: Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
       'viscera_weight', 'shell_weight'],
      dtype='object')


In [29]:
# (name, transformer, columns) tuple for the numeric columns, to be passed to ColumnTransformer
num_tuple = ('numeric', num_pipe, numeric_cols)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('standardscaler', StandardScaler())]),
 Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
        'viscera_weight', 'shell_weight'],
       dtype='object'))

Create a OneHotEncoder for one-hot encoding the categorical columns.

In [27]:
# PREPROCESSING PIPELINE FOR ONE-HOT-ENCODED DATA
# Show which columns this pipeline will be applied to
# (categorical_cols already holds X_train's object-dtype column names)
print("OneHotEncoder Columns:", categorical_cols)
# Instantiate the individual preprocessors
# Missing categories are filled with the literal string "NA"
impute_na = SimpleImputer(strategy='constant', fill_value = "NA")
# Dense output; categories not seen during fit are ignored instead of raising an error
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Make pipeline with imputer and encoder (imputation runs first)
ohe_pipe = make_pipeline(impute_na, ohe_encoder)
ohe_pipe

OneHotEncoder Columns: Index(['sex'], dtype='object')


Create a tuple for each transformer with the: name, the transformer object, and the list of columns.

In [28]:
# (name, transformer, columns) tuple for the categorical columns, to be passed to ColumnTransformer
ohe_tuple = ('categorical', ohe_pipe, categorical_cols)
ohe_tuple

('categorical',
 Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='NA', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['sex'], dtype='object'))

Use the tuples to create a ColumnTransformer to preprocess the data.
Make sure to set verbose_feature_names_out to False!

In [30]:
# Combine both pipelines into one preprocessor.
# verbose_feature_names_out=False keeps output column names free of the
# transformer-name prefix (e.g. 'length' rather than 'numeric__length').
col_transformer = ColumnTransformer([num_tuple, ohe_tuple],verbose_feature_names_out=False)

#Fit the ColumnTransformer on your training data.

In [31]:
# Fit on the training data only, so no statistics are learned from the test set
col_transformer.fit(X_train)

#Transform the training and test data and save them as new variables, named appropriately (e.g., X_train_tf/X_train_processed, X_test_tf/X_test_processed).

In [33]:
# Apply the fitted preprocessor to the training features
X_train_processed = col_transformer.transform(X_train)

In [34]:
# Apply the same fitted preprocessor to the test features (transform only, no re-fitting)
X_test_processed = col_transformer.transform(X_test)

#Check the transformed training data:
  * Confirm that all columns are numeric dtypes.
  * Confirm that the original numeric features have been scaled.

In [35]:
# Preview the processed training features: scaled numerics plus one-hot sex_* columns
X_train_processed.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M
3823,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
3956,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.51404,-0.145838,1.0,0.0,0.0
3623,1.127086,1.225326,0.81891,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
0,-0.59398,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
2183,-0.258163,-0.093914,0.35002,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0


In [36]:
# Confirm that every processed training column has a numeric dtype and no nulls
X_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3132 entries, 3823 to 860
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   length          3132 non-null   float64
 1   diameter        3132 non-null   float64
 2   height          3132 non-null   float64
 3   whole_weight    3132 non-null   float64
 4   shucked_weight  3132 non-null   float64
 5   viscera_weight  3132 non-null   float64
 6   shell_weight    3132 non-null   float64
 7   sex_F           3132 non-null   float64
 8   sex_I           3132 non-null   float64
 9   sex_M           3132 non-null   float64
dtypes: float64(10)
memory usage: 269.2 KB


* All of the columns have numeric dtypes.

In [37]:
# Preview the processed test features
X_test_processed.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M
866,0.665336,0.464226,0.467243,0.54801,0.263634,1.096216,0.606609,0.0,0.0,1.0
1483,0.539405,0.312006,0.232798,0.077896,0.111143,0.304812,0.033316,0.0,0.0,1.0
599,0.287541,0.362746,1.2878,0.298707,-0.256629,0.391729,0.678271,1.0,0.0,0.0
1702,0.9172,0.819406,0.701688,0.869559,0.790624,0.775995,1.000748,1.0,0.0,0.0
670,-0.426072,-0.246134,0.115576,-0.441061,-0.57058,-0.67415,-0.181669,0.0,0.0,1.0


* All of the original numeric features have been scaled.

In [38]:
# Confirm that every processed test column has a numeric dtype and no nulls
X_test_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1045 entries, 866 to 2428
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   length          1045 non-null   float64
 1   diameter        1045 non-null   float64
 2   height          1045 non-null   float64
 3   whole_weight    1045 non-null   float64
 4   shucked_weight  1045 non-null   float64
 5   viscera_weight  1045 non-null   float64
 6   shell_weight    1045 non-null   float64
 7   sex_F           1045 non-null   float64
 8   sex_I           1045 non-null   float64
 9   sex_M           1045 non-null   float64
dtypes: float64(10)
memory usage: 89.8 KB
