# Advanced Data Science
<p/>

### Data Preprocessing - Transformation (Categorization)

In [1]:
#Example of suppressing warnings, e.g. the "NumPy version out of range" warning (optional)
#NOTE: category=Warning is the base class of all warning categories, so this filter
#silences EVERY warning (DeprecationWarning, FutureWarning, ...), not only the NumPy one
#The filter is registered before the library imports below - presumably so that
#warnings raised at import time are hidden as well
import warnings
warnings.filterwarnings("ignore", category=Warning)

#Pull in the libraries we need
import numpy as np
import pandas as pd
from pandas import DataFrame

## Kaggle `Datasets`
---
We access the Kaggle Experience at: https://www.kaggle.com/

To get to the pre-loaded data, navigate to the Datasets menu to find:<br>
https://www.kaggle.com/datasets

Our preprocessing experiments today use the `Predict Pet Adoption Status` dataset.

In [2]:
#Step 1: locate the dataset on Kaggle and gain an understanding of its contents,
#then download the .csv file and keep its name in a variable
fileName = "pet_adoption_data.csv"

#Step 2: load the .csv into a dataframe with pandas read_csv and preview the first 10 rows
petData = pd.read_csv(filepath_or_buffer=fileName)
petData.head(n=10)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0
5,505,Dog,Labrador,70,Brown,Large,20.986261,0,0,87,301,1,0
6,506,Bird,Parakeet,169,Brown,Small,10.902613,1,0,70,440,1,0
7,507,Cat,Siamese,13,Orange,Large,7.252683,1,0,3,137,0,1
8,508,Bird,Parakeet,49,Brown,Medium,24.597598,1,1,69,405,0,0
9,509,Bird,Parakeet,60,Gray,Large,7.295994,0,0,73,231,1,0


## Categorization using .get_dummies

In [3]:
#The dataset has four categorical features (PetType, Breed, Color, Size)
#pandas .get_dummies() adds one indicator column per category; only PetType is converted here
petDataConverted = pd.get_dummies(data=petData, columns=['PetType'])
petDataConverted.head(n=5)

Unnamed: 0,PetID,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit
0,500,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0,True,False,False,False
1,501,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0,False,False,False,True
2,502,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0,False,False,True,False
3,503,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0,True,False,False,False
4,504,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0,False,False,False,True


## Categorization using .replace

In [4]:
#Notice that .get_dummies() creates a new column for each category (e.g., PetType_Bird, PetType_Cat ...)
#If you prefer to end up with one column that contains the new codes vs. several, use .replace()
#Set up a deep copy just for demonstration purposes (copies indices and data)
petDataConverted = petData.copy()
#Assign the result back to the column instead of calling .replace(..., inplace=True) on
#petDataConverted['PetType']: the in-place call on a column selection is chained assignment,
#which raises a FutureWarning in pandas 2.x and does not update the dataframe under Copy-on-Write
petDataConverted['PetType'] = petDataConverted['PetType'].replace(['Bird','Cat','Dog','Rabbit'],[0,1,2,3])
petDataConverted.head()

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,0,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,3,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,2,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,0,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,3,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0


## Categorization using .map

In [5]:
#A third option: .map() looks each PetType value up in a dictionary of numeric codes
petTypes = dict(Bird=0, Cat=1, Dog=2, Rabbit=3)
#Work on a deep copy (indices and data) so the original petData stays untouched for the demo
petDataMap = petData.copy()
petDataMap['PetType'] = petDataMap['PetType'].map(petTypes)
petDataMap.head(n=5)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,0,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,3,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,2,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,0,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,3,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0
