# Data Preparation 1

### Import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

### Setup dataframe

In [2]:
# Initialize list for dataframe.
# Each record is [Name, Gender, Education, State, Marital Status, Employment Status].
# Fix: Jack's state was 'KI', which is not a USPS abbreviation; the surrounding
# rows run IA, KS, __, LA, so it is corrected to 'KY' (Kentucky).
data = [['Adam', 'male', 'Ph.D', 'NY', 'M', 'PT'], ['Dina', 'female', 'M.S.', 'NY', 'D', 'PT'],
        ['John', 'male', 'B.S.', 'MI', 'S', 'FT'], ['Elton', 'male', 'H.S.', 'AL', 'M', 'Unemployed'],
        ['Gina', 'female', 'H.S.', 'AK', 'D', 'Intern'], ['Sara', 'female', 'M.S.', 'AZ', 'S', 'FT'],
        ['Lana', 'female', 'B.S.', 'CA', 'M', 'Intern'], ['Tim', 'male', 'Ph.D', 'AR', 'D', 'PT'],
        ['Suzan', 'female', 'M.S.', 'AZ', 'S', 'FT'], ['Aj', 'male', 'B.S.', 'CA', 'M', 'FT'],
        ['Luke', 'male', 'H.S.', 'CA', 'D', 'Unemployed'], ['Salma', 'female', 'H.S.', 'CA', 'S', 'Intern'],
        ['Abby', 'female', 'M.S.', 'CA', 'M', 'PT'], ['Sandra', 'female', 'H.S.', 'CO', 'D', 'PT'],
        ['Zen', 'male', 'Ph.D', 'CO', 'S', 'PT'], ['Emma', 'female', 'M.S.', 'DC', 'M', 'FT'],
        ['Owen', 'male', 'B.S.', 'DE', 'D', 'Unemployed'], ['Levi', 'male', 'H.S.', 'CT', 'S', 'Unemployed'],
        ['Eva', 'female', 'H.S.', 'DC', 'M', 'PT'], ['Sophia', 'female', 'M.S.', 'DC', 'D', 'PT'],
        ['Ella', 'female', 'H.S.', 'FL', 'S', 'Unemployed'], ['Jacob', 'male', 'Ph.D', 'IL', 'M', 'FT'],
        ['Mia', 'female', 'M.S.', 'FL', 'D', 'PT'], ['Ethan', 'male', 'B.S.', 'FL', 'S', 'PT'],
        ['Luke', 'male', 'H.S.', 'IL', 'M', 'Intern'], ['Nadia', 'female', 'H.S.', 'IN', 'D', 'Intern'],
        ['Tulip', 'female', 'M.S.', 'IA', 'S', 'PT'], ['Linda', 'female', 'H.S.', 'KS', 'M', 'PT'],
        ['Jack', 'male', 'Ph.D', 'KY', 'D', 'PT'], ['Kindra', 'female', 'M.S.', 'LA', 'S', 'FT'],
        ['Manson', 'male', 'B.S.', 'ME', 'M', 'Unemployed'], ['Logan', 'male', 'H.S.', 'MD', 'D', 'Unemployed'],
        ['Luna', 'female', 'H.S.', 'MA', 'S', 'Unemployed'], ['Layla', 'female', 'M.S.', 'MN', 'M', 'Unemployed'],
        ['Lynn', 'female', 'H.S.', 'MS', 'D', 'PT'], ['Nora', 'male', 'Ph.D', 'MS', 'S', 'FT'],
        ['Suzy', 'female', 'M.S.', 'MO', 'M', 'PT'], ['Lucas', 'male', 'B.S.', 'MT', 'D', 'PT'],
        ['Chad', 'male', 'H.S.', 'TX', 'S', 'Unemployed'], ['Soleen', 'female', 'H.S.', 'TX', 'M', 'Unemployed'],
        ['Zoey', 'female', 'M.S.', 'NY', 'D', 'FT'], ['Ellie', 'female', 'H.S.', 'TX', 'S', 'Unemployed'],
        ['Oliver', 'male', 'Ph.D', 'TX', 'M', 'PT'], ['Tamara', 'female', 'M.S.', 'NJ', 'D', 'PT'],
        ['Noah', 'male', 'B.S.', 'NJ', 'S', 'FT'], ['Liam', 'male', 'H.S.', 'MI', 'M', 'Unemployed'],
        ['Stella', 'female', 'H.S.', 'NM', 'D', 'Unemployed'], ['Audrey', 'female', 'M.S.', 'NC', 'S', 'PT'],
        ['Lucy', 'female', 'H.S.', 'ND', 'M', 'PT'], ['Timmy', 'male', 'Ph.D', 'OH', 'D', 'PT'],
        ['Bella', 'female', 'M.S.', 'OH', 'S', 'PT'], ['William', 'male', 'B.S.', 'OK', 'M', 'FT'],
        ['Brian', 'male', 'H.S.', 'OK', 'D', 'Unemployed'], ['Nova', 'female', 'H.S.', 'OR', 'S', 'Unemployed'],
        ['AnnaAbby', 'female', 'M.S.', 'MI', 'M', 'FT'], ['Elena', 'female', 'H.S.', 'NY', 'D', 'Intern']]

In [3]:
# Build the dataframe, labelling the six fields of every record
column_names = ['Name', 'Gender', 'Education', 'State', 'Marital Status', 'Employment Status']
df = pd.DataFrame(data, columns=column_names)

### Examine data

In [4]:
# Report the dataframe's dimensions as (rows, columns)
df.shape

(56, 6)

In [5]:
# Preview the first few rows to confirm the columns were labelled as intended
df.head()

Unnamed: 0,Name,Gender,Education,State,Marital Status,Employment Status
0,Adam,male,Ph.D,NY,M,PT
1,Dina,female,M.S.,NY,D,PT
2,John,male,B.S.,MI,S,FT
3,Elton,male,H.S.,AL,M,Unemployed
4,Gina,female,H.S.,AK,D,Intern


In [6]:
# Keep an independent snapshot of the raw dataframe before any column is dropped or encoded
df_orig = df.copy(deep=True)

### Prepare data

In [7]:
# Drop unnecessary columns.
# Built from the saved original rather than dropped in place, so re-running this
# cell does not raise KeyError on an already-removed 'Name' column.
df = df_orig.drop(columns=['Name'])
df.head()

Unnamed: 0,Gender,Education,State,Marital Status,Employment Status
0,male,Ph.D,NY,M,PT
1,female,M.S.,NY,D,PT
2,male,B.S.,MI,S,FT
3,male,H.S.,AL,M,Unemployed
4,female,H.S.,AK,D,Intern


### Ordinal encoder

In [8]:
# Distinct Education levels, in order of first appearance
pd.unique(df['Education'])

array(['Ph.D', 'M.S.', 'B.S.', 'H.S.'], dtype=object)

In [9]:
# Encode Education column.
# Categories are listed from lowest to highest level, so the encoder assigns
# H.S. -> 0, B.S. -> 1, M.S. -> 2, Ph.D -> 3.
education_order = ['H.S.', 'B.S.', 'M.S.', 'Ph.D']
oe = OrdinalEncoder(categories=[education_order])
# Fit on the untouched copy: df['Education'] is overwritten below, so encoding it
# directly would fail on a re-run (the numeric codes are not in `education_order`).
df['Education'] = oe.fit_transform(df_orig[['Education']])
df.head()

Unnamed: 0,Gender,Education,State,Marital Status,Employment Status
0,male,3.0,NY,M,PT
1,female,2.0,NY,D,PT
2,male,1.0,MI,S,FT
3,male,0.0,AL,M,Unemployed
4,female,0.0,AK,D,Intern


### One-hot encoding (dummy variables)

In [10]:
# Distinct Gender values, in order of first appearance
pd.unique(df['Gender'])

array(['male', 'female'], dtype=object)

In [11]:
# Generate encodings for Gender: one indicator column per distinct value.
# dtype is pinned to uint8 (the historical default) so the indicators stay 0/1;
# pandas >= 2.0 would otherwise emit True/False booleans.
df_ohe1 = pd.get_dummies(df['Gender'], dtype='uint8')
df_ohe1.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1
3,0,1
4,1,0


In [12]:
# Attach the Gender indicator columns to the main dataframe, matching rows by index
df2 = df.join(df_ohe1, how='left')
df2.head()

Unnamed: 0,Gender,Education,State,Marital Status,Employment Status,female,male
0,male,3.0,NY,M,PT,0,1
1,female,2.0,NY,D,PT,1,0
2,male,1.0,MI,S,FT,0,1
3,male,0.0,AL,M,Unemployed,0,1
4,female,0.0,AK,D,Intern,1,0


In [13]:
# Drop Gender column since it is now encoded.
# Rebinding (instead of inplace=True) with errors='ignore' makes the cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
df2 = df2.drop(columns=['Gender'], errors='ignore')
df2.head()

Unnamed: 0,Education,State,Marital Status,Employment Status,female,male
0,3.0,NY,M,PT,0,1
1,2.0,NY,D,PT,1,0
2,1.0,MI,S,FT,0,1
3,0.0,AL,M,Unemployed,0,1
4,0.0,AK,D,Intern,1,0


In [14]:
# Distinct Marital Status codes, in order of first appearance
pd.unique(df2['Marital Status'])

array(['M', 'D', 'S'], dtype=object)

In [15]:
# More efficient way to perform one-hot encoding: get_dummies replaces the listed
# column with prefixed indicator columns in a single call (no join/drop needed).
# dtype is pinned to uint8 (the historical default) so the indicators stay 0/1;
# pandas >= 2.0 would otherwise emit True/False booleans.
df2 = pd.get_dummies(df2, prefix=['Marital Status'], columns=['Marital Status'], dtype='uint8')
df2.head()

Unnamed: 0,Education,State,Employment Status,female,male,Marital Status_D,Marital Status_M,Marital Status_S
0,3.0,NY,PT,0,1,0,1,0
1,2.0,NY,PT,1,0,1,0,0
2,1.0,MI,FT,0,1,0,0,1
3,0.0,AL,Unemployed,0,1,0,1,0
4,0.0,AK,Intern,1,0,1,0,0


### Label encoding

In [16]:
# Replace each Employment Status label with an integer code
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels;
# confirm Employment Status is the target, otherwise consider one-hot encoding it.
le = LabelEncoder()
employment_codes = le.fit_transform(df2['Employment Status'])
df2['Employment Status'] = employment_codes
df2.head()

Unnamed: 0,Education,State,Employment Status,female,male,Marital Status_D,Marital Status_M,Marital Status_S
0,3.0,NY,2,0,1,0,1,0
1,2.0,NY,2,1,0,1,0,0
2,1.0,MI,0,0,1,0,0,1
3,0.0,AL,3,0,1,0,1,0
4,0.0,AK,1,1,0,1,0,0


#### Note: The State column is still an unencoded string column; we can either drop it or one-hot encode it

In [17]:
# Compare with original dataframe: show the snapshot saved before any column was dropped or encoded
df_orig.head()

Unnamed: 0,Name,Gender,Education,State,Marital Status,Employment Status
0,Adam,male,Ph.D,NY,M,PT
1,Dina,female,M.S.,NY,D,PT
2,John,male,B.S.,MI,S,FT
3,Elton,male,H.S.,AL,M,Unemployed
4,Gina,female,H.S.,AK,D,Intern
