In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the visa dataset from the local CSV file.
visa_df = pd.read_csv(
    r'C:\Users\saina\Documents\DataScience\Datafiles\Visadataset.csv'
)

# Split the column labels by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
cat_columns = visa_df.select_dtypes(include=['object']).columns
num_columns = visa_df.select_dtypes(exclude=['object']).columns
(cat_columns, num_columns)

(Index(['case_id', 'continent', 'education_of_employee', 'has_job_experience',
        'requires_job_training', 'region_of_employment', 'unit_of_wage',
        'full_time_position', 'case_status'],
       dtype='object'),
 Index(['no_of_employees', 'yr_of_estab', 'prevailing_wage'], dtype='object'))

**Encoding**

- Encoding means converting the data in categorical columns to numerical data
- ML models expect the data to be in numerical format
- This is because ML models are built on mathematical algorithms
- So it is very important to apply encoding techniques
- The methods covered here are
    - map
    - np.where
    - one hot encoding
    - label encoder

**map**

- map is one method to convert categorical values to numerical values
- Take one categorical column
- Get the unique labels first
- Make a dictionary by assigning a number to each label
- For example, case_status has two labels
    - Certified
    - Denied
- Assign 0 to Certified and 1 to Denied
- Create a dictionary with the labels as keys and the numbers as values
- {'Certified':0,'Denied':1}

In [2]:
# The manual map workflow has four steps:
#   Step-1: read the column
#   Step-2: get the unique labels
#   Step-3: make a dictionary
#   Step-4: apply the mapping

# Steps 1 and 2: distinct values of the target column, in order of appearance
pd.unique(visa_df['case_status'])

array(['Denied', 'Certified'], dtype=object)

In [3]:
# Step-3: label -> number lookup table
d = dict(Certified=0, Denied=1)

# Step-4: look every row's label up in the table and store the result in a
# new column, leaving the original text column untouched
visa_df['case_status_new'] = visa_df['case_status'].map(d)
visa_df

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,case_status_new
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,Certified,0
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,Denied,1
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,Denied,1
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,Certified,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,Certified,0
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,Certified,0
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,Certified,0
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,Certified,0


In [None]:
# I want to apply the map method to all categorical columns
# Step-1: We need to get the unique labels of the column
# Step-2: We need a list of numbers, one per unique label (0 .. len(unique labels) - 1)
# Step-3: We need to create a dictionary from the labels and the numbers

In [5]:
# Distinct labels of the target column, in order of first appearance.
# The variable keeps the notebook's existing spelling (`unique_lables`)
# because the following cells read it under that name.
unique_lables = visa_df['case_status'].unique()

# One integer code per label: 0 .. n-1.
# The original line read an undefined name (`unique_labels`) and raised
# NameError on a fresh kernel; it now uses the variable assigned above.
list1 = list(range(len(unique_lables)))
unique_lables, list1

(array(['Denied', 'Certified'], dtype=object), [0, 1])

In [6]:
# sorted() returns a new list in ascending (alphabetical) order;
# unique_lables itself is left unchanged by this call.
sorted(unique_lables)

['Certified', 'Denied']

In [13]:
# Distinct labels of the target column, sorted so that the label -> code
# assignment does not depend on the row order of the data.
# The variable keeps the notebook's existing spelling (`unique_lables`)
# because the following cells read it under that name.
unique_lables = sorted(visa_df['case_status'].unique())

# One integer code per label: 0 .. n-1.
# The original line read an undefined name (`unique_labels`) and raised
# NameError on a fresh kernel; it now uses the variable assigned above.
list1 = list(range(len(unique_lables)))
unique_lables, list1

(['Certified', 'Denied'], [0, 1])

In [12]:
# Build the lookup table by hand, one label at a time, starting from an
# empty dictionary.
dict1 = dict()
dict1.update({'Certified': 0})
dict1.update({'Denied': 1})
dict1

{'Certified': 0, 'Denied': 1}

In [15]:
# Pair each label with the code at the same position, then turn the pairs
# into a dictionary (label -> code).
label_code_pairs = zip(unique_lables, list1)
dict1 = dict(label_code_pairs)
dict1

{'Certified': 0, 'Denied': 1}

In [16]:
# The same label -> code dictionary, written as a dict comprehension
{label: code for label, code in zip(unique_lables, list1)}

{'Certified': 0, 'Denied': 1}

In [17]:
# Translate every label through dict1, then overwrite the original column
# with the resulting codes.
encoded_case_status = visa_df['case_status'].map(dict1)
visa_df['case_status'] = encoded_case_status
visa_df

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,case_status_new
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,1,1
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,0,0
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,1,1
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,1,1
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,0,0
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,0,0
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,0,0
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,0,0


In [19]:
# List the categorical columns that will be encoded; the [1:] slice skips
# the first entry of cat_columns.
for col_name in cat_columns[1:]:
    print(col_name)

continent
education_of_employee
has_job_experience
requires_job_training
region_of_employment
unit_of_wage
full_time_position
case_status


In [26]:
# Hand-rolled label encoding for every categorical column except the first
# entry of cat_columns: sort the labels, number them 0 .. n-1, and replace
# each label with its number. The intermediate objects are printed so the
# mapping chosen for each column can be inspected.
for col_name in cat_columns[1:]:
    unique_lables = sorted(visa_df[col_name].unique())
    list1 = list(range(len(unique_lables)))
    print(unique_lables)
    print(list1)
    dict1 = {label: code for label, code in zip(unique_lables, list1)}
    print(dict1)
    visa_df[col_name] = visa_df[col_name].map(dict1)
visa_df

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
[0, 1, 2, 3, 4, 5]
{'Africa': 0, 'Asia': 1, 'Europe': 2, 'North America': 3, 'Oceania': 4, 'South America': 5}
["Bachelor's", 'Doctorate', 'High School', "Master's"]
[0, 1, 2, 3]
{"Bachelor's": 0, 'Doctorate': 1, 'High School': 2, "Master's": 3}
['N', 'Y']
[0, 1]
{'N': 0, 'Y': 1}
['N', 'Y']
[0, 1]
{'N': 0, 'Y': 1}
['Island', 'Midwest', 'Northeast', 'South', 'West']
[0, 1, 2, 3, 4]
{'Island': 0, 'Midwest': 1, 'Northeast': 2, 'South': 3, 'West': 4}
['Hour', 'Month', 'Week', 'Year']
[0, 1, 2, 3]
{'Hour': 0, 'Month': 1, 'Week': 2, 'Year': 3}
['N', 'Y']
[0, 1]
{'N': 0, 'Y': 1}
['Certified', 'Denied']
[0, 1]
{'Certified': 0, 'Denied': 1}


Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,1,2,0,0,14513,2007,4,592.2029,0,1,1
1,EZYV02,1,3,1,0,2412,2002,2,83425.6500,3,1,0
2,EZYV03,1,0,0,1,44444,2008,4,122996.8600,3,1,1
3,EZYV04,1,0,0,0,98,1897,4,83434.0300,3,1,1
4,EZYV05,0,3,1,0,1082,2005,3,149907.3900,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,1,0,1,1,2601,2008,3,77092.5700,3,1,0
25476,EZYV25477,1,2,1,0,3274,2006,2,279174.7900,3,1,0
25477,EZYV25478,1,3,1,0,1121,1910,3,146298.8500,3,0,0
25478,EZYV25479,1,3,1,1,1918,1887,4,86154.7700,3,1,0


**Label Encoder**

- So far we explored the map method inside a for loop
- LabelEncoder does the same job
- It is part of **scikit-learn**, which we usually call **sklearn**
- Under sklearn we have a module called preprocessing
- It is called preprocessing because at this stage we are only preparing the data
- LabelEncoder lives under preprocessing:
- sklearn
    - preprocessing
        - LabelEncoder
- Any sklearn package is used in 3 steps
    - Step-1: Read (import) the package
    - Step-2: Save the package (create an object of the class)
    - Step-3: Apply fit_transform on the data


In [29]:
# Reload the CSV so the encoder works on the values stored in the file
visa_df = pd.read_csv(
    r'C:\Users\saina\Documents\DataScience\Datafiles\Visadataset.csv'
)
cat_columns = visa_df.select_dtypes(include=['object']).columns

# Step-1: import the encoder class from sklearn's preprocessing module
from sklearn.preprocessing import LabelEncoder

# Step-2: create an encoder object
le = LabelEncoder()

# Step-3: learn the labels of the column and replace them with integer
# codes in a single call
case_status_labels = visa_df['case_status']
visa_df['case_status'] = le.fit_transform(case_status_labels)
visa_df

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,1
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,0
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,1
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,1
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,0
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,0
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,0
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,0


In [32]:
from sklearn.preprocessing import LabelEncoder

# Reload the CSV so every categorical column holds the values stored in the file
visa_df = pd.read_csv(
    r'C:\Users\saina\Documents\DataScience\Datafiles\Visadataset.csv'
)
cat_columns = visa_df.select_dtypes(include=['object']).columns

# A single encoder object is reused: each fit_transform call fits it again
# on the current column, so after the loop `le` reflects only the last
# column it was fitted on.
le = LabelEncoder()
for col_name in cat_columns[1:]:
    visa_df[col_name] = le.fit_transform(visa_df[col_name])
visa_df

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,1,2,0,0,14513,2007,4,592.2029,0,1,1
1,EZYV02,1,3,1,0,2412,2002,2,83425.6500,3,1,0
2,EZYV03,1,0,0,1,44444,2008,4,122996.8600,3,1,1
3,EZYV04,1,0,0,0,98,1897,4,83434.0300,3,1,1
4,EZYV05,0,3,1,0,1082,2005,3,149907.3900,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,1,0,1,1,2601,2008,3,77092.5700,3,1,0
25476,EZYV25477,1,2,1,0,3274,2006,2,279174.7900,3,1,0
25477,EZYV25478,1,3,1,0,1121,1910,3,146298.8500,3,0,0
25478,EZYV25479,1,3,1,1,1918,1887,4,86154.7700,3,1,0


**fit and transform**

- **fit** means developing the logic, i.e. learning the mapping from the data
- **transform** means applying that learned logic to the data
- In the map method, we first built the dictionary
- and then we applied that dictionary to the column
- Building the dictionary is a kind of **fit**
- Applying the dictionary to the column is a **transform**: it converts the data from categorical to numerical
- Whenever we want to learn the logic and change the data in one step, use **fit_transform**
- Whenever we only want to learn the logic without changing the data, use **fit** alone (and call **transform** later when the data should change)

**np.where**

- np.where can also be used to change categorical data to numerical data
- But a single np.where handles only a binary (two-way) condition
- np.where works the same way as if-else
- Where the condition is True, the first value is applied
- Where the condition is False, the second value is applied
- So use np.where only for binary labels
- The column should have only two unique labels, like case_status in visa_df

In [35]:
# Binary encoding with np.where:
#   rows whose value is 'Certified' become 0, every other row becomes 1

visa_df = pd.read_csv(
    r'C:\Users\saina\Documents\DataScience\Datafiles\Visadataset.csv'
)
cat_columns = visa_df.select_dtypes(include=['object']).columns

# Boolean mask, True where the status is 'Certified'
cond = visa_df['case_status'].eq('Certified')
visa_df['case_status'] = np.where(cond, 0, 1)

In [36]:
# Display the frame to check the case_status column written by np.where
visa_df

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,1
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,0
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,1
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,1
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,0
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,0
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,0
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,0


**One hot encoder**

- One hot means that when one is ON, the others are OFF
- ON is represented by 1
- OFF is represented by 0
- For example, case_status has two unique labels
    - Certified
    - Denied
- One hot encoding creates new columns, one for each unique label
- For example, case_status will produce two new columns:
    - case_status_Certified
    - case_status_Denied

|case_status|case_status_Certified|case_status_Denied|
|-|-|-|
|Denied|0|1|
|Certified|1|0|
|Denied|0|1|


In [None]:
# NOTE(review): this cell sits under the one-hot encoding notes but its code
# is the np.where binary encoding of case_status; it creates no one-hot
# columns. TODO confirm whether the one-hot step was meant to go here.

# Reload the CSV and collect the names of the object-dtype columns
visa_df = pd.read_csv(r'C:\Users\saina\Documents\DataScience\Datafiles\Visadataset.csv')
cat_columns = visa_df.select_dtypes(include='object').columns
# Boolean mask, True where the status is 'Certified'
cond = visa_df['case_status'] =='Certified'
# 'Certified' rows become 0, every other row becomes 1
visa_df['case_status'] = np.where(cond,0,1)