# Recoding Data in Python

[Pandas Docs](https://pandas.pydata.org/docs/index.html) 

[Pandas Get Dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

Computers prefer numbers to strings and characters.

The goal of recoding is often to produce a transformed dataset which is wholly numeric and does not contain missing data.

Data can be recoded in various ways.

Common mistakes when recoding include:
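- Choosing an inappropriate recode for the variable (for example, giving ordered integer codes to categories that have no natural order)
- Forgetting to either change the data in place or assign the recoded result back to a dataframe
- Not mapping every value that occurs in the column, so unmapped values are left unchanged or become missing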

In [1]:
# Import required packages
import pandas as pd

In [2]:
# read in csv data
df = pd.read_csv('../../Data/glassdoor.csv')

In [3]:
# View data head
df.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type
0,Chief Marketing Officer (CMO),National Debt Relief,NY,New York,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,Rego Park,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,6/7/2020,FULL_TIME
2,Dental Hygienist,Batista Dental,NJ,West New York,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,6/7/2020,PART_TIME
3,Senior Salesforce Developer,National Debt Relief,NY,New York,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,New York,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,6/7/2020,FULL_TIME


# Recoding into a New Column

In [36]:
# Get value counts for State
df.State.value_counts(dropna=False)

NY    750
NJ    150
Name: State, dtype: int64

In [5]:
# Create function to recode State
def state(series):
    """Recode a single State value to an integer code.

    Despite the parameter name, the function is called with one element at
    a time when it is passed to ``Series.apply``.

    Returns 0 for "NY", 1 for "NJ", and None for any other value.
    """
    # The position of each label in the tuple is its integer code.
    for code, label in enumerate(("NY", "NJ")):
        if series == label:
            return code
    return None

In [6]:
# Apply function to recode state
df['StateR'] = df['State'].apply(state)

In [7]:
# View updated dataframe
df.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type,StateR
0,Chief Marketing Officer (CMO),National Debt Relief,NY,New York,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,Rego Park,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,6/7/2020,FULL_TIME,0
2,Dental Hygienist,Batista Dental,NJ,West New York,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,6/7/2020,PART_TIME,1
3,Senior Salesforce Developer,National Debt Relief,NY,New York,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,New York,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,6/7/2020,FULL_TIME,0


In [8]:
# Get value counts for Job Type
df.Job_Type.value_counts(dropna=False)

FULL_TIME    755
PART_TIME    120
OTHER         25
Name: Job_Type, dtype: int64

In [9]:
# Recode FULL_TIME as 1, all others as 0 (int() turns the True/False result
# of the comparison into 1/0)
df['FullTime'] = df['Job_Type'].apply(lambda job_type: int(job_type == "FULL_TIME"))

In [10]:
# View updated dataframe
df.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type,StateR,FullTime
0,Chief Marketing Officer (CMO),National Debt Relief,NY,New York,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0,1
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,Rego Park,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,6/7/2020,FULL_TIME,0,1
2,Dental Hygienist,Batista Dental,NJ,West New York,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,6/7/2020,PART_TIME,1,0
3,Senior Salesforce Developer,National Debt Relief,NY,New York,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0,1
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,New York,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,6/7/2020,FULL_TIME,0,1


# Recoding inplace

In [37]:
# Create dictionary to replace values in City.
# The outer "City" key names the column to recode; each city in the list
# below is mapped to its position in the list (first city -> 0, last -> 13).
cleanup = {
    "City": {
        name: code
        for code, name in enumerate([
            "New York",
            "Rego Park",
            "Staten Island",
            "Brooklyn",
            "Mamaroneck",
            "Lynbrook",
            "West New York",
            "Fort Lee",
            "Williston Park",
            "Maspeth",
            "Jersey City",
            "West Orange",
            "Bronx",
            "Paramus",
        ])
    }
}

In [38]:
# Apply dictionary replace. Because cleanup is a nested dict, the outer
# "City" key restricts the replacement to that column; inplace=True modifies
# df itself rather than returning a new dataframe
df.replace(cleanup, inplace=True)

In [12]:
# View updated dataframe
df.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type,StateR,FullTime
0,Chief Marketing Officer (CMO),National Debt Relief,NY,0,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0,1
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,1,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,6/7/2020,FULL_TIME,0,1
2,Dental Hygienist,Batista Dental,NJ,6,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,6/7/2020,PART_TIME,1,0
3,Senior Salesforce Developer,National Debt Relief,NY,0,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,6/7/2020,FULL_TIME,0,1
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,0,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,6/7/2020,FULL_TIME,0,1


# Dummy Coding Variables

In [13]:
# Get unique values from Industry
df.Industry.unique()

array(['Finance', nan, 'Health Care',
       'Construction, Repair & Maintenance', 'Information Technology',
       'Telecommunications', 'Biotech & Pharmaceuticals',
       'Business Services', 'Retail'], dtype=object)

In [39]:
# Dummy code Industry (creates a new df).
# drop_first=True omits the first category so it acts as the reference level.
# dtype="uint8" keeps the dummy columns numeric 0/1: pandas >= 2.0 would
# otherwise return True/False columns by default.
# Rows with a missing Industry get 0 in every dummy column (dummy_na is left
# at its default of False), the same pattern as the dropped reference level.
IndustryDummy = pd.get_dummies(df['Industry'], drop_first=True, dtype="uint8")

In [15]:
# View new dataframe
IndustryDummy.head()

Unnamed: 0,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications
0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0


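Notice that 'Biotech & Pharmaceuticals' is dropped from the returned dataframe: with `drop_first=True` the first category becomes the reference level and is represented by 0 in every dummy column. Rows with a missing Industry (such as rows 1, 2 and 4 above) are also 0 in every column, so they cannot be told apart from the reference level in this coding.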

In [40]:
# Merge original and dummy coded dfs into new df; axis=1 places the dummy
# columns to the right of the original columns, matching rows by index
df1 = pd.concat([df, IndustryDummy],axis=1)

In [17]:
# View merged df
df1.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,...,Job_Type,StateR,FullTime,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications
0,Chief Marketing Officer (CMO),National Debt Relief,NY,0,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,...,FULL_TIME,0,1,0,0,1,0,0,0,0
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,1,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,...,FULL_TIME,0,1,0,0,0,0,0,0,0
2,Dental Hygienist,Batista Dental,NJ,6,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,...,PART_TIME,1,0,0,0,0,0,0,0,0
3,Senior Salesforce Developer,National Debt Relief,NY,0,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,...,FULL_TIME,0,1,0,0,1,0,0,0,0
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,0,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,...,FULL_TIME,0,1,0,0,0,0,0,0,0


## Continuous to Categorical with the Max Salary Variable

In [18]:
# Create function to recode salary
def salary(series):
    """Recode one Max_Salary value into a low/high indicator.

    Despite the parameter name, the function is called with one element at
    a time when it is passed to ``Series.apply``.

    Returns 0 when the value is below 30000 and 1 when it is 60000 or more.
    Values from 30000 up to (but not including) 60000 match neither test,
    so None is returned for them.

    NOTE(review): Max_Salary contains -1 in the data shown (it looks like a
    placeholder for a missing salary); those rows are coded 0 here. Confirm
    that this is intended.
    """
    if series < 30000:
        return 0
    # Anything that is neither low nor high is deliberately left unmapped.
    return 1 if series >= 60000 else None

In [19]:
# Apply function to each Max_Salary value to create a new column;
# values for which salary() returns None appear as NaN in Max_SalaryR
df1['Max_SalaryR'] = df1['Max_Salary'].apply(salary)

In [20]:
# View updated df
df1.head()

Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,...,StateR,FullTime,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications,Max_SalaryR
0,Chief Marketing Officer (CMO),National Debt Relief,NY,0,-1,-1,Who We're Looking For:\n\nThe Chief Marketing ...,Finance,4.0,5/8/2020,...,0,1,0,0,1,0,0,0,0,0.0
1,Registered Nurse,Queens Boulevard Endoscopy Center,NY,1,-1,-1,"Queens Boulevard Endoscopy Center, an endoscop...",,3.0,4/25/2020,...,0,1,0,0,0,0,0,0,0,0.0
2,Dental Hygienist,Batista Dental,NJ,6,-1,-1,Part-time or Full-timedental hygienist positio...,,,5/2/2020,...,1,0,0,0,0,0,0,0,0,0.0
3,Senior Salesforce Developer,National Debt Relief,NY,0,44587,82162,Principle Duties & Responsibilities:\n\nAnalyz...,Finance,4.0,5/8/2020,...,0,1,0,0,1,0,0,0,0,1.0
4,"DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...",National Advocates for Pregnant Women,NY,0,125410,212901,"For FULL Job Announcement, visit our website: ...",,,4/28/2020,...,0,1,0,0,0,0,0,0,0,1.0


In [21]:
# Get value counts for Max Salary recoded 
df1.Max_SalaryR.value_counts(dropna=False)

0.0    445
1.0    335
NaN    120
Name: Max_SalaryR, dtype: int64

# Filtering to columns of interest

In [22]:
# View list of columns of df
df1.columns

Index(['Job_title', 'Company', 'State', 'City', 'Min_Salary', 'Max_Salary',
       'Job_Desc', 'Industry', 'Rating', 'Date_Posted', 'Valid_until',
       'Job_Type', 'StateR', 'FullTime', 'Business Services',
       'Construction, Repair & Maintenance', 'Finance', 'Health Care',
       'Information Technology', 'Retail', 'Telecommunications',
       'Max_SalaryR'],
      dtype='object')

In [23]:
# Filter only the columns we want to keep
df2 = df1.filter(['City', 'Min_Salary', 'Max_Salary', 'Rating', 'StateR', 'Business Services',
       'Construction, Repair & Maintenance', 'Finance', 'Health Care',
       'Information Technology', 'Retail', 'Telecommunications', 'FullTime',
       'Max_SalaryR'])

In [41]:
# View new df
df2

Unnamed: 0,City,Min_Salary,Max_Salary,Rating,StateR,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications,FullTime,Max_SalaryR
0,0,-1,-1,4.000000,0,0,0,1,0,0,0,0,1,0.0
1,1,-1,-1,3.000000,0,0,0,0,0,0,0,0,1,0.0
2,6,-1,-1,3.922727,1,0,0,0,0,0,0,0,0,0.0
3,0,44587,82162,4.000000,0,0,0,1,0,0,0,0,1,1.0
4,0,125410,212901,3.922727,0,0,0,0,0,0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,10,48000,75000,4.500000,1,0,0,0,1,0,0,0,1,1.0
896,1,55069,74745,3.000000,0,0,0,0,0,0,0,0,1,1.0
897,12,-1,-1,3.922727,0,0,0,0,0,0,0,0,1,0.0
898,3,-1,-1,4.000000,0,0,0,0,1,0,0,0,1,0.0


# Working with Missing Data

In [25]:
# Check shape of current data
df2.shape

(900, 14)

In [26]:
# Create new df with all rows with a nan dropped
df3 = df2.dropna()

In [27]:
# Compare shape before and after dropping data
df3.shape

(570, 14)

In [28]:
# Identify where null values are found in data
df2.isnull().sum()

City                                    0
Min_Salary                              0
Max_Salary                              0
Rating                                240
StateR                                  0
Business Services                       0
Construction, Repair & Maintenance      0
Finance                                 0
Health Care                             0
Information Technology                  0
Retail                                  0
Telecommunications                      0
FullTime                                0
Max_SalaryR                           120
dtype: int64

In [30]:
# Fill missing Max Salary Recode values with 0's and assign the result back
# to the column. Calling fillna(..., inplace=True) on df2['Max_SalaryR'] acts
# on a temporary Series: recent pandas warns about this chained assignment,
# and with Copy-on-Write enabled df2 would be left unchanged.
df2['Max_SalaryR'] = df2['Max_SalaryR'].fillna(0)

In [31]:
# Verify that nans have been filled for max salary recoded
df2.isnull().sum()

City                                    0
Min_Salary                              0
Max_Salary                              0
Rating                                240
StateR                                  0
Business Services                       0
Construction, Repair & Maintenance      0
Finance                                 0
Health Care                             0
Information Technology                  0
Retail                                  0
Telecommunications                      0
FullTime                                0
Max_SalaryR                             0
dtype: int64

In [32]:
# Fill missing Rating values with the mean of the Rating column (mean()
# skips NaN by default) and assign the result back to the column. Calling
# fillna(..., inplace=True) on df2['Rating'] acts on a temporary Series:
# recent pandas warns about this chained assignment, and with Copy-on-Write
# enabled df2 would be left unchanged.
df2['Rating'] = df2['Rating'].fillna(df2['Rating'].mean())

In [33]:
# Verify that all nans have been removed from data
df2.isnull().sum()

City                                  0
Min_Salary                            0
Max_Salary                            0
Rating                                0
StateR                                0
Business Services                     0
Construction, Repair & Maintenance    0
Finance                               0
Health Care                           0
Information Technology                0
Retail                                0
Telecommunications                    0
FullTime                              0
Max_SalaryR                           0
dtype: int64

In [42]:
# View cleaned data
df2.head()

Unnamed: 0,City,Min_Salary,Max_Salary,Rating,StateR,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications,FullTime,Max_SalaryR
0,0,-1,-1,4.0,0,0,0,1,0,0,0,0,1,0.0
1,1,-1,-1,3.0,0,0,0,0,0,0,0,0,1,0.0
2,6,-1,-1,3.922727,1,0,0,0,0,0,0,0,0,0.0
3,0,44587,82162,4.0,0,0,0,1,0,0,0,0,1,1.0
4,0,125410,212901,3.922727,0,0,0,0,0,0,0,0,1,1.0


In [43]:
# View tail of cleaned data
df2.tail()

Unnamed: 0,City,Min_Salary,Max_Salary,Rating,StateR,Business Services,"Construction, Repair & Maintenance",Finance,Health Care,Information Technology,Retail,Telecommunications,FullTime,Max_SalaryR
895,10,48000,75000,4.5,1,0,0,0,1,0,0,0,1,1.0
896,1,55069,74745,3.0,0,0,0,0,0,0,0,0,1,1.0
897,12,-1,-1,3.922727,0,0,0,0,0,0,0,0,1,0.0
898,3,-1,-1,4.0,0,0,0,0,1,0,0,0,1,0.0
899,4,21402,52210,3.9,0,0,0,0,0,0,1,0,1,0.0


# Congratulations! The data is now ready for analysis!