# Encode

In [883]:
import pandas as pd
import numpy as np

In [884]:
df = pd.read_csv("data/Mock-Demographics.csv")
df.head()

Unnamed: 0,Name,Age,Job,Hobbies
0,John Doe,1994,Marketing,"football, tennis"
1,Jane Doe,1988,Sales,Art
2,Mary,1990,Data science,Reading
3,John Doe,1990,Marketing,"football, tennis"
4,Jane Doe,1990,Sales,Art


**Drop name because we don't need it**

In [885]:
df = df.drop("Name", axis=1)

**Convert all strings to lowercase**

In [886]:
#df["Name"] = df["Name"].str.lower()
df["Job"] = df["Job"].str.lower()
df["Hobbies"] = df["Hobbies"].str.lower()

df.head()

Unnamed: 0,Age,Job,Hobbies
0,1994,marketing,"football, tennis"
1,1988,sales,art
2,1990,data science,reading
3,1990,marketing,"football, tennis"
4,1990,sales,art


**Expand Hobbies into 2 columns**

In [887]:
df[["Hobby1", "Hobby2"]] = df.Hobbies.str.split(", ", expand=True,)
df = df.drop("Hobbies", axis=1)

### Create dictionaries of name to int

**Encoder decoder**

This object keeps track of the position of each column in the vector and its length

In [888]:
encoder_decoder = {}

**Age**

In [889]:
ages = df['Age'].value_counts().index
print(ages)

Int64Index([1990, 1994, 1988], dtype='int64')


In [890]:
age_2_int = {age: i for i,age in enumerate(ages, start=1)}
int_2_age = {v: k for k, v in age_2_int.items()}

print(age_2_int)

{1990: 1, 1994: 2, 1988: 3}


In [891]:
encoder_decoder["Age"] = {
    "length": len(age_2_int),
    "position": df.columns.get_loc("Age"),
    "int_2_name": int_2_age
}

print(encoder_decoder)

{'Age': {'length': 3, 'position': 0, 'int_2_name': {1: 1990, 2: 1994, 3: 1988}}}


**Job**

In [892]:
jobs = df['Job'].value_counts().index
print(jobs)

Index(['sales', 'marketing', 'data science'], dtype='object')


In [893]:
job_2_int = {job: i for i,job in enumerate(jobs, start=1)}
int_2_job = {v: k for k, v in job_2_int.items()}

print(job_2_int)

{'sales': 1, 'marketing': 2, 'data science': 3}


In [894]:
encoder_decoder["Job"] = {
    "length": len(job_2_int),
    "position": df.columns.get_loc("Job"),
    "int_2_name": int_2_job
}

print(encoder_decoder)

{'Age': {'length': 3, 'position': 0, 'int_2_name': {1: 1990, 2: 1994, 3: 1988}}, 'Job': {'length': 3, 'position': 1, 'int_2_name': {1: 'sales', 2: 'marketing', 3: 'data science'}}}


**Hobbies**

In [895]:
hobbies1 = df['Hobby1'].value_counts().index
hobbies2 = df['Hobby2'].value_counts().index

all_hobbies = hobbies1.append(hobbies2)

print(all_hobbies)

Index(['football', 'art', 'reading', 'tennis'], dtype='object')


In [896]:
hobby_2_int = {hobby: i for i,hobby in enumerate(all_hobbies, start=1)}
int_2_hobby = {v: k for k, v in hobby_2_int.items()}

print(hobby_2_int)

{'football': 1, 'art': 2, 'reading': 3, 'tennis': 4}


In [897]:
encoder_decoder["Hobby1"] = {
    "length": len(int_2_hobby),
    "position": df.columns.get_loc("Hobby1"),
    "int_2_name": int_2_hobby
}

print(encoder_decoder)

{'Age': {'length': 3, 'position': 0, 'int_2_name': {1: 1990, 2: 1994, 3: 1988}}, 'Job': {'length': 3, 'position': 1, 'int_2_name': {1: 'sales', 2: 'marketing', 3: 'data science'}}, 'Hobby1': {'length': 4, 'position': 2, 'int_2_name': {1: 'football', 2: 'art', 3: 'reading', 4: 'tennis'}}}


In [898]:
encoder_decoder["Hobby2"] = {
    "length": len(int_2_hobby),
    "position": df.columns.get_loc("Hobby2"),
    "int_2_name": int_2_hobby
}

print(encoder_decoder)

{'Age': {'length': 3, 'position': 0, 'int_2_name': {1: 1990, 2: 1994, 3: 1988}}, 'Job': {'length': 3, 'position': 1, 'int_2_name': {1: 'sales', 2: 'marketing', 3: 'data science'}}, 'Hobby1': {'length': 4, 'position': 2, 'int_2_name': {1: 'football', 2: 'art', 3: 'reading', 4: 'tennis'}}, 'Hobby2': {'length': 4, 'position': 3, 'int_2_name': {1: 'football', 2: 'art', 3: 'reading', 4: 'tennis'}}}


### Encode the columns

In [899]:
encoded_items = {
    "Age" : age_2_int,
    "Job" : job_2_int,
    "Hobby1" : hobby_2_int,
    "Hobby2" : hobby_2_int
}

print(encoded_items)

{'Age': {1990: 1, 1994: 2, 1988: 3}, 'Job': {'sales': 1, 'marketing': 2, 'data science': 3}, 'Hobby1': {'football': 1, 'art': 2, 'reading': 3, 'tennis': 4}, 'Hobby2': {'football': 1, 'art': 2, 'reading': 3, 'tennis': 4}}


In [900]:
df = df.replace(encoded_items)

In [901]:
df.head()

Unnamed: 0,Age,Job,Hobby1,Hobby2
0,2,2,1,4.0
1,3,1,2,
2,1,3,3,
3,1,2,1,4.0
4,1,1,2,


**Fill NaN with 0's**

In [902]:
df = df.fillna(0)
print(df.astype(int).dtypes)
df.head()

Age       int64
Job       int64
Hobby1    int64
Hobby2    int64
dtype: object


Unnamed: 0,Age,Job,Hobby1,Hobby2
0,2,2,1,4.0
1,3,1,2,0.0
2,1,3,3,0.0
3,1,2,1,4.0
4,1,1,2,0.0


**One hot encoding**

In [903]:
"""
age_dummies = pd.get_dummies(df["Age"], prefix="Age_")

job_dummies = pd.get_dummies(df["Job"], prefix="Job_")

hobby1_dummies = pd.get_dummies(df["Hobby1"], prefix="Hobby1_")

hobby2_dummies = pd.get_dummies(df["Hobby2"], prefix="Hobby2_")

df = pd.concat([df, age_dummies,job_dummies, hobby1_dummies,hobby2_dummies], axis=1)


## Drop the original columns
df = df.drop(["Age", "Job", "Hobby1", "Hobby2"], axis=1)

df.head()
"""

'\nage_dummies = pd.get_dummies(df["Age"], prefix="Age_")\n\njob_dummies = pd.get_dummies(df["Job"], prefix="Job_")\n\nhobby1_dummies = pd.get_dummies(df["Hobby1"], prefix="Hobby1_")\n\nhobby2_dummies = pd.get_dummies(df["Hobby2"], prefix="Hobby2_")\n\ndf = pd.concat([df, age_dummies,job_dummies, hobby1_dummies,hobby2_dummies], axis=1)\n\n\n## Drop the original columns\ndf = df.drop(["Age", "Job", "Hobby1", "Hobby2"], axis=1)\n\ndf.head()\n'

In [904]:
numpy_data = df.to_numpy(dtype=int)

"

# Decode

Lets decode 1 element

In [905]:
vector = numpy_data[0]

In [906]:
vector

array([2, 2, 1, 4])

**Decode**

In [907]:
pd.DataFrame(columns = ["Age", "Job", "Hobby1", "Hobby2"])

Unnamed: 0,Age,Job,Hobby1,Hobby2


In [908]:
age_encoded = vector[encoder_decoder["Age"]["position"]]
age_decoded = encoder_decoder["Age"]["int_2_name"][age_encoded]

In [909]:
job_encoded = vector[encoder_decoder["Job"]["position"]]
job_decoded = encoder_decoder["Job"]["int_2_name"][job_encoded]

In [910]:
hobby1_encoded = vector[encoder_decoder["Hobby1"]["position"]]
hobby1_decoded = encoder_decoder["Hobby1"]["int_2_name"][hobby1_encoded]

In [911]:
hobby2_encoded = vector[encoder_decoder["Hobby2"]["position"]]
hobby2_decoded = encoder_decoder["Hobby2"]["int_2_name"][hobby2_encoded]

In [912]:
[age_decoded, job_decoded, hobby1_decoded, hobby2_decoded]

[1994, 'marketing', 'football', 'tennis']