# One-hot encoding projects tag attributes

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

### Load in the projects dataframe

In [2]:
projects = pd.read_pickle("../../data/raw/project_data")

#### Get unique list of tags

In [3]:
tags = list(projects['tags'])
flatten = lambda l: [item for sublist in l for item in sublist]
flattened_tags = flatten(tags)
unique_tags = list(set(flattened_tags))
print(len(unique_tags))

3860


#### Prepare columns names for the dataframe

In [4]:
# Clean the tags names
def clean_names(tags):
    return [tag.replace(' ', '_').replace('&', 'and') for tag in tags]   

def add_tags_prefix(tags):  
    return ['tags_' + str(tag) for tag in tags] 

In [5]:
tag_column_names = add_tags_prefix(clean_names(unique_tags))

In [6]:
tag_column_names[:6]

['tags_',
 'tags_training',
 'tags_algonquin_provincial_park',
 'tags_native_bees',
 'tags_conservation_biology',
 'tags_floodcrowd']

#### Find any rows where the tags value is not of type list

In [7]:
lengths = []
for i in range(len(projects)):
    project = projects.iloc[i]
    try:
        lengths.append(len(project['tags']))
    except:
        print(i)
len(lengths)

1781

#### Create new columns and populate them

In [8]:
# Add a zero tags column to every row in the dataframe
for tag_column_name in tag_column_names:
    projects[tag_column_name] = 0

In [9]:
# Clean up the lists of tags inside the tags column of the dataframe
projects['tags'] = projects['tags'].apply(clean_names).apply(add_tags_prefix)

In [10]:
print(len(tag_column_names))

3860


In [11]:
projects.iloc[0][projects.iloc[0]['tags']] = 1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)


In [12]:
for tag_column_name in tag_column_names:  
    projects[tag_column_name] = tag_column_name in projects['tags']

In [13]:
projects.to_pickle('../../data/processed/project_tags_data') 