In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# One-hot encoding projects tag attributes

### Load in the projects dataframe

In [2]:
projects = pd.read_pickle("../../data/raw/project_data")

In [3]:
projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url,project_id
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[Computers & Technology, Biology, Chemistry]",https://scistarter.com/project/4-Foldit-Foldit,4
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[Nature & Outdoors, Animals, Ecology & Environ...",https://scistarter.com/project/5-Firefly-Watch...,5
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[Astronomy & Space, Computers & Technology]",https://scistarter.com/project/6-Galaxy-Zoo-Ga...,6
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[Ocean, Water, Marine & Terrestrial, Education...",https://scistarter.com/project/7-Natures-Noteb...,7
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[Computers & Technology, Sound, Physics]",https://scistarter.com/project/8-Laser-Harp-Bu...,8


### One-hot encode tags

#### Get unique list of tags

In [4]:
tags = list(projects['tags'])
flatten = lambda l: [item for sublist in l for item in sublist]
flattened_tags = flatten(tags)
print(len(flattened_tags))
unique_tags = list(set(flattened_tags))
print(len(unique_tags))

9899
3860


#### Prepare columns names for the dataframe

In [5]:
# Clean the tags names
def clean_names(tags):
    return [tag.replace(' ', '_').replace('&', 'and') for tag in tags]   

def add_tags_prefix(tags):  
    return ['tags_' + str(tag) for tag in tags] 

In [6]:
tag_column_names = add_tags_prefix(clean_names(unique_tags))

In [7]:
tag_column_names[:6]

['tags_',
 'tags_satellites',
 'tags_after',
 'tags_hydrology_of_watersheds',
 'tags_skiing',
 'tags_bioengineers']

#### Find any rows where the tags value is not of type list

In [8]:
lengths = []
for i in range(len(projects)):
    project = projects.iloc[i]
    try:
        lengths.append(len(project['tags']))
    except:
        print(i)
len(lengths)

1781

#### Find any tags that are dodgy

#### Create new columns and populate them

In [9]:
# Add a zero tags column to every row in the dataframe
for tag_column_name in tag_column_names:
    projects[tag_column_name] = 0

In [10]:
# Clean up the lists of tags inside the tags column of the dataframe
projects['tags'] = projects['tags'].apply(clean_names).apply(add_tags_prefix)

In [11]:
print(len(tag_column_names))

3860


In [12]:
projects.iloc[0][projects.iloc[0]['tags']] = 1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_labels(key, value)


In [13]:
for tag_column_name in tag_column_names:  
    projects[tag_column_name] = tag_column_name in projects['tags']

In [14]:
projects.to_pickle('project_tags_data') 