# One-Hot Encode the Topics Attributes

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

### Load in the projects dataframe

In [2]:
projects = pd.read_pickle("../../data/raw/project_data")

### One-hot Encode Topics

<b>Technically</b> - this isn't called one-hot encoding and so, I'll need to write my own stuff for it

#### Get a unique list of topics

In [3]:
topics = list(projects['topics'])
flatten = lambda l: [item for sublist in l for item in sublist]
flattened_topics = flatten(topics)
unique_topics = list(set(flattened_topics))
len(unique_topics)

27

In [4]:
print(unique_topics)

['Ecology & Environment', 'Geology & Earth Science', 'Chemistry', 'Sound', 'Insects & Pollinators', 'Social Science', 'Nature & Outdoors', 'Events', 'Education', 'Transportation', 'Biology', 'Climate & Weather', 'Awards', 'Geography', 'Computers & Technology', 'Agriculture', 'Science Policy', 'Animals', 'Astronomy & Space', 'Birds', 'Physics', 'Archeology & Cultural', 'Food', 'Health & Medicine', 'Ocean, Water, Marine & Terrestrial', 'Psychology', 'Crowd Funding']


#### Prepare column names for the dataframe

In [5]:
# Clean the topics names
def clean_names(topics):
    return [topic.replace(' ', '_').replace('&', 'and') for topic in topics]    

def add_topics_prefix(topics):  
    return ['topics_' + str(topic) for topic in topics] 

In [6]:
cleaned_topics_names = clean_names(unique_topics)
topic_column_names = add_topics_prefix(cleaned_topics_names)

In [7]:
topic_column_names

['topics_Ecology_and_Environment',
 'topics_Geology_and_Earth_Science',
 'topics_Chemistry',
 'topics_Sound',
 'topics_Insects_and_Pollinators',
 'topics_Social_Science',
 'topics_Nature_and_Outdoors',
 'topics_Events',
 'topics_Education',
 'topics_Transportation',
 'topics_Biology',
 'topics_Climate_and_Weather',
 'topics_Awards',
 'topics_Geography',
 'topics_Computers_and_Technology',
 'topics_Agriculture',
 'topics_Science_Policy',
 'topics_Animals',
 'topics_Astronomy_and_Space',
 'topics_Birds',
 'topics_Physics',
 'topics_Archeology_and_Cultural',
 'topics_Food',
 'topics_Health_and_Medicine',
 'topics_Ocean,_Water,_Marine_and_Terrestrial',
 'topics_Psychology',
 'topics_Crowd_Funding']

#### Create and populate dataframe

In [8]:
# Add a zero topics column to every row in the dataframe
for topic_column_name in topic_column_names:
    projects[topic_column_name] = 0

In [9]:
projects['topics'] = projects['topics'].apply(clean_names).apply(add_topics_prefix)

In [10]:
for index, row in projects.iterrows():
    row_topics = row['topics']
    row[row_topics] = 1
    try:
        projects.iloc[index] = row
    except:
        print(index)

1781
1782


In [11]:
projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,...,topics_Animals,topics_Astronomy_and_Space,topics_Birds,topics_Physics,topics_Archeology_and_Cultural,topics_Food,topics_Health_and_Medicine,"topics_Ocean,_Water,_Marine_and_Terrestrial",topics_Psychology,topics_Crowd_Funding
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[topics_Computers_and_Technology, topics_Biolo...",...,0,0,0,0,0,0,0,0,0,0
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[topics_Nature_and_Outdoors, topics_Animals, t...",...,1,0,0,0,0,0,0,0,0,0
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[topics_Astronomy_and_Space, topics_Computers_...",...,0,1,0,0,0,0,0,0,0,0
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[topics_Ocean,_Water,_Marine_and_Terrestrial, ...",...,1,0,1,0,0,0,0,1,0,0
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[topics_Computers_and_Technology, topics_Sound...",...,0,0,0,1,0,0,0,0,0,0


### Save the data to a pickle file

In [12]:
projects.to_pickle('../../data/processed/project_topics_data') 