In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# Restructure Project Data (one-hot encoding etc.)

### Load in the projects dataframe

In [2]:
projects = pd.read_pickle("../Data/project_data")

In [3]:
projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url,project_id
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[Computers & Technology, Biology, Chemistry]",https://scistarter.com/project/4-Foldit-Foldit,4
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[Nature & Outdoors, Animals, Ecology & Environ...",https://scistarter.com/project/5-Firefly-Watch...,5
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[Astronomy & Space, Computers & Technology]",https://scistarter.com/project/6-Galaxy-Zoo-Ga...,6
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[Ocean, Water, Marine & Terrestrial, Education...",https://scistarter.com/project/7-Natures-Noteb...,7
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[Computers & Technology, Sound, Physics]",https://scistarter.com/project/8-Laser-Harp-Bu...,8


### One-hot Encode Topics

<b>Technically</b> - this isn't called one-hot encoding and so, I'll need to write my own stuff for it

#### Get a unique list of topics

In [4]:
topics = list(projects['topics'])

In [5]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [6]:
flattened_topics = flatten(topics)

In [7]:
len(flattened_topics)

5949

In [8]:
unique_topics = list(set(flattened_topics))

In [9]:
len(unique_topics)

27

In [10]:
print(unique_topics)

['Health & Medicine', 'Agriculture', 'Ocean, Water, Marine & Terrestrial', 'Physics', 'Transportation', 'Science Policy', 'Climate & Weather', 'Insects & Pollinators', 'Education', 'Computers & Technology', 'Birds', 'Nature & Outdoors', 'Ecology & Environment', 'Awards', 'Social Science', 'Crowd Funding', 'Events', 'Sound', 'Psychology', 'Chemistry', 'Animals', 'Food', 'Astronomy & Space', 'Archeology & Cultural', 'Geography', 'Geology & Earth Science', 'Biology']


#### Prepare column names for the dataframe

In [11]:
# Clean the topics names
def clean_names(topics):
    return [topic.replace(' ', '_').replace('&', 'and') for topic in topics]      

In [12]:
cleaned_topics_names = clean_names(unique_topics)

In [13]:
def add_topics_prefix(topics):  
    return ['topics_' + str(topic) for topic in topics] 

In [14]:
topic_column_names = add_topics_prefix(cleaned_topics_names)

In [15]:
topic_column_names

['topics_Health_and_Medicine',
 'topics_Agriculture',
 'topics_Ocean,_Water,_Marine_and_Terrestrial',
 'topics_Physics',
 'topics_Transportation',
 'topics_Science_Policy',
 'topics_Climate_and_Weather',
 'topics_Insects_and_Pollinators',
 'topics_Education',
 'topics_Computers_and_Technology',
 'topics_Birds',
 'topics_Nature_and_Outdoors',
 'topics_Ecology_and_Environment',
 'topics_Awards',
 'topics_Social_Science',
 'topics_Crowd_Funding',
 'topics_Events',
 'topics_Sound',
 'topics_Psychology',
 'topics_Chemistry',
 'topics_Animals',
 'topics_Food',
 'topics_Astronomy_and_Space',
 'topics_Archeology_and_Cultural',
 'topics_Geography',
 'topics_Geology_and_Earth_Science',
 'topics_Biology']

#### Create and populate dataframe

In [16]:
# Add a zero topics column to every row in the dataframe
for topic_column_name in topic_column_names:
    projects[topic_column_name] = 0

In [17]:
projects['topics'] = projects['topics'].apply(clean_names).apply(add_topics_prefix)

In [18]:
projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,...,topics_Sound,topics_Psychology,topics_Chemistry,topics_Animals,topics_Food,topics_Astronomy_and_Space,topics_Archeology_and_Cultural,topics_Geography,topics_Geology_and_Earth_Science,topics_Biology
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[topics_Computers_and_Technology, topics_Biolo...",...,0,0,0,0,0,0,0,0,0,0
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[topics_Nature_and_Outdoors, topics_Animals, t...",...,0,0,0,0,0,0,0,0,0,0
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[topics_Astronomy_and_Space, topics_Computers_...",...,0,0,0,0,0,0,0,0,0,0
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[topics_Ocean,_Water,_Marine_and_Terrestrial, ...",...,0,0,0,0,0,0,0,0,0,0
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[topics_Computers_and_Technology, topics_Sound...",...,0,0,0,0,0,0,0,0,0,0


In [19]:
for index, row in projects.iterrows():
    row_topics = row['topics']
    row[row_topics] = 1
    try:
        projects.iloc[index] = row
    except:
        print(index)

1781
1782


In [20]:
projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,...,topics_Sound,topics_Psychology,topics_Chemistry,topics_Animals,topics_Food,topics_Astronomy_and_Space,topics_Archeology_and_Cultural,topics_Geography,topics_Geology_and_Earth_Science,topics_Biology
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[topics_Computers_and_Technology, topics_Biolo...",...,0,0,1,0,0,0,0,0,0,1
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[topics_Nature_and_Outdoors, topics_Animals, t...",...,0,0,0,1,0,0,0,0,0,1
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[topics_Astronomy_and_Space, topics_Computers_...",...,0,0,0,0,0,1,0,0,0,0
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[topics_Ocean,_Water,_Marine_and_Terrestrial, ...",...,0,0,0,1,0,0,0,0,0,1
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[topics_Computers_and_Technology, topics_Sound...",...,1,0,0,0,0,0,0,0,0,0


### Save the data to a pickle file

In [21]:
projects.to_pickle('project_topics_data') 