In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize

#### Import all the projects json

In [2]:
raw_folder = '../../data/raw/'

In [3]:
projects = ['projectdata2019-05-28-21-53-29.txt', 'projectdata2019-05-29-20-32-05.txt', 'projectdata2019-05-30-18-49-03.txt', 'projectdata2019-06-02-13-27-25.txt']

In [4]:
projects_data = []
for project in projects:
    with open(raw_folder + project) as json_file: 
            project_data = json.load(json_file)
            projects_data.append(project_data['projects'])

In [5]:
len(projects_data)

4

#### Convert the JSON into a pandas dataframe

In [6]:
projects = []
for project_data in projects_data:
    projects.extend(project_data)

In [7]:
len(projects)

1783

In [8]:
projects[0].keys()

dict_keys(['origin', 'description', 'title', 'url', 'country', 'topics', 'tags', 'regions', 'UN_regions', 'guid'])

In [9]:
pd_projects = json_normalize(projects)

In [10]:
pd_projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[Computers & Technology, Biology, Chemistry]",https://scistarter.com/project/4-Foldit-Foldit
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[Nature & Outdoors, Animals, Ecology & Environ...",https://scistarter.com/project/5-Firefly-Watch...
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[Astronomy & Space, Computers & Technology]",https://scistarter.com/project/6-Galaxy-Zoo-Ga...
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[Ocean, Water, Marine & Terrestrial, Education...",https://scistarter.com/project/7-Natures-Noteb...
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[Computers & Technology, Sound, Physics]",https://scistarter.com/project/8-Laser-Harp-Bu...


In [11]:
len(pd_projects)

1783

#### Remove null projects

In [12]:
null_projects = pd_projects[pd_projects['url'].isnull()]
print(null_projects)
len(null_projects)

     UN_regions country description               error guid origin regions  \
1122        NaN     NaN         NaN  Invalid project id  NaN    NaN     NaN   
1157        NaN     NaN         NaN  Invalid project id  NaN    NaN     NaN   

     tags title topics  url  
1122  NaN   NaN    NaN  NaN  
1157  NaN   NaN    NaN  NaN  


2

In [13]:
non_null_projects = pd_projects[pd_projects['url'].isnull() == False]
len(non_null_projects)

1781

#### Extract project_id from the URL

In [14]:
def extract_project_id(url):
    first_split = url.split('project/')
    project_id = first_split[1].split('-')[0]
    return int(project_id)

In [15]:
extract_project_id('https://scistarter.com/project/280-Seagrass-Watch-Seagrass-Watch')

280

In [16]:
non_null_projects['project_id'] = non_null_projects.apply(lambda x: extract_project_id(x['url']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
non_null_projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url,project_id
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[Computers & Technology, Biology, Chemistry]",https://scistarter.com/project/4-Foldit-Foldit,4
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[Nature & Outdoors, Animals, Ecology & Environ...",https://scistarter.com/project/5-Firefly-Watch...,5
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[Astronomy & Space, Computers & Technology]",https://scistarter.com/project/6-Galaxy-Zoo-Ga...,6
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[Ocean, Water, Marine & Terrestrial, Education...",https://scistarter.com/project/7-Natures-Noteb...,7
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[Computers & Technology, Sound, Physics]",https://scistarter.com/project/8-Laser-Harp-Bu...,8


#### Save file

In [18]:
non_null_projects.to_pickle('project_data') 