In [52]:
import json
import pprint
import collections
import requests
import pandas as pd
import numpy as np
from pynpm import NPMPackage

In [54]:
def get_top_blocks(blocks):
    """Return the entries of ``blocks`` that are flagged as top-level.

    Args:
        blocks: mapping of block id -> block entry, as found under a
            target's 'blocks' key in the project JSON.

    Returns:
        A new dict holding only the ``id: block`` pairs whose block is a dict
        with a truthy 'topLevel' value. The block objects themselves are not
        copied.
    """
    return {
        bid: block
        for bid, block in blocks.items()
        # Only dict entries can carry a 'topLevel' key. Anything else (for
        # example a list-valued entry) is skipped instead of being indexed
        # with a string key, which would raise TypeError.
        if isinstance(block, dict) and block.get('topLevel')
    }

In [56]:
def build_stack_sequence(all_blocks, tid):
    """Collect the opcodes of a stack by following 'next' links from ``tid``.

    Args:
        all_blocks: mapping of block id -> block dict; every visited block
            must provide the 'opcode' and 'next' keys.
        tid: id of the block the walk starts from.

    Returns:
        List of opcodes in visiting order, starting with the opcode of ``tid``
        and ending with the block whose 'next' is None.
    """
    opcodes = []
    block_id = tid
    while True:
        block = all_blocks[block_id]
        opcodes.append(block['opcode'])
        # Advance along the chain; a None link marks the end of the stack.
        block_id = block['next']
        if block_id is None:
            break
    return opcodes

In [66]:
def build_rnn_sequences(stack, seq_len):
    """
    Produce fixed-length, 'none'-padded windows from a stack's opcode sequence.

    A window of ``seq_len`` items is taken at every start position of
    ``stack``, so every block of the stack appears in at least one window.
    Windows that run past the end of the stack are right-padded with the
    string 'none'. A window made up entirely of 'none' values is dropped.

    Args:
        stack: sequence of opcodes for one stack (not modified).
        seq_len: number of items in each produced sequence.

    Returns:
        List of lists, each of length ``seq_len``. Empty when ``stack`` is
        empty or ``seq_len`` is not positive.
    """
    if seq_len <= 0:
        return []

    seqs = []
    # Slide over every start index of the stack, not just the first seq_len
    # positions, so stacks longer than seq_len are covered in full.
    for start in range(len(stack)):
        seq = list(stack[start:start + seq_len])
        # Right-pad short tail windows up to the fixed length.
        seq.extend(['none'] * (seq_len - len(seq)))

        if not all(el == 'none' for el in seq):
            seqs.append(seq)
    return seqs

In [67]:
# Shared PrettyPrinter configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

In [68]:
# Load the table of project ids; the 'id' column is read from it below.
project_id_df = pd.read_csv('data/project-ids/project_ids_train_1000.csv')

In [75]:
# Plain Python list of the values in the 'id' column, in file order.
project_ids = list(project_id_df['id'])

In [101]:
def download_projects(project_ids, timeout=30):
    """Download each project's JSON and save it under data/project-json/.

    For every id in ``project_ids`` a GET request is sent to
    ``https://projects.scratch.mit.edu/<id>`` and the decoded JSON body is
    written to ``data/project-json/<id>.json``.

    Args:
        project_ids: iterable of project ids; each one is formatted into both
            the request URL and the output file name.
        timeout: seconds to wait on the server before the request is
            abandoned (passed straight to ``requests.get``).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within ``timeout``.
        ValueError: if the response body is not valid JSON.
    """
    for pid in project_ids:
        proj_url = 'https://projects.scratch.mit.edu/{}'.format(pid)
        r = requests.get(proj_url, timeout=timeout)
        # Fail loudly on an error status so an error payload is never saved
        # to disk as if it were project data.
        r.raise_for_status()
        proj_data = r.json()
        with open('data/project-json/{}.json'.format(pid), 'w') as outfile:
            json.dump(proj_data, outfile)

In [139]:
# Accumulator that the extraction loop extends with every padded sequence.
all_sequences = []
# Value passed to build_rnn_sequences as seq_len.
sequence_length = 5

In [140]:
# Build padded opcode sequences for every project and append them to
# all_sequences. Re-running this cell without resetting all_sequences
# appends the same sequences again.
for pid in project_ids:
    # Location of the project JSON that was saved to disk beforehand.
    proj_path = 'data/project-json/{}.json'.format(pid)

    # Load the pre-downloaded project json data; the file is closed as soon
    # as it has been parsed.
    with open(proj_path) as f:
        proj_data = json.load(f)

    # The first target is skipped (presumably the stage - confirm against the
    # project format); the remaining targets are treated as sprites.
    sprites = proj_data['targets'][1:]
    print("\nProject: {}".format(pid))

    for sprite in sprites:
        all_blocks = sprite['blocks']
        tops = get_top_blocks(all_blocks)
        topids = list(tops)
        for tid in topids:
            # Kept in its own name so the sprite loop variable is not shadowed.
            stack = build_stack_sequence(all_blocks, tid)
            # Only build sequences out of stacks with at least two blocks
            if len(stack) > 1:
                seqs = build_rnn_sequences(stack, sequence_length)
                all_sequences.extend(seqs)


Project: 286948888

Project: 277409223

Project: 295250542

Project: 295211089

Project: 295031935

Project: 291430811

Project: 291430783

Project: 291430936

Project: 291430789

Project: 283223826

Project: 289008324

Project: 294271705

Project: 277520824

Project: 295216675

Project: 278076135

Project: 295152592

Project: 283287764

Project: 295258737

Project: 295256583

Project: 295013140

Project: 278714477

Project: 279004113

Project: 279021483

Project: 292473996

Project: 279019621

Project: 278397380

Project: 285713789

Project: 278969462

Project: 295020108

Project: 295129645

Project: 294983176

Project: 294940296

Project: 278086053

Project: 295162208

Project: 277774667

Project: 278992007

Project: 295172005

Project: 295173317

Project: 291377379

Project: 295200298

Project: 295222842

Project: 279002826

Project: 291149869

Project: 278699366

Project: 279005963

Project: 277245985

Project: 283811435

Project: 288348920

Project: 294582442

Project: 279016340


In [144]:
# One row per collected sequence; columns are the positions within a sequence.
sequence_df = pd.DataFrame(all_sequences)

In [143]:
# Persist the sequences as CSV using to_csv's default options.
# NOTE(review): the target path has no file extension - confirm that is intended.
sequence_df.to_csv('data/sequence-data/1000-trial-1')