In [147]:
import json
import pprint
import collections
import requests
import pandas as pd
import numpy as np
from pynpm import NPMPackage

In [148]:
def get_top_blocks(blocks):
    """Return the entries of ``blocks`` that are flagged as top-level.

    An entry is kept when iterating it yields the key ``'topLevel'`` and the
    value stored under that key is truthy. The result maps block id -> block
    and preserves the iteration order of ``blocks``.
    """
    top_level = {}
    for block_id, candidate in blocks.items():
        if 'topLevel' not in list(candidate):
            continue
        if candidate['topLevel']:
            top_level[block_id] = candidate
    return top_level

In [149]:
def build_stack_sequence(all_blocks, tid):
    """Collect the opcodes of a stack by following ``'next'`` links.

    Starts at the block whose id is ``tid`` and walks each block's ``'next'``
    id until it is ``None``. Returns the opcodes in visiting order, the
    starting block first.
    """
    opcodes = [all_blocks[tid]['opcode']]
    following = all_blocks[tid]['next'] if tid is not None else None
    while following is not None:
        opcodes.append(all_blocks[following]['opcode'])
        following = all_blocks[following]['next']
    return opcodes

In [150]:
def build_rnn_sequences(stack, seq_len):
    """
    Produce fixed-length, None-padded windows from an input stack sequence.

    For every start offset ``i`` in ``range(seq_len)`` the slice
    ``stack[i:seq_len]`` is taken and right-padded with ``None`` until it
    holds ``seq_len`` items. Windows consisting only of ``None`` are dropped.
    Only the first ``seq_len`` items of ``stack`` are ever read.
    """
    windows = []
    for start in range(seq_len):
        window = stack[start:seq_len]
        missing = seq_len - len(window)
        if missing > 0:
            window.extend([None] * missing)
        # keep the window only if it carries at least one real item
        if any(item is not None for item in window):
            windows.append(window)
    return windows

In [151]:
def get_terminal_blocks(blocks):
    """Return the blocks that end a stack, keyed by block id.

    An entry qualifies when it is a plain ``dict``, has no ``'next'`` block,
    is not a shadow block, has no ``'operator'`` in its opcode and carries
    neither a ``'SUBSTACK'`` nor a ``'SUBSTACK2'`` input.
    """
    terminals = {}
    for block_id in blocks:
        block = blocks[block_id]
        if type(block) is not dict:
            continue  # entries that are not plain dicts are ignored
        if block['next'] is not None:
            continue  # something follows it
        if block['shadow']:
            continue  # shadow block
        if 'operator' in block['opcode']:
            continue  # operator block
        inputs = block['inputs']
        if 'SUBSTACK' in inputs or 'SUBSTACK2' in inputs:
            continue  # has nested children
        terminals[block_id] = block
    return terminals

In [152]:
def filter_blocks(blocks):
    """Return the entries of ``blocks`` that are plain dicts and not operators.

    An entry is dropped when it is not exactly a ``dict`` or when its opcode
    contains the substring ``'operator'``.
    """
    kept = {}
    for block_id in blocks:
        block = blocks[block_id]
        if type(block) is not dict:
            continue
        if 'operator' in block['opcode']:
            continue
        kept[block_id] = block
    return kept

In [153]:
def get_paths(blks):
    """Return all of the paths from top -> terminal block of a sprite.

    ``blks`` maps block id -> block. It is first reduced with
    ``filter_blocks``; every block returned by ``get_terminal_blocks`` then
    becomes the end of one path, which is built by following ``'parent'``
    links upwards until a block has no parent or its parent is not among the
    filtered blocks.

    Each path is a flat list that alternates opcodes and connector symbols,
    e.g. ``['event_whenflagclicked', '-', 'control_forever', '>', 'looks_hide']``:
    ``'-'`` means the child is its parent's ``'next'`` block, ``'>'`` means
    it hangs off the parent in any other way (nested).

    Terminals without a usable parent yield no path. A ``'parent'`` chain
    that loops back onto an already visited block stops the walk instead of
    running forever.
    """
    # symbols for direction in the tree
    _nest = '>'
    _next = '-'

    blks = filter_blocks(blks)
    paths = []

    for terminal_id in get_terminal_blocks(blks):
        # The path is collected bottom-up (terminal first) and reversed once
        # at the end, which avoids repeated insert(0, ...) calls.
        reversed_path = [blks[terminal_id]['opcode']]
        child_id = terminal_id
        visited = {terminal_id}

        # go up the tree
        while True:
            parent_id = blks[child_id]['parent']
            if parent_id is None or parent_id not in blks or parent_id in visited:
                break

            parent = blks[parent_id]
            # if the child is its parent's 'next' it is sequential,
            # otherwise it is nested
            reversed_path.append(_next if child_id == parent['next'] else _nest)
            reversed_path.append(parent['opcode'])

            visited.add(parent_id)
            child_id = parent_id

        # a lone terminal (no step upwards was possible) is not a path
        if len(reversed_path) > 1:
            paths.append(reversed_path[::-1])

    return paths

In [154]:
# Pretty-printer configured with a 4-space indent; no cell visible here uses it.
pp = pprint.PrettyPrinter(indent=4)

In [155]:
# Load the table of project ids from CSV (relative path, resolved against the working directory).
project_id_df = pd.read_csv('data/project-ids/project_ids_train_1000.csv')

In [156]:
# Values of the 'id' column as a list; read as a global by make_sequences and old_sequences.
project_ids = list(project_id_df['id'])

In [157]:
def download_projects(project_ids, timeout=30):
    """Download the JSON of each project and store it under data/project-json/.

    Parameters
    ----------
    project_ids : iterable
        Project ids; each is formatted into the request URL and into the
        output file name ``data/project-json/<id>.json``.
    timeout : float, optional
        Seconds to wait for the server before giving up on a request, so a
        stalled connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; this prevents an error
        response from being saved as if it were project data.
    """
    for pid in project_ids:
        proj_url = 'https://projects.scratch.mit.edu/{}'.format(pid)
        response = requests.get(proj_url, timeout=timeout)
        response.raise_for_status()
        proj_data = response.json()
        with open('data/project-json/{}.json'.format(pid), 'w') as outfile:
            json.dump(proj_data, outfile)

In [158]:
def make_sequences(sequence_length):
    """Collect the top -> terminal block paths of all sprites in all projects.

    Iterates over the module-level ``project_ids``, loads the pre-downloaded
    ``data/project-json/<id>.json`` file of each project and gathers the
    paths produced by ``get_paths`` for every target except the first one.

    Parameters
    ----------
    sequence_length :
        Currently unused; kept so existing calls such as
        ``make_sequences(5)`` continue to work.

    Returns
    -------
    list
        All paths that contain more than one element.
    """
    all_sequences = []

    for pid in project_ids:
        proj_path = 'data/project-json/{}.json'.format(pid)

        # load the pre-downloaded project json data
        with open(proj_path) as f:
            proj_data = json.load(f)

        # the first target is skipped -- presumably the stage; TODO confirm
        sprites = proj_data['targets'][1:]

        for s in sprites:
            paths = get_paths(s['blocks'])

            # filter out any lone soldiers if necessary
            all_sequences.extend(p for p in paths if len(p) > 1)
    return all_sequences

In [159]:
def old_sequences(sequence_length=5):
    """Build window sequences from the linear stacks of every sprite.

    Earlier variant of the sequence extraction: for each top-level block the
    stack reachable through ``'next'`` links is turned into None-padded
    windows via ``build_rnn_sequences``.

    Parameters
    ----------
    sequence_length : int, optional
        Window length handed to ``build_rnn_sequences``. Defaults to 5, the
        value this notebook passes to ``make_sequences``.

    Returns
    -------
    list
        The windows of all stacks with at least two blocks, over all sprites
        of all projects in the module-level ``project_ids``.
    """
    all_sequences = []

    for pid in project_ids:
        proj_path = 'data/project-json/{}.json'.format(pid)

        # load the pre-downloaded project json data
        with open(proj_path) as f:
            proj_data = json.load(f)

        # the first target is skipped -- presumably the stage; TODO confirm
        sprites = proj_data['targets'][1:]

        for sprite in sprites:
            all_blocks = sprite['blocks']
            for tid in get_top_blocks(all_blocks):
                stack = build_stack_sequence(all_blocks, tid)
                # Only build sequences out of stacks with at least two blocks
                if len(stack) > 1:
                    all_sequences.extend(build_rnn_sequences(stack, sequence_length))
    return all_sequences

In [160]:
import numpy as np

In [161]:
def reject_outliers(data, m=2):
    """Keep the values that lie strictly within ``m`` standard deviations of the mean.

    Parameters
    ----------
    data : array-like
        Numeric values. Lists and tuples are accepted and are returned as a
        NumPy array; any other input is indexed directly with the boolean
        mask, so an ndarray comes back as an ndarray.
    m : float, optional
        Width of the accepted band, in standard deviations (``np.std``).

    Returns
    -------
    The elements ``x`` of ``data`` with ``|x - mean| < m * std``.
    """
    values = np.asarray(data)
    keep = np.abs(values - np.mean(data)) < m * np.std(data)
    # Plain Python sequences cannot be indexed with a boolean mask.
    if isinstance(data, (list, tuple)):
        return values[keep]
    return data[keep]

In [162]:
# Build the block paths for every project; make_sequences does not use its argument.
all_seqs = make_sequences(5)

what


In [163]:
# Length of every path (opcodes and connector symbols both count),
# plus the mean and standard deviation of those lengths.
lengths = list(map(len, all_seqs))
mean, std = np.mean(lengths), np.std(lengths)

In [164]:
# Display the mean path length.
mean

17.37940212275634

In [165]:
# Display the standard deviation of the path lengths.
std

17.506860211198592

In [170]:
# Keep only the paths whose length lies strictly within `std_dev` standard
# deviations of the mean path length.
std_dev = 1
filtered_seqs = [seq for seq in all_seqs if abs(len(seq) - mean) < std_dev * std]

In [173]:
# Reduce every path element to the text before its first underscore
# (e.g. 'motion_movesteps' -> 'motion'); the connectors '-' and '>' are unchanged.
category_seqs = [[token.split('_')[0] for token in seq] for seq in filtered_seqs]

In [174]:
# Display the category-level sequences.
# NOTE(review): this echoes the entire list; a slice such as category_seqs[:5] would keep the output short.
category_seqs

[['event', '-', 'data', '-', 'looks'],
 ['event', '-', 'motion', '-', 'motion'],
 ['event', '-', 'motion', '-', 'motion'],
 ['event', '-', 'looks', '-', 'motion', '-', 'looks'],
 ['event', '-', 'control', '>', 'control', '>', 'sensing'],
 ['event',
  '-',
  'control',
  '>',
  'control',
  '>',
  'looks',
  '-',
  'data',
  '-',
  'looks'],
 ['event', '-', 'control', '>', 'control', '>', 'sensing'],
 ['event',
  '-',
  'control',
  '>',
  'control',
  '>',
  'looks',
  '-',
  'data',
  '-',
  'looks'],
 ['event', '-', 'control', '>', 'control', '>', 'sensing'],
 ['event',
  '-',
  'control',
  '>',
  'control',
  '>',
  'looks',
  '-',
  'data',
  '-',
  'looks'],
 ['event', '-', 'control', '>', 'control', '>', 'sensing'],
 ['event',
  '-',
  'control',
  '>',
  'control',
  '>',
  'looks',
  '-',
  'data',
  '-',
  'looks'],
 ['event', '-', 'control', '>', 'control', '>', 'sensing'],
 ['event',
  '-',
  'control',
  '>',
  'control',
  '>',
  'looks',
  '-',
  'data',
  '-',
  'looks'

In [167]:
## make the relevant sequences

In [175]:
# Tabulate the filtered paths, one path per row; shorter paths leave their trailing cells empty.
filtered_sequence_df = pd.DataFrame(filtered_seqs)

In [176]:
# Same layout for the category-level paths, one path per row.
category_sequence_df = pd.DataFrame(category_seqs)

In [179]:
# Preview the first rows of the full-opcode table.
filtered_sequence_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,event_whenflagclicked,-,data_setvariableto,-,looks_hide,,,,,,...,,,,,,,,,,
1,event_whenkeypressed,-,motion_pointindirection,-,motion_movesteps,,,,,,...,,,,,,,,,,
2,event_whenkeypressed,-,motion_movesteps,-,motion_pointindirection,,,,,,...,,,,,,,,,,
3,event_whenkeypressed,-,looks_switchcostumeto,-,motion_gotoxy,-,looks_show,,,,...,,,,,,,,,,
4,event_whenflagclicked,-,control_forever,>,control_if,>,sensing_touchingobject,,,,...,,,,,,,,,,


In [180]:
# Preview the first rows of the category table.
category_sequence_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,event,-,data,-,looks,,,,,,...,,,,,,,,,,
1,event,-,motion,-,motion,,,,,,...,,,,,,,,,,
2,event,-,motion,-,motion,,,,,,...,,,,,,,,,,
3,event,-,looks,-,motion,-,looks,,,,...,,,,,,,,,,
4,event,-,control,>,control,>,sensing,,,,...,,,,,,,,,,


In [183]:
# Save the full-opcode paths to CSV (the row index is written as the first column, the pandas default).
filtered_sequence_df.to_csv('data/sequence-data/1000-trial-terminals-full.csv')

In [184]:
# Save the category-level paths to CSV (the row index is written as the first column, the pandas default).
category_sequence_df.to_csv('data/sequence-data/1000-trial-terminals-categories.csv')

In [143]:
# Rows of the sequence table as a list of arrays.
# NOTE(review): `sequence_df` is not defined in any cell visible here -- presumably an earlier name for
# filtered_sequence_df or category_sequence_df; confirm before re-running on a fresh kernel.
padded_seqs = list(sequence_df.values)

In [144]:
# now you have to go from the raw sequences to the actual data for an RNN.

In [137]:
# Expand every padded row into its None-padded windows and pool them all.
windowed_seqs = []
for p in padded_seqs:
    windows = build_rnn_sequences(list(p), sequence_df.shape[1])
    # drop windows made up solely of padding; all() needs an iterable
    # argument, calling it bare raises a TypeError
    windows = [w for w in windows if not all(el is None for el in w)]
    windowed_seqs.extend(windows)