In [1]:
import json
import pprint
import collections
import requests
import pandas as pd
import numpy as np
from pynpm import NPMPackage

In [2]:
def get_top_blocks(blocks):
    """
    returns the entries of `blocks` whose 'topLevel' flag is truthy

    blocks: dict mapping block id -> block
    returns: a new dict with the same id -> block pairs, restricted to
             dict-valued blocks that have a truthy 'topLevel' entry
    """
    return {
        bid: block
        for bid, block in blocks.items()
        # Same dict guard the other block filters in this notebook use, so
        # non-dict entries are skipped instead of being iterated; .get()
        # avoids copying the keys into a list for every membership test.
        if isinstance(block, dict) and block.get('topLevel')
    }

In [3]:
def build_stack_sequence(all_blocks, tid):
    """
    collects the opcodes of a stack, starting at block `tid` and following
    each block's 'next' id until it is None

    all_blocks: dict mapping block id -> block dict ('opcode' and 'next' are read)
    tid: id of the block to start from
    returns: list of opcodes in traversal order (always includes `tid`'s opcode)
    """
    opcodes = [all_blocks[tid]['opcode']]
    following = all_blocks[tid]['next'] if tid is not None else None
    while following is not None:
        opcodes.append(all_blocks[following]['opcode'])
        following = all_blocks[following]['next']
    return opcodes

In [4]:
def build_rnn_sequences(stack, seq_len):
    """
    produces fixed-length sequences from an input stack sequence

    for every start offset i in 0..seq_len-1 the slice stack[i:seq_len] is
    taken and right-padded with None up to seq_len elements; slices that end
    up containing only None are dropped

    returns: list of lists, each of length seq_len
    """
    windows = []
    for start in range(seq_len):
        window = stack[start:seq_len]
        missing = seq_len - len(window)
        if missing > 0:
            # right-pad short slices so every window has seq_len entries
            window.extend([None] * missing)
        if any(el is not None for el in window):
            windows.append(window)
    return windows

In [5]:
def get_terminal_blocks(blocks):
    """
    returns the blocks that end a path: plain-dict entries with no 'next',
    that are not shadow blocks, whose opcode does not contain 'operator',
    and whose inputs hold neither 'SUBSTACK' nor 'SUBSTACK2'

    blocks: dict mapping block id -> block
    returns: dict of the matching id -> block pairs
    """
    terminals = {}
    for block_id in blocks:
        candidate = blocks[block_id]
        if type(candidate) is not dict:
            # entries that are not plain dicts are never terminals
            continue
        if candidate['next'] is not None:
            # something follows this block
            continue
        if candidate['shadow']:
            continue
        if 'operator' in candidate['opcode']:
            continue
        if 'SUBSTACK' in candidate['inputs']:
            # the block has nested children
            continue
        if 'SUBSTACK2' in candidate['inputs']:
            continue
        terminals[block_id] = candidate
    return terminals

In [6]:
def filter_blocks(blocks):
    """
    drops entries that are not plain dicts and blocks whose opcode contains
    'operator'; everything else is kept under its original id

    blocks: dict mapping block id -> block
    returns: new dict with the surviving id -> block pairs
    """
    kept = {}
    for block_id in blocks:
        block = blocks[block_id]
        if type(block) is not dict:
            continue
        if 'operator' in block['opcode']:
            continue
        kept[block_id] = block
    return kept

In [7]:
def get_paths(blks):
    """
    returns all of the paths from top->terminal block of a sprite

    blks: dict mapping block id -> block; it is first narrowed with
          filter_blocks, and terminals are found with get_terminal_blocks
    returns: list of paths. Each path alternates opcodes and direction
             symbols, e.g. [opcode, '-', opcode, '>', opcode], where '-'
             means the child is its parent's 'next' block and '>' means it
             is reached some other way (nested).

    A terminal whose parent is None or was filtered out yields no path, so
    every returned path has at least three elements.
    """
    # symbols for direction in the tree
    _nest = '>'
    _next = '-'

    blks = filter_blocks(blks)
    terminals = get_terminal_blocks(blks)

    paths = []
    for terminal_id in terminals:
        # Build the path bottom-up (terminal first) and reverse it once at
        # the end, rather than inserting at the front on every step.
        reversed_path = [blks[terminal_id]['opcode']]
        child_id = terminal_id
        parent_id = blks[child_id]['parent']

        # go up the tree until there is no parent, or the parent was filtered out
        while parent_id is not None and parent_id in blks:
            parent = blks[parent_id]

            # if the child's id is the same as its parent's 'next' then it
            # follows the parent in sequence; if it's not, then it's nested
            reversed_path.append(_next if child_id == parent['next'] else _nest)
            reversed_path.append(parent['opcode'])

            child_id = parent_id
            parent_id = parent['parent']

        # only terminals that had at least one usable parent produce a path
        if len(reversed_path) > 1:
            reversed_path.reverse()
            paths.append(reversed_path)

    return paths

In [8]:
# Pretty-printer instance with a 4-space indent, kept around for ad-hoc
# inspection (presumably of project JSON — not used in the cells shown here).
pp = pprint.PrettyPrinter(indent=4)

In [9]:
# Load the table of project ids; the 'id' column is read from it below.
project_id_df = pd.read_csv('data/project-ids/project_ids_train_1000.csv')

In [10]:
# Plain list of the 'id' column; make_sequences and old_sequences read this
# module-level name directly.
project_ids = list(project_id_df['id'])

In [11]:
def download_projects(project_ids):
    """
    downloads the JSON of every project in `project_ids` and writes it to
    data/project-json/<id>.json (the directory must already exist)

    project_ids: iterable of ids; each is formatted into the request URL and
                 into the output file name
    raises: requests.HTTPError on a 4xx/5xx response, requests.Timeout when
            the server does not answer in time
    """
    for pid in project_ids:
        proj_url = 'https://projects.scratch.mit.edu/{}'.format(pid)
        # a timeout keeps one stalled connection from hanging the whole run
        r = requests.get(url=proj_url, timeout=30)
        # fail loudly on HTTP errors instead of parsing an error body as JSON
        r.raise_for_status()
        proj_data = r.json()
        with open('data/project-json/{}.json'.format(pid), 'w') as outfile:
            json.dump(proj_data, outfile)

In [16]:
def make_sequences():
    """
    builds the block paths for every project listed in the module-level
    `project_ids`

    each project is read from its pre-downloaded file
    data/project-json/<id>.json; every target except the first one
    (presumably the stage — confirm) is passed through get_paths, and paths
    with a single element are discarded

    returns: flat list of paths across all projects
    """
    print('what')
    collected = []

    for pid in project_ids:
        # location of the pre-downloaded project json for this id
        json_path = 'data/project-json/{}.json'.format(pid)

        with open(json_path) as handle:
            project = json.load(handle)

        for sprite in project['targets'][1:]:
            sprite_paths = get_paths(sprite['blocks'])
            # filter out any lone soldiers if necessary
            collected.extend([p for p in sprite_paths if len(p) > 1])

    return collected

In [17]:
def old_sequences():
    """
    earlier sequence builder: for every top-level block of every sprite it
    builds the linear stack of opcodes and turns it into fixed-length
    sequences with build_rnn_sequences

    returns nothing; results are appended to `all_sequences`.
    NOTE(review): `all_sequences` and `sequence_length` are expected to exist
    at module level — neither is defined in the cells shown here; confirm
    before calling.
    """
    for pid in project_ids:
        # location of the pre-downloaded project json for this id
        json_path = 'data/project-json/{}.json'.format(pid)

        with open(json_path) as handle:
            project = json.load(handle)

        for sprite in project['targets'][1:]:
            sprite_blocks = sprite['blocks']
            for top_id in get_top_blocks(sprite_blocks):
                stack = build_stack_sequence(sprite_blocks, top_id)
                # Only build sequences out of stacks with at least two blocks
                if len(stack) > 1:
                    windows = build_rnn_sequences(stack, sequence_length)
                    all_sequences.extend(windows)

# CLEANING

In [164]:
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import TimeseriesGenerator, skipgrams

Using TensorFlow backend.


In [54]:
# Toy ragged input (rows of length 3 and 6) for trying out pad_sequences.
sequences_t = [[1,2,3],[5,5,5,5,5,56]]

In [59]:
# Demo: with padding='pre' the shorter row is filled at the front, and
# value=0.0 is stored as 0 under dtype='int32' (see the output below).
tf.keras.preprocessing.sequence.pad_sequences(sequences_t, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)

array([[ 0,  0,  0,  1,  2,  3],
       [ 5,  5,  5,  5,  5, 56]], dtype=int32)

In [61]:
def reject_outliers(data, m=2):
    """
    keeps the values of `data` that lie strictly within `m` standard
    deviations of the mean

    data: list, tuple, numpy array, or any object that supports boolean-mask
          indexing
    m: number of standard deviations to allow on either side of the mean
    returns: numpy array for list/tuple input; otherwise whatever
             `data[mask]` produces for the given container
    """
    values = np.asarray(data)
    keep = np.abs(values - np.mean(data)) < m * np.std(data)
    # lists and tuples cannot be indexed with a boolean mask, so index their
    # array form; other containers keep their own indexing (and return type)
    if isinstance(data, (list, tuple)):
        return values[keep]
    return data[keep]

In [101]:
def pad_seqs(seqs):
    """
    pads a list of sequences with Keras' pad_sequences

    padding and truncation are both applied at the front ('pre'), no maxlen
    is given, the result dtype is 'str' and the pad value is 0.0 (a later
    cell replaces the string '0.0' in the padded output)
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(
        seqs,
        maxlen=None,
        dtype='str',
        padding='pre',
        truncating='pre',
        value=0.0,
    )

In [79]:
# Build the paths for every project in project_ids (reads the pre-downloaded
# JSON files from disk).
all_seqs = make_sequences()

what


In [80]:
# Length statistics over all extracted sequences; `mean` and `std` drive the
# length filter further down.
lengths = list(map(len, all_seqs))
med = np.median(lengths)
std = np.std(lengths)
mean = np.mean(lengths)

In [81]:
# Show the sequence-length statistics as (mean, median, standard deviation).
mean, med, std

(17.37940212275634, 11.0, 17.506860211198592)

In [229]:
# Keep only sequences whose length is strictly within `std_dev` standard
# deviations of the mean length (`std_dev` is the multiplier, not a
# deviation itself).
std_dev = 1
filtered_seqs = [path for path in all_seqs if abs(len(path) - mean) < std_dev * std]
# Category view of the same sequences: every element is cut at its first
# underscore, so elements without one are kept whole.
category_seqs = [[token.split('_')[0] for token in path] for path in filtered_seqs]

In [237]:
def make_windows(seq):
    """
    creates prefix windows of a path, walking backwards from the full path
    and dropping two elements per step; prefixes shorter than three elements
    are not produced

    sample path:
      ['event_whenflagclicked', '-', 'data_setvariableto', '-', 'looks_hide']
    output (longest first):
      ['event_whenflagclicked', '-', 'data_setvariableto', '-', 'looks_hide']
      ['event_whenflagclicked', '-', 'data_setvariableto']
    """
    return [seq[:end] for end in range(len(seq), 2, -2)]

In [238]:
# Flatten the windows of every filtered sequence into one list.
final_full_seqs = [window for seq in filtered_seqs for window in make_windows(seq)]

In [239]:
# Same flattening for the category-level sequences.
final_category_seqs = [window for seq in category_seqs for window in make_windows(seq)]

In [240]:
# Padding
# NOTE: both names are rebound to their padded form, so re-running this cell
# passes the already padded arrays through pad_seqs again.
final_full_seqs = pad_seqs(final_full_seqs)
final_category_seqs = pad_seqs(final_category_seqs)

In [241]:
# Lengths of the sequences that survived the length filter.
f_lengths = list(map(len, filtered_seqs))

In [242]:
# Collect the distinct elements of every filtered sequence (an element shows
# up once per sequence it occurs in).
all_blocks = []
for seq in filtered_seqs:
    blocks = set(seq)
    all_blocks.extend(blocks)
# Sort the vocabulary: the iteration order of a set of strings changes between
# interpreter runs, which would give every element a different index after
# each kernel restart.
block_vocab = sorted(set(all_blocks))

In [243]:
# Two-way lookup between vocabulary entries and their position in block_vocab.
block2idx = {name: position for position, name in enumerate(block_vocab)}
idx2block = dict(enumerate(block_vocab))

In [192]:
def encode_seq(seq):
    """
    encodes all elements in sequence into integers via block2idx

    returns a numpy array; an element missing from block2idx raises KeyError
    """
    indices = []
    for name in seq:
        indices.append(block2idx[name])
    return np.array(indices)

def decode_seq(seq):
    """
    decodes indices back into block names via idx2block

    returns a numpy array; an index missing from idx2block raises KeyError
    """
    names = []
    for index in seq:
        names.append(idx2block[index])
    return np.array(names)

In [247]:
# Wrap the padded window arrays in DataFrames (one row per window).
final_full_df = pd.DataFrame(final_full_seqs)
final_category_df = pd.DataFrame(final_category_seqs)

In [250]:
# Blank out the padding: cells that are exactly the string '0.0' become
# empty strings.
final_full_df = final_full_df.replace('0.0', '')
final_category_df = final_category_df.replace('0.0', '')

In [252]:
# Preview the first rows of the full-opcode window table.
final_full_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,,,,,,,,,,,...,,,,,,event_whenflagclicked,-,data_setvariableto,-,looks_hide
1,,,,,,,,,,,...,,,,,,,,event_whenflagclicked,-,data_setvariableto
2,,,,,,,,,,,...,,,,,,event_whenkeypressed,-,motion_pointindirection,-,motion_movesteps
3,,,,,,,,,,,...,,,,,,,,event_whenkeypressed,-,motion_pointindirection
4,,,,,,,,,,,...,,,,,,event_whenkeypressed,-,motion_movesteps,-,motion_pointindirection


In [253]:
# Preview the first rows of the category-level window table.
final_category_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,,,,,,,,,,,...,,,,,,event,-,data,-,looks
1,,,,,,,,,,,...,,,,,,,,event,-,data
2,,,,,,,,,,,...,,,,,,event,-,motion,-,motion
3,,,,,,,,,,,...,,,,,,,,event,-,motion
4,,,,,,,,,,,...,,,,,,event,-,motion,-,motion


In [254]:
# Save the full-opcode windows to CSV.
final_full_df.to_csv('data/sequence-data/1000-windowed-full.csv')

In [255]:
# Save the category-level windows to CSV.
final_category_df.to_csv('data/sequence-data/1000-windowed-categories.csv')