# Part 1 - Acquire and Parse Data

Preprocess Data

In [1]:
%load_ext autoreload
%autoreload 2

import ast
import glob
import re
from pathlib import Path

import astor
import pandas as pd
import spacy
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

from general_utils import apply_parallel, flattenlist

EN = spacy.load('en_core_web_md')

In [2]:
# Record the Python interpreter version used to produce the outputs in this notebook.
!python -V

Python 3.6.1 :: Anaconda 4.4.0 (64-bit)


Download and Read Raw Python Files into a Pandas DataFrame

In [3]:
%%time

# Download the raw CSV shard(s) and stack them row-wise. axis=0 appends the rows
# of each shard (axis=1 would place shards side by side as extra columns once
# more than one shard is loaded); ignore_index gives one continuous RangeIndex.
# NOTE(review): assumes every shard has the same columns — confirm before raising range().
df = pd.concat(
    [pd.read_csv(f'https://storage.googleapis.com/kubeflow-examples/code_search/raw_data/00000000000{i}.csv')
     for i in range(1)],
    axis=0,
    ignore_index=True,
)

# 'repo_path' holds two whitespace-separated fields; split them into separate columns.
df['nwo'] = df['repo_path'].apply(lambda r: r.split()[0])
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])

# Keep only the columns used downstream (this also drops 'repo_path').
df = df[['nwo', 'path', 'content']]

# Keep rows with index labels 0..999 (the first 1,000 rows) to keep later cells fast.
df = df.truncate(after=999)

# Last expression of the cell, so the preview is actually displayed.
df.head()

Wall time: 1min 34s


Functions to Parse Data and Tokenize

In [4]:
def tokenize_docstring(text):
    """Tokenize docstring text with the tokenizer of the module-level ``EN`` spaCy pipeline.

    Args:
        text: The docstring text to tokenize.

    Returns:
        A list with the lower-cased text of every token, skipping tokens that
        spaCy flags as whitespace (``token.is_space``).
    """
    lowered_tokens = []
    for tok in EN.tokenizer(text):
        if tok.is_space:
            # Whitespace-only tokens carry no content; leave them out.
            continue
        lowered_tokens.append(tok.text.lower())
    return lowered_tokens
    
def tokenize_code(text):
    """Tokenize a code string with a very basic word-level procedure.

    Args:
        text: The code string to tokenize.

    Returns:
        The tokens produced by an NLTK ``RegexpTokenizer`` built from the
        word-character pattern used below.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text)

def get_function_docstring_pairs(blob):
    """Extract (function/method, docstring) records from a blob of Python source.

    Only functions defined at module level and methods defined directly in the
    body of module-level classes are collected; deeper nesting is not visited.

    Args:
        blob: Python source code to parse.

    Returns:
        A list of tuples ``(name, lineno, source, function_tokens, docstring_tokens)``:
          * ``name`` / ``lineno``: the function's name and line number from the AST.
          * ``source``: the function's source as regenerated by ``astor``.
          * ``function_tokens``: space-joined ``tokenize_code`` tokens of that
            source, with the raw docstring text removed when the function has a
            docstring and that text occurs verbatim in the regenerated source.
          * ``docstring_tokens``: space-joined ``tokenize_docstring`` tokens of
            the docstring's first paragraph (text before the first blank line);
            empty when there is no docstring.
        Parsing is best-effort: if the blob cannot be processed, the pairs
        gathered before the failure (possibly none) are returned.
    """
    pairs = []
    try:
        module = ast.parse(blob)
        classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
        functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
        for c in classes:
            functions.extend([node for node in c.body if isinstance(node, ast.FunctionDef)])
        for f in functions:
            source = astor.to_source(f)
            docstring = ast.get_docstring(f) or ''
            # Strip the uncleaned docstring text so it does not leak into the code tokens.
            function = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source
            pairs.append((f.name,
                          f.lineno,
                          source,
                          ' '.join(tokenize_code(function)),
                          ' '.join(tokenize_docstring(docstring.split('\n\n')[0]))
                         ))
    except (AssertionError, MemoryError, RecursionError, SyntaxError, ValueError):
        # Deliberate best-effort: skip blobs that cannot be parsed or regenerated.
        # ValueError covers source containing null bytes (and UnicodeEncodeError,
        # its subclass); RecursionError covers extremely deeply nested code.
        pass
    return pairs

def get_function_docstring_pairs_list(blob_list):
    """Run ``get_function_docstring_pairs`` on each code blob in ``blob_list``.

    Args:
        blob_list: An iterable of code blobs.

    Returns:
        A list with one entry per blob, in input order; each entry is the list
        returned by ``get_function_docstring_pairs`` for that blob.
    """
    per_blob_pairs = []
    for code_blob in blob_list:
        per_blob_pairs.append(get_function_docstring_pairs(code_blob))
    return per_blob_pairs

In [5]:
%%time
# Extract (function, docstring) pairs from every file's contents, then flatten
# the result returned by apply_parallel.
# NOTE(review): cpu_cores=4 presumably sets the number of workers used by
# general_utils.apply_parallel — confirm and adjust to the machine.
pairs = flattenlist(
    apply_parallel(
        get_function_docstring_pairs_list,
        df['content'].tolist(),
        cpu_cores=4,
    )
)

250
pool
was chunked
<class 'map'>
data was transformed
Wall time: 1min 23s


In [6]:
# Sanity check: `pairs` is expected to hold exactly one entry per row of `df`.
# NOTE(review): the recorded output of this cell shows the assertion failing
# (1,000 rows vs 4,150,352 entries); the cause is not visible in this notebook —
# check how apply_parallel/flattenlist shape their results.
assert len(pairs) == df.shape[0], (
    f'Row count mismatch. df has {df.shape[0]:,} rows; pairs has {len(pairs):,} rows.'
)

AssertionError: Row count mismatch. df has 1,000 rows; pairs has 4,150,352 rows.

In [None]:
# Spot-check a single entry of `pairs`.
# NOTE(review): index 1001 only exists if `pairs` has more than 1,001 entries,
# which contradicts the one-entry-per-row assertion in the previous cell
# (df has 1,000 rows) — confirm which is intended.
pairs[1001]