In [1]:
import json
import re

import pandas as pd
import numpy as np

from methods_parse.methods_parse import UsedMethods, shift_methods

In [2]:
# comp_path = '../../data_mini/competitions_info_cleaned.csv'
# competitions = pd.read_csv(comp_path)
# competitions.shape

In [3]:
graph_path = '../../data/actual_graph_2021-04-18.csv'
graph = pd.read_csv(graph_path)
graph.rename({'id':'graph_vertex_id'}, axis=1, inplace=True)
# graph.head()

In [4]:
markup_path = '../../data/markup_data_2021-04-18.csv'
markup_data = pd.read_csv(markup_path)
markup_data.shape

(2610, 8)

In [5]:
markup_data = markup_data.merge(graph, on='graph_vertex_id', how='left')

---
For Row

In [6]:
def remove_comments(code_block:str) -> str:
    
    return cleaned_code_block

In [8]:
def label_code_by_regex(code_block, regex_graph_version:int=6):
    regex_graph_dir = '../../graph/graph_v{}.txt'.format(regex_graph_version)
    with open(regex_graph_dir, "r") as regex_graph_file:
        regex_graph = json.load(regex_graph_file)
    found_vertices = []
    for i, vertex in enumerate(regex_graph):
        tokens = regex_graph[vertex]
        for token in tokens:
            result = re.search(token.replace('(','\('), code_block)
            if result!=None:
                found_vertices.append(vertex)
                break
    return found_vertices

In [9]:
def get_comments(text: str):
    text = str(text)
#     comments = comment_parser.extract_comments_from_str(text, mime="text/x-python")
#     return "\n".join([line.text() for line in comments])

    comments = []
    line_array = [line.strip() for line in text.split("\n")]
    # now get libraries if line start with "#"
    for line_ind in range(len(line_array)):
        if line_array[line_ind].startswith("#"):
            comments.append(line_array[line_ind][1:])
        elif line_array[line_ind].startswith("'''"):
            multi_comm = str()
            multi_comm += line_array[line_ind][3:] + "\n"
            if "'''" in multi_comm:
                multi_comm = multi_comm.replace("'''", "")
                line_ind += 1
                comments.append(multi_comm)
                continue
            line_ind += 1
            while line_ind < len(line_array):
                multi_comm += line_array[line_ind] + "\n"
                if "'''" in multi_comm:
                    multi_comm = multi_comm.replace("'''", "")
                    line_ind += 1
                    break
                line_ind += 1
            comments.append(multi_comm)
        elif line_array[line_ind].startswith('"""'):
            multi_comm = str()
            multi_comm += line_array[line_ind][3:] + "\n"
            if '"""' in multi_comm:
                multi_comm = multi_comm.replace('"""', "")
                line_ind += 1
                comments.append(multi_comm)
                continue
            line_ind += 1
            while line_ind < len(line_array):
                multi_comm += line_array[line_ind] + "\n"
                if '"""' in multi_comm:
                    multi_comm = multi_comm.replace('"""', "")
                    line_ind += 1
                    break
                line_ind += 1
            comments.append(multi_comm)
            pass
    return "\n".join([line for line in comments])

In [10]:
def get_libraries(text: str, dicts: tuple):
    name_lib_import = dicts[0]
    name_lib_from = dicts[1]
    text = str(text)
    libs = []
    line_array = [line.strip() for line in text.split("\n")]
    # delete all double spaces in all lines
    for ind in range(len(line_array)):
        
        prev_len = -1
        while prev_len != len(line_array[ind]):
            prev_len = len(line_array[ind])
            line_array[ind] = line_array[ind].replace("  ", " ")
    # now get libraries if line start with "import" or "from"
    for line in line_array:
        if line.startswith("import") or line.startswith("from"):
            libs.append(line.split(" ")[1].split(".")[0])
    for name in name_lib_from.keys():
        for line in line_array:
            if name in line:
                libs.append(name_lib_from[name])
                
    for name in name_lib_import.keys():
        for line in line_array:
            if name + "." in line:
                libs.append(name_lib_import[name])
        
    return "\n".join(list(set(libs)))

In [11]:
def get_dict(str_arr):
    name_lib_import = {}
    name_lib_from = {}
    str_arr = [[line.strip() for line in text.split("\n")] for text in str_arr]
    # delete all double spaces in all lines
    for line_array in str_arr:
        for ind in range(len(line_array)):
            prev_len = -1
            while prev_len != len(line_array[ind]):
                prev_len = len(line_array[ind])
                line_array[ind] = line_array[ind].replace("  ", " ")
    # now get libraries if line start with "import" or "from"
    for line_array in str_arr:
        for line in line_array:
            if line.startswith("import"):
                split_arr = line.split(" ")
                # if we have "import ... as ...""
                if len(split_arr) > 2 and split_arr[2] == "as":
                    name_lib_import[split_arr[-1]] = split_arr[1].split(".")[0]
            
            if line.startswith("from"):
                split_arr = line.split(" ")
                # if we have "from ... import ...""
                if len(split_arr) > 2 and split_arr[2] == "import":
                    name_lib_from[split_arr[-1]] = split_arr[1].split(".")[0]
    return (name_lib_import, name_lib_from)

In [12]:
def process_row(row):
    '''
    from methods_parse.py: python_methods, python_methods_m1, python_methods_m2, python_methods_m3, python_methods_p1, python_methods_p2, python_methods_p3
    from models_scripts/regex.py: graph_vertex_regexs
    '''
    row['python_methods'] = UsedMethods(row['code_block'])
    row['graph_vertex_regex'] = label_code_by_regex(row['code_block'])
    row['comments'] = get_comments(row['code_block'])
    return row

In [13]:
markup_data = markup_data.apply(process_row, axis=1)

---
For Dataset

In [14]:
def get_libraries_df(df):
    all_temp_dfs = []
    notebooks_ids = df['kaggle_id'].tolist()
    for not_id in notebooks_ids:
        # print('notebook id {}'.format(not_id))
        # get rows for one kaggle_id
        temp_df = df[df['kaggle_id'] == not_id]
        buf_graph_vertex = list(temp_df.graph_vertex.values)
        dicts = get_dict(temp_df.code_block.values)
    #     print(dicts)
        buf_arr = []
        for code in temp_df.code_block.values:
    #         print(get_libraries(code, dicts))
            buf_arr.append(get_libraries(code, dicts))
        temp_df['libraries'] = buf_arr
        all_temp_dfs.append(temp_df)
    return pd.concat(all_temp_dfs)

In [15]:
markup_data = get_libraries_df(markup_data)

g to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['libraries'] = buf_arr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['libraries'] = buf_arr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['libraries'] = buf_arr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See 

In [17]:
markup_data = shift_methods(markup_data, shift_range=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['python_methods_m{}'.format(i)][i:] = df['python_methods'][:-i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['python_methods_p{}'.format(i)][:-i] = df['python_methods'][i:]


In [18]:
markup_data.head(25)

Unnamed: 0,code_block_id,code_block,data_format,graph_vertex_id,errors,marks,kaggle_id,competition_id,graph_vertex,graph_vertex_subclass,...,python_methods_m1,python_methods_p1,python_methods_m2,python_methods_p2,python_methods_m3,python_methods_p3,python_methods_m4,python_methods_p4,python_methods_m5,python_methods_p5
0,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368,Data_Extraction,load_from_csv,...,,"[plotting, walk, path, join]",,[read_csv],,[],,"[plotting, walk, path, join]",,[read_csv]
1,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368,Data_Extraction,load_from_csv,...,"[plotting, walk, path, join]",[read_csv],,[],,"[plotting, walk, path, join]",,[read_csv],,[]
2,570368,`# load training and testing data \nsubm = pd....,Table,45,No,5,8591010,4368,Data_Extraction,load_from_csv,...,"[plotting, walk, path, join]",[],"[plotting, walk, path, join]","[plotting, walk, path, join]",,[read_csv],,[],,[]
3,570369,`subm`,Table,41,No,5,8591010,4368,EDA,show_table,...,[read_csv],"[plotting, walk, path, join]","[plotting, walk, path, join]",[read_csv],"[plotting, walk, path, join]",[],,[],,[]
4,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368,Data_Extraction,load_from_csv,...,[],[read_csv],[read_csv],[],"[plotting, walk, path, join]",[],"[plotting, walk, path, join]",[],,[describe]
5,570368,`# load training and testing data \nsubm = pd....,Table,45,No,5,8591010,4368,Data_Extraction,load_from_csv,...,"[plotting, walk, path, join]",[],[],[],[read_csv],[],"[plotting, walk, path, join]",[describe],"[plotting, walk, path, join]",[describe]
6,570369,`subm`,Table,41,No,5,8591010,4368,EDA,show_table,...,[read_csv],[],"[plotting, walk, path, join]",[],[],[describe],[read_csv],[describe],"[plotting, walk, path, join]",[describe]
7,570370,`# see testing data\ntest_data`,Table,41,No,5,8591010,4368,EDA,show_table,...,[],[],[read_csv],[describe],"[plotting, walk, path, join]",[describe],[],[describe],[read_csv],[describe]
8,570371,`# ...and training data\ntrain_data`,Table,41,No,5,8591010,4368,EDA,show_table,...,[],[describe],[],[describe],[read_csv],[describe],"[plotting, walk, path, join]",[describe],[],[shape]
9,570372,`train_data.describe()`,Table,40,No,5,8591010,4368,EDA,show_table_attributes,...,[],[describe],[],[describe],[],[describe],[read_csv],[shape],"[plotting, walk, path, join]",[shape]
