In [5]:
import os
import json
import pandas as pd
import numpy as np
import re

# Show up to 200 rows when a DataFrame is rendered as cell output.
pd.set_option("display.max_rows", 200)

# Directory the kernel was started in; dataset paths are built relative to it.
notebook_home = os.path.abspath('')

# Dataset to analyse. To switch to the Enron output instead, use
# notebook_home + "/../output_enron".
dataset_home = f"{notebook_home}/../output_github"

# --- CSV file names, all located under dataset_home ---

# Function counts per case and per-case statistics.
numFunctionsPerCase = "numFunctionsPerCase.csv"
perCaseStats = "perCaseStats.csv"

# Function usage frequency: overall, then one file per case.
functionFreq = "functionFreq.csv"
funcFreqCaseOne = "funcFreqCaseOne.csv"
funcFreqCaseTwo = "funcFreqCaseTwo.csv"
funcFreqCaseThree = "funcFreqCaseThree.csv"
funcFreqCaseFour = "funcFreqCaseFour.csv"

# Distributions.
numFunctionsFreq = "numFunctionsFreq.csv"
caseTwoRowsApart = "caseTwoRowsApart.csv"

### The number of functions for each case (e.g., single cell, multiple cells, single range, multiple ranges)

In [6]:
# The CSV has no header row, so column names are supplied explicitly.
df = pd.read_csv(
    os.path.join(dataset_home, numFunctionsPerCase),
    header=None,
    names=['Case', 'NumFunctions'],
)
df

Unnamed: 0,Case,NumFunctions
0,0,7756
1,1,80591
2,2,181596
3,3,25203
4,4,7058


### The number of times each function is used in a template

In [7]:
# Overall usage count per function, most frequently used first.
# The CSV has no header row, so column names are supplied explicitly.
df = pd.read_csv(
    os.path.join(dataset_home, functionFreq),
    header=None,
    names=['Function', 'Number'],
)
sorted_df = (
    df
    .sort_values(by='Number', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Function,Number
0,+,75636
1,-,56759
2,*,37759
3,/,34813
4,AVERAGE,13341
5,&,12529
6,SUM,5971
7,MAX,5680
8,MIN,5263
9,VLOOKUP,5226


### The number of times each function is used in a template for each case

In [8]:
# Case One (one cell): usage count per function, most frequent first.
df = pd.read_csv(
    os.path.join(dataset_home, funcFreqCaseOne),
    header=None,
    names=['Function', 'Number'],
)
sorted_df = (
    df
    .sort_values(by='Number', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Function,Number
0,+,13514
1,/,13102
2,*,10308
3,&,7645
4,-,6789
5,LEN,4141
6,VLOOKUP,3304
7,ISNUMBER,2670
8,SEARCH,2308
9,ABS,1113


In [9]:
# Case Two (multiple cells): usage count per function, most frequent first.
df = pd.read_csv(
    os.path.join(dataset_home, funcFreqCaseTwo),
    header=None,
    names=['Function', 'Number'],
)
sorted_df = (
    df
    .sort_values(by='Number', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Function,Number
0,+,61957
1,-,49919
2,*,27317
3,/,20662
4,SUM,5659
5,&,4822
6,SUBSTITUTE,1625
7,MOD,1561
8,CONCATENATE,1286
9,AVERAGE,1057


In [10]:
# Case Three (a single range): usage count per function, most frequent first.
df = pd.read_csv(
    os.path.join(dataset_home, funcFreqCaseThree),
    header=None,
    names=['Function', 'Number'],
)
sorted_df = (
    df
    .sort_values(by='Number', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Function,Number
0,AVERAGE,11481
1,MAX,5536
2,MIN,5171
3,COUNTIF,1369
4,ABS,303
5,TRANSPOSE,270
6,LARGE,249
7,COUNT,201
8,COUNTA,114
9,STDEV,94


In [11]:
# Case Four (multiple ranges): usage count per function, most frequent first.
df = pd.read_csv(
    os.path.join(dataset_home, funcFreqCaseFour),
    header=None,
    names=['Function', 'Number'],
)
sorted_df = (
    df
    .sort_values(by='Number', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Function,Number
0,SUMIF,1886
1,VLOOKUP,1766
2,RANK,490
3,COUNTIF,455
4,INDEX,433
5,MMULT,430
6,MATCH,392
7,HLOOKUP,337
8,SUM,288
9,SUMPRODUCT,161


### The distribution of the number of functions per formula template

In [12]:
# Number of instances for each function count, ordered by function count
# from smallest to largest.
df = pd.read_csv(
    os.path.join(dataset_home, numFunctionsFreq),
    header=None,
    names=['Num of Functions', 'Instances'],
)
sorted_df = (
    df
    .sort_values(by='Num of Functions', ascending=True)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Num of Functions,Instances
0,0,33493
1,1,140701
2,2,32037
3,3,5461
4,4,6517
5,5,1691
6,6,721
7,7,731
8,8,517
9,9,554


### Case Two Statistics

In [13]:
# Distribution of how many rows apart the referenced cells are, for cells
# that are referenced by a function and follow the RR pattern.
# Ordered by row distance from smallest to largest.
df = pd.read_csv(
    os.path.join(dataset_home, caseTwoRowsApart),
    header=None,
    names=['Num of Rows Apart', 'Instances'],
)
sorted_df = (
    df
    .sort_values(by='Num of Rows Apart', ascending=True)
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
sorted_df

Unnamed: 0,Num of Rows Apart,Instances
0,0,79788
1,1,12165
2,2,127
3,3,36
4,4,14
5,5,4
6,6,3
7,7,59
8,8,1
9,9,12


In [14]:
# Case Two: the number of functions that reference an intermediate cell,
# shown next to the total number of functions for that case.
df_case = pd.read_csv(
    os.path.join(dataset_home, numFunctionsPerCase),
    header=None,
    names=['Case', 'NumFunctions'],
)
df = pd.read_csv(
    os.path.join(dataset_home, perCaseStats),
    header=None,
    names=['Case', 'Num for Intermediate'],
)
# Attach the per-case totals to the per-case statistic, matching on 'Case'.
df = pd.merge(left=df, right=df_case, on='Case')
df.loc[df["Case"] == 2, ['NumFunctions', 'Num for Intermediate']]

Unnamed: 0,NumFunctions,Num for Intermediate
0,181596,69658


### Case Three Statistics

In [15]:
# Case Three: the number of functions that reference a single column,
# shown next to the total number of functions for that case.
# Requires df_case, which the Case Two statistics cell reads from
# numFunctionsPerCase.
df = pd.read_csv(
    os.path.join(dataset_home, perCaseStats),
    header=None,
    names=['Case', 'Num for Single Column'],
)
# Attach the per-case totals to the per-case statistic, matching on 'Case'.
df = pd.merge(left=df, right=df_case, on='Case')
df.loc[df["Case"] == 3, ['NumFunctions', 'Num for Single Column']]

Unnamed: 0,NumFunctions,Num for Single Column
1,25203,979
