In [None]:
import json
import os
from os import listdir
from os.path import isfile, join
import pickle
import gzip
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import tabulate
import numpy as np
import math

import re

from itertools import cycle, islice

# Fitting Linear Regression to the dataset 
from sklearn.linear_model import LinearRegression 
# Fitting Polynomial Regression to the dataset 
from sklearn.preprocessing import PolynomialFeatures 

from scipy.stats import pearsonr

def atoi(text):
    """Convert a chunk of text to ``int`` when it is a plain decimal number.

    Parameters
    ----------
    text : str
        One piece of a string, e.g. a chunk produced by splitting on digit runs.

    Returns
    -------
    int or str
        ``int(text)`` if every character is a decimal digit, otherwise
        ``text`` unchanged (this includes the empty string).
    """
    # str.isdecimal() accepts exactly the characters int() can parse.
    # str.isdigit() is also true for characters such as superscript two
    # (U+00B2), for which int() raises ValueError.
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    """Build a sort key that orders embedded numbers numerically ("human" order).

    Intended for ``alist.sort(key=natural_keys)``.
    Technique from http://nedbatchelder.com/blog/200712/human_sorting.html
    (see Toothy's implementation in the comments there).
    """
    # The capturing group keeps the digit runs in the split result, so the
    # key alternates between text chunks and the values atoi() makes of the
    # digit runs.
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))

In [None]:
# Global matplotlib styling applied to every figure drawn after this cell.
fig_width = 8
fig_height = 5

params = {
    'axes.labelsize': 12,   # font size of the x and y axis labels
    'axes.titlesize': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # LaTeX text rendering is off. The dict used to list this key twice
    # (True, then False); in a dict literal the last entry wins, so False was
    # already the effective value.
    'text.usetex': False,
    'figure.figsize': [fig_width, fig_height],
    'font.family': 'serif',
    'grid.linestyle': '--',
    'lines.linewidth': 2,
}

matplotlib.rcParams.update(params)

In [None]:
# Input directory with the aggregated measurements and output directory for
# the statistics and figures written by the cells below.
data_dir = 'aggregated_data'
out_dir = 'aggregated_data/stats'

# Active experiment.
#   appname        prefix of the per-model CSV file names inside data_dir
#   data_size_dir  sub-directory of data_dir holding the per-iteration files
#   collections    summary models whose CSV files are evaluated
#   suffix         optional file-name suffix inserted before the CSV kind
#   hide_legend    not read by any cell visible here -- TODO confirm it is still needed
appname = 'LUBM'
data_size_dir = 'LUBM'
collections = ['schemex', 'attribute', 'type']
suffix = ''
hide_legend = False

# Alternative experiment configurations. They are kept as comments instead of
# a bare triple-quoted string: as the last expression of the cell that string
# was echoed as the cell's output.
#
# appname = 'BSBM'
# data_size_dir = 'BSBM'
# collections = ['schemex', 'attribute', 'type']
# suffix = ''
# hide_legend = True
#
# appname = 'dyldo_y2019_core'
# data_size_dir = 'DyLDO-core'
# collections = ['schemex', 'attribute', 'type']
# suffix = '-test-1_clean'
# hide_legend = True
#
# appname = 'dyldo_y2019_full'
# data_size_dir = 'DyLDO-ext'
# collections = ['schemex', 'attribute', 'type']
# suffix = ''
# hide_legend = True






In [None]:
# Collect the degree statistics of every "*.csv" file of the selected dataset
# into one frame indexed by iteration, then draw a box plot of the average
# degree, in-degree and out-degree.
curr_dir = join(data_dir, data_size_dir)
onlyfiles = [f for f in listdir(curr_dir) if isfile(join(curr_dir, f)) and f.endswith(".csv")]
onlyfiles.sort(key=natural_keys)

# Frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copies the whole frame on every call.
degree_frames = []
i = 0
for filename in onlyfiles:
    with open(os.path.join(curr_dir, filename), 'r') as f:
        df = pd.read_csv(f, sep=',')

    if 'iteration' in filename:
        # Strip everything except the iteration number from the file name.
        iteration = filename
        for token in ('iteration', '-', 'degree.csv', '.gz', '.nq', '.nt'):
            iteration = iteration.replace(token, '')
        i = int(iteration)
    else:
        # No iteration in the name: number the files in (natural) sort order.
        i = i + 1

    df['Iteration'] = i
    degree_frames.append(df.set_index('Iteration'))

degree_frame = pd.concat(degree_frames) if degree_frames else pd.DataFrame()

if appname == 'dyldo_y2019_core':
    # Drop the rows at positions 21 and 22 (selected by position, removed by label).
    # NOTE(review): the reason for excluding these two rows is not visible here.
    degree_frame = degree_frame.drop(degree_frame.index[[21,22]])

if appname == 'dyldo_y2019_full':
    # Only the first five rows are evaluated for this dataset.
    degree_frame = degree_frame.head()
    iterations = 5
    ticks = 1.0
    print('drop the beat')

plot_frame = degree_frame[['avg_degree', 'avg_indegree', 'avg_outdegree']].rename(
    columns={'avg_degree': 'degree', 'avg_indegree': 'in-degree', 'avg_outdegree': 'out-degree'})

boxplot = plot_frame.boxplot()

# Make sure the output directory exists before writing the figure.
os.makedirs(out_dir, exist_ok=True)
plt.savefig(out_dir + '/' + data_size_dir + '-degree.pdf', bbox_inches = 'tight')

In [None]:
# Show mean and standard deviation of the average degree, in-degree and
# out-degree columns, in that order.
for degree_column in ('avg_degree', 'avg_indegree', 'avg_outdegree'):
    column_values = degree_frame[degree_column]
    display(column_values.mean())
    display(column_values.std())

In [None]:
# The main correlation analysis
#
# For every dataset, build one row per iteration from the "*.txt" size files
# and the matching "-degree.csv" files, then add per-model columns read from
# the update-time-and-space, performance and changes CSV files.  The combined
# frame is written to correlation-stats.csv and summarised per model.

# Values that appeared in this cell, for reference:
#   appnames: 'BSBM', 'dyldo_y2019_core', 'dyldo_y2019_full'
#   datasets: 'LUBM', 'BSBM', 'DyLDO-core', 'DyLDO-ext'
#   models:   'schemex', 'attribute', 'type'
# appnames[k] is the CSV file prefix that belongs to datasets[k], so both
# lists must be edited together.
appnames = ['dyldo_y2019_core']
datasets = ['DyLDO-core']
models = ['attribute']

if len(appnames) != len(datasets):
    raise ValueError(
        "appnames and datasets must pair up one-to-one, got {} and {}".format(appnames, datasets))

size_frame = pd.DataFrame(columns=['dataset', 'edges', 'types', 'bytes'])

# One frame per dataset, concatenated once after the loop (DataFrame.append
# was removed in pandas 2.0).
dataset_frames = []

for index, dataset in enumerate(datasets):
    curr_dir = join(data_dir, dataset)
    onlyfiles = [f for f in listdir(curr_dir) if isfile(join(curr_dir, f)) and f.endswith(".txt")]
    onlyfiles.sort(key=natural_keys)

    temp_frame = pd.DataFrame(columns=['dataset', 'edges', 'types', 'bytes', 'degree'])
    i = 0
    for filename in onlyfiles:
        # Lines 1-3 of the size file are read as types, edges and bytes.
        with open(join(curr_dir, filename), 'r') as f:
            content = f.read().split('\n')
        with open(join(curr_dir, filename.replace(".txt", "-degree.csv")), 'r') as f_degree:
            df = pd.read_csv(f_degree, sep=',')

        types = int(content[0])
        edges = int(content[1])
        bytez = int(content[2])
        degree = df['avg_degree'].values[0]

        if 'iteration' in filename:
            # Strip everything except the iteration number from the file name.
            iteration = filename
            for token in ('iteration', '-', '.txt', '.gz', '.nq', '.nt'):
                iteration = iteration.replace(token, '')
            i = int(iteration)
        else:
            # No iteration in the name: number the files in (natural) sort order.
            i = i + 1
        temp_frame.loc[i] = [dataset, edges, types, bytez, degree]

    # NOTE(review): the per-model columns below are aligned with temp_frame by
    # index label, i.e. the 'Iteration' values of the CSV files must match the
    # row labels assigned above -- confirm for file names without 'iteration'.
    for model in models:
        csv_prefix = os.path.join(data_dir, appnames[index] + '_' + model)

        with open(csv_prefix + '-update-time-and-space.csv', 'r') as f_space:
            sf = pd.read_csv(f_space, sep=',').set_index('Iteration')
        temp_frame[model+'-summarization-ratio'] = sf['Imprint links'] / sf['Schema Elements (SE)']
        temp_frame[model+'VHI'] = sf['Sec. Index Size (bytes)']
        temp_frame[model+'-compression'] = temp_frame[model+'VHI'] / temp_frame['bytes']
        if 'instances' not in temp_frame:
            temp_frame['instances'] = sf['Imprint links']

        with open(csv_prefix + '-performance.csv', 'r') as f_time:
            tf = pd.read_csv(f_time, sep=',').set_index('Iteration')
        temp_frame[model+'schema-computation'] = tf['Schema Computation']
        temp_frame[model+'update'] = tf['Updates']
        temp_frame[model+'incremental'] = tf['Total']
        temp_frame[model+'batch'] = tf['Batch']
        temp_frame[model+'speed-up'] = tf['Batch'] / tf['Total']

        # Index the changes by iteration as well, so that adding them to the
        # iteration-indexed `sf` columns lines up the same iterations.
        with open(csv_prefix + '-changes.csv', 'r') as f_change:
            cf = pd.read_csv(f_change, sep=',').set_index('Iteration')
        imprint_delta = (sf['Imprint links'] - sf['Imprint links'].shift(1,fill_value=0)).abs()
        temp_frame[model+':changes'] = cf['ChangedSchemaStructures (SE_mod)'] + imprint_delta
        temp_frame[model+':updates'] = cf['NewlyObservedSchema (SE_new)'] + cf['DeletedSchemaStructures (SE_del)']
        # "+ 1" keeps the ratio finite when an iteration has no updates.
        temp_frame[model+':change-update-ratio'] = temp_frame[model+':changes']/(temp_frame[model+':updates'] + 1)
        temp_frame[model+':change-size-ratio'] = temp_frame[model+':changes']/(temp_frame['instances'])
        temp_frame[model+':update-size-ratio'] = temp_frame[model+':updates']/(sf['Schema Elements (SE)'])

    if dataset == 'DyLDO-core':
        # Drop the rows at positions 20 and 21.
        # NOTE(review): the reason for excluding these two rows is not visible here.
        temp_frame = temp_frame.drop(temp_frame.index[[20,21]])
    if dataset == 'DyLDO-ext':
        temp_frame = temp_frame.head(5)

    dataset_frames.append(temp_frame)

if dataset_frames:
    size_frame = pd.concat(dataset_frames)

# Make sure the output directory exists before writing the statistics.
os.makedirs(out_dir, exist_ok=True)
size_frame.to_csv(out_dir + '/' + 'correlation-stats.csv', sep=',', encoding='utf-8')

display(size_frame.head())

# (report title, column-name suffix) of every metric summarised per model.
reported_metrics = [
    ("Summarization Ratio", '-summarization-ratio'),
    ("Change-Update Ratio", ':change-update-ratio'),
    ("Incremental Time", 'incremental'),
    ("Batch Time", 'batch'),
    ("Speed-up Time", 'speed-up'),
]
for model in models:
    for title, column_suffix in reported_metrics:
        metric = size_frame[model + column_suffix]
        print("{} ({}): ".format(title, model))
        print("\tMean:\t {}".format(metric.mean()))
        print("\tStd:\t {}".format(metric.std()))
        print("\tMin:\t {}".format(metric.min()))
        print("\tMAX:\t {}".format(metric.max()))
    print("-----------")

In [None]:
# Fit and plot a linear regression on columns of size_frame.
# Alternative predictor that was tried here: 'attribute:change-size-ratio'.
x_values = ['attribute:update-size-ratio']
y_value = ['attributespeed-up']

X = size_frame[x_values].values
y = size_frame[y_value].values

lin = LinearRegression()
lin.fit(X, y)
display(lin.coef_)

# Visualising the Linear Regression results
plt.scatter(X[:, 0], y, color = 'blue')
plt.plot(X, lin.predict(X), color = 'red')
plt.title('Linear Regression')
# Pass the column names as text; a list would be rendered as "['...']".
plt.xlabel(', '.join(x_values))
plt.ylabel(', '.join(y_value))

plt.show()
# Last expression: the coefficient of determination (R^2) is the cell output.
lin.score(X, y)

In [None]:
# Pearson correlation between the edge count and the schema-computation
# measurements of the attribute model.
print("Do edges correlate with schema-computation?")
edge_counts, computation_values = (
    size_frame[column] for column in ('edges', 'attributeschema-computation')
)
corrtest = pearsonr(edge_counts, computation_values)
display(corrtest)

In [None]:
# Build one long frame that stacks the measurements of all evaluated summary
# models: the 'instances' and 'edges' columns are repeated per model, the
# model-specific '<model>VHI' and '<model>schema-computation' columns are
# stacked underneath each other.
#
# The frame is built from `models` (set in the correlation cell) instead of
# hard-coded 'schemex'/'attribute'/'type' columns, which raised a KeyError
# whenever size_frame was built for a subset of the models.  Series.append
# (removed in pandas 2.0) is replaced by a single pd.concat.
summary_columns = ['vertices', 'edges', 'vertexHashIndex', 'schema-computation']

model_frames = []
for model in models:
    model_frame = size_frame[['instances', 'edges', model + 'VHI', model + 'schema-computation']].copy()
    model_frame.columns = summary_columns
    model_frames.append(model_frame)

full_frame = pd.concat(model_frames) if model_frames else pd.DataFrame(columns=summary_columns)
display(full_frame)

print("Do edges correlate with schema-computation?")
corrtest = pearsonr(full_frame['edges'], full_frame['schema-computation'])
display(corrtest)

print("Do vertices correlate with vertexHashIndex?")
corrtest = pearsonr(full_frame['vertices'], full_frame['vertexHashIndex'])
display(corrtest)

display(full_frame['vertexHashIndex'].mean())

In [None]:
# Fit and plot a linear regression on columns of full_frame.
# Alternative pairs that were tried here:
#   ['edges'] -> ['schema-computation']
#   ['edges'] -> ['vertexHashIndex']
x_values = ['vertices']
y_value = ['vertexHashIndex']

X = full_frame[x_values].values
y = full_frame[y_value].values

lin = LinearRegression()
lin.fit(X, y)
display(lin.coef_)

# Visualising the Linear Regression results
plt.scatter(X[:, 0], y, color = 'blue')
plt.plot(X, lin.predict(X), color = 'red')
plt.title('Linear Regression')
# Pass the column names as text; a list would be rendered as "['...']".
plt.xlabel(', '.join(x_values))
plt.ylabel(', '.join(y_value))

plt.show()
# Last expression: the coefficient of determination (R^2) is the cell output.
lin.score(X, y)


In [None]:
# Per summary model: min / max / mean / std of the schema-change counters and
# of the ratios derived from them, written to "<appname>-stats-changes.csv".
#
# 'add_del-ratio' is listed as a column: it is computed below, and a row
# assigned with .loc only keeps values whose label is an existing column, so
# it used to be dropped silently.  'change-ratio' and 'update-ratio' are never
# filled by this cell; they are kept so the CSV layout stays compatible.
change_frame = pd.DataFrame(columns=['SE-ADD', 'SE-DEL','SE-MOD', 'SE-unchanged',
                                     'instances', 'stability-ratio', 'change-ratio',
                                     'update-ratio', 'add_del-ratio'])
iterations = 0

# Columns of the changes CSV that are not part of this summary.
unused_change_columns = [
    'TotalNumberOfNewInstances',
    'InstanceAddedWithKnownSchema (PE_add)',
    'InstancesDeleted (PE_del)',
    'ChangedSchemaStructuresBecauseOfNeighbor',
    'PayloadEntriesAdded',
    'PayloadEntriesRemoved',
    'InstanceToSchemaLinksAdded',
    'InstanceToSchemaLinksRemoved',
    'TotalNumberOfChangedPayloadElements (real PE_mod)',
    'TotalNumberOfSchemaElementsWritten',
    'TotalNumberOfSchemaElementsDeleted',
]

for collection in collections:
    csv_prefix = os.path.join(data_dir, appname + '_'+collection+suffix)
    with open(csv_prefix + '-changes.csv', 'r') as f:
        df = pd.read_csv(f, sep=',')
    # Index the size data by iteration too, so that 'instances' below is
    # matched to the changes of the same iteration.
    with open(csv_prefix + '-update-time-and-space.csv', 'r') as f2:
        df2 = pd.read_csv(f2, sep=',').set_index('Iteration')

    iterations = max(iterations, len(df))

    df = df.drop(columns=unused_change_columns).set_index('Iteration')
    # Positional rename: relies on exactly four columns remaining, in this order.
    df.columns = ['SE-ADD', 'SE-DEL','SE-MOD', 'SE-unchanged']
    df['instances'] = df2['Imprint links']
    df['stability-ratio'] = df['SE-unchanged'] / df['instances']
    df['add_del-ratio'] = (df['instances'] - df['SE-MOD'] - df['SE-unchanged'] ) / df['instances']

    change_frame.loc[collection+':min'] = df.min()
    change_frame.loc[collection+':max'] = df.max()
    change_frame.loc[collection+':mean'] = df.mean()
    change_frame.loc[collection+':std'] = df.std()

# Make sure the output directory exists before writing the statistics.
os.makedirs(out_dir, exist_ok=True)
change_frame.to_csv(out_dir + '/' + appname + '-stats-changes.csv', sep=',', encoding='utf-8')
display(change_frame)

In [None]:
# Per summary model: min / max / mean / std of the timing columns and of the
# batch-vs-incremental speed-up, written to "<appname>-stats-performance.csv".
# NOTE(review): 'Seepdup' looks like a typo of 'Speedup'; it is kept because it
# is the column header of the written CSV -- rename only together with its consumers.
performance_frame = pd.DataFrame(columns=['Load Graph', 'Parse Graph', 'Partition Graph', 'Schema Computation',
       'Updates', 'Total', 'Batch', 'Seepdup'])

for collection in collections:
    with open(os.path.join(data_dir, appname + '_'+collection+suffix+'-performance.csv'), 'r') as f:
        df = pd.read_csv(f, sep=',')

    df = df.set_index('Iteration')
    print(collection)
    # Scale every value by 1/1000/60 (presumably milliseconds to minutes --
    # TODO confirm the unit of the CSV).  Vectorised arithmetic replaces the
    # deprecated element-wise DataFrame.applymap.
    df = df / 1000 / 60
    df['Seepdup'] = df.Batch / df.Total
    display(df)
    performance_frame.loc[collection+':min'] = df.min()
    performance_frame.loc[collection+':max'] = df.max()
    performance_frame.loc[collection+':mean'] = df.mean()
    performance_frame.loc[collection+':std'] = df.std()
display(performance_frame)

# Make sure the output directory exists before writing the statistics.
os.makedirs(out_dir, exist_ok=True)
performance_frame.to_csv(out_dir + '/' + appname + '-stats-performance.csv', sep=',', encoding='utf-8')
    


In [None]:
# Read the edge count of every "*.txt" size file of the selected dataset into
# a frame indexed by iteration.
curr_dir = join(data_dir, data_size_dir)
onlyfiles = [f for f in listdir(curr_dir) if isfile(join(curr_dir, f)) and f.endswith(".txt")]
# Natural sort, as in the other cells: with a plain lexicographic sort the
# fallback numbering below would place e.g. "file10" before "file2".
onlyfiles.sort(key=natural_keys)

edgecounts = {}
tmp = 0
for filename in onlyfiles:
    # first line: number of types in dataset (not unique)
    # second line: number of edges in dataset (not unique)
    with open(join(curr_dir, filename), 'r') as f:
        content = f.read().split('\n')
    # With at most two chunks after splitting, the first one is taken as the edge count.
    edges = int(content[1]) if len(content) > 2 else int(content[0])

    if 'iteration' in filename:
        # Strip everything except the iteration number from the file name.
        iteration = filename
        for token in ('iteration', '-', '.txt', '.gz', '.nq', '.nt'):
            iteration = iteration.replace(token, '')
        edgecounts[int(iteration)] = edges
    else:
        # No iteration in the name: number the files in sort order, from 0.
        # NOTE(review): the other cells start this fallback numbering at 1.
        edgecounts[tmp] = edges
        tmp = tmp + 1

edgeframe = pd.DataFrame.from_dict(edgecounts, orient='index').sort_index()


In [None]:
# Per summary model: min / max / mean / std of the size counters and of the
# summarization ratio, written to "<appname>-stats-size.csv".
# NOTE(review): this rebinds `size_frame`, which the correlation and regression
# cells further up use for a different frame; re-running those cells after
# this one needs the correlation cell to be run again first.
size_frame = pd.DataFrame(columns= ['SE links', 'Imprint links', 'Schema Elements (SE)',
       'Schema Relations (SR)', 'summarization-ratio'])

# Timing, checksum and byte-size columns that are not part of this summary.
unused_size_columns = [
    'SecondaryIndex Read time (ms)',
    'SecondaryIndex Write time (ms)',
    'SecondaryIndex Del time (ms)',
    'SecondaryIndex Total time (ms)',
    'Checksum links',
    'SG Read time (ms)',
    'SG Write time (ms)',
    'SG Del time (ms)',
    'Sec. Index Size (bytes)',
    'Index Size (bytes)',
    'Graph Size (bytes)',
]

for collection in collections:
    with open(os.path.join(data_dir, appname + '_'+collection+suffix+'-update-time-and-space.csv'), 'r') as f:
        df = pd.read_csv(f, sep=',')

    df = df.drop(columns=unused_size_columns).set_index('Iteration')
    print(df.columns)

    # Compute the ratio per iteration and aggregate it afterwards.  Dividing
    # the aggregated rows instead (min by min, std by std, ...) does not give
    # the min / std of the ratio.
    df['summarization-ratio'] = df['Imprint links'] / df['SE links']

    size_frame.loc[collection+':min'] = df.min()
    size_frame.loc[collection+':max'] = df.max()
    size_frame.loc[collection+':mean'] = df.mean()
    size_frame.loc[collection+':std'] = df.std()

display(size_frame)

# Make sure the output directory exists before writing the statistics.
os.makedirs(out_dir, exist_ok=True)
size_frame.to_csv(out_dir + '/' + appname + '-stats-size.csv', sep=',', encoding='utf-8')

In [None]:
# Build one frame with per-iteration measurements of every (dataset, summary
# model) pair found in the "test" directory; the first row of every pair is
# left out.
#
# NOTE(review): this cell rebinds the notebook-wide `data_dir`, `collections`,
# `appnames`, `suffix` and (through the loop) `appname`; cells further up read
# different inputs when they are re-run afterwards.
correlation_frame = pd.DataFrame()
data_dir = "test"
collections = ['schemex', 'type', 'attribute']
appnames = ["LUBM","BSBM","dyldo_y2019_core"]
suffix = ''

# One frame per (appname, collection), concatenated once after the loops
# (DataFrame.append was removed in pandas 2.0).
run_frames = []
for appname in appnames:
    for collection in collections:
        csv_prefix = os.path.join(data_dir, appname + '_'+collection+suffix)

        with open(csv_prefix + '-performance.csv', 'r') as f:
            df = pd.read_csv(f, sep=',')
        display(df.head())

        with open(csv_prefix + '-changes.csv', 'r') as f2:
            df2 = pd.read_csv(f2, sep=',')
        display(df2)

        with open(csv_prefix + '-update-time-and-space.csv', 'r') as f3:
            df3 = pd.read_csv(f3, sep=',')

        # Index all three inputs by iteration: the columns below combine them
        # by index label, so a mix of positional and iteration indexes would
        # pair values of different iterations.
        df = df.set_index('Iteration')
        df2 = df2.set_index('Iteration')
        df3 = df3.set_index('Iteration')

        # Column order matters: later cells select columns by position.
        temp_frame = pd.DataFrame()
        temp_frame['schema-computation'] = df['Schema Computation']
        temp_frame['performance'] = df.Total
        temp_frame['updates'] = df['Updates']
        temp_frame['size'] = df3['Imprint links']

        # NOTE(review): degree_frame comes from the degree cell and belongs to
        # the dataset selected there, not necessarily to `appname` -- confirm.
        temp_frame['max-degree'] = degree_frame['max_degree']
        temp_frame['avg-degree'] = degree_frame['avg_degree']

        changed_instances = df3['Imprint links'] - df2['InstanceNotChanged (PE_mod)']
        schema_updates = df2['NewlyObservedSchema (SE_new)'] + df2['DeletedSchemaStructures (SE_del)']
        temp_frame['change-ratio'] = changed_instances / df3['Imprint links']
        temp_frame['update-ratio'] = schema_updates / df3['Schema Elements (SE)']
        temp_frame['total-changes'] = changed_instances
        temp_frame['total-updates'] = schema_updates
        temp_frame['summary-size'] = df3['Schema Elements (SE)']
        # Relates the instances to the 'SE links' value of the previous row.
        temp_frame['summarization-ratio'] =  df3['Imprint links']/ df3['SE links'].shift(1)

        temp_frame['summary-adds'] = df2['NewlyObservedSchema (SE_new)']
        temp_frame['add-vs-updates'] = df2['NewlyObservedSchema (SE_new)'] / df2['TotalNumberOfSchemaElementsWritten']
        temp_frame['del-vs-update'] = df2['DeletedSchemaStructures (SE_del)'] / df2['TotalNumberOfSchemaElementsDeleted']

        # Skip the first row (its shifted summarization ratio is undefined).
        run_frames.append(temp_frame.iloc[1:])

if run_frames:
    correlation_frame = pd.concat(run_frames)

correlation_frame['combined'] = correlation_frame['size'] * correlation_frame['summarization-ratio']

# NOTE(review): this is additions divided by update time, while the name and
# the 'ms per addition' label printed further down suggest the inverse -- confirm.
correlation_frame['add-time'] = correlation_frame['summary-adds'] / correlation_frame['updates']

display(correlation_frame)

print(correlation_frame['add-time'].mean())

In [None]:
# Bare expression: the frame is rendered as this cell's output.
correlation_frame

In [None]:
# Print mean and standard deviation of the 'add-time' column.
# NOTE(review): 'add-time' is computed as 'summary-adds' / 'updates', i.e.
# additions per unit of update time, which is the inverse of what the label
# below says -- confirm which one is intended.
print('ms per addition')
add_time = correlation_frame['add-time']
for statistic in (add_time.mean(), add_time.std()):
    print(statistic)

In [None]:
# Fit and plot a linear regression on columns of correlation_frame.
x_values = ['size']
y_value = ['schema-computation']
X = correlation_frame[x_values].values
y = correlation_frame[y_value].values

lin = LinearRegression()
lin.fit(X, y)
display(lin.coef_)

# Visualising the Linear Regression results
plt.scatter(X[:, 0], y, color = 'blue')
plt.plot(X, lin.predict(X), color = 'red')
plt.title('Linear Regression')
# Pass the column names as text; a list would be rendered as "['...']".
plt.xlabel(', '.join(x_values))
plt.ylabel(', '.join(y_value))

plt.show()
# Last expression: the coefficient of determination (R^2) is the cell output.
lin.score(X, y)

In [None]:
# The total computation depends on the dataset, not only on its size (param = 1)
# The change ratio does not necessarily


In [None]:
# Regress the first column of correlation_frame on its fifth column; both are
# selected by position and the axis labels are taken from the column names.
x_pos, y_pos = 4, 0
X = correlation_frame.iloc[:, x_pos:x_pos + 1].values
y = correlation_frame.iloc[:, y_pos].values

lin = LinearRegression()
lin.fit(X, y)

# Scatter of the data in blue, fitted line in red.
plt.scatter(X, y, color='blue')
plt.plot(X, lin.predict(X), color='red')
plt.title('Linear Regression')
plt.xlabel(correlation_frame.columns[x_pos])
plt.ylabel(correlation_frame.columns[y_pos])

plt.show()

In [None]:
# Regress the first column of correlation_frame on its fourth column; both are
# selected by position.
X = correlation_frame.iloc[:, 3:4].values
y = correlation_frame.iloc[:, 0].values
lin = LinearRegression()

lin.fit(X, y)
# Visualising the Linear Regression results
plt.scatter(X, y, color = 'blue')

plt.plot(X, lin.predict(X), color = 'red')
plt.title('Linear Regression')
# Label the axes with the columns that are actually plotted.  The former
# hard-coded 'Combined-factor ratio' / 'Speed-up' labels did not correspond to
# the columns at positions 3 and 0.
plt.xlabel(correlation_frame.columns[3])
plt.ylabel(correlation_frame.columns[0])


plt.show()