The goal of this notebook is to compute statistics on the data provided in the OMG implementation.

In [10]:
import sys
import os

# Resolve the directory one level above the current working directory ...
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, '..'))

# ... and make it importable, so that modules living there can be imported below.
sys.path.append(parent_dir)

In [11]:

import csv
import pandas as pd
import plotly.express as px
from utilities import read_files
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [1]:
def get_file_length(file):
    """Return the number of lines in the UTF-8 text file located at ``file``."""
    line_total = 0
    with open(file, mode='r', encoding="utf8") as handle:
        # Iterating the handle yields one item per line.
        for _ in handle:
            line_total += 1
    return line_total
    
def get_number_of_keyword(file, keyword):
    """Count the lines of ``file`` that contain ``keyword``, ignoring case.

    The match is a plain substring test, so ``"port"`` also matches a line
    containing ``"import"``. A line is counted once, however many times the
    keyword appears in it.
    """
    with open(file, mode='r', encoding="utf8") as handle:
        # Each matching line contributes True (== 1) to the sum.
        return sum(keyword.lower() in line.lower() for line in handle)

The following cell does not need to be run every time; running it once is enough to generate the data.csv file.

In [17]:
# Regenerate data.csv: one row per example file, holding its line count and,
# for every keyword, the number of lines that mention it.
files = read_files("../data/examples")

# keywords.txt lists, one per line, the SysML v2 constructs we are interested in.
# Each entry is stripped and blank lines are skipped, so trailing whitespace
# cannot leak into a column name and a missing final newline cannot silently
# drop the last keyword.
with open("keywords.txt", "r", encoding="utf8") as keyword_file:
    keywords = [line.strip() for line in keyword_file if line.strip()]

# The context manager guarantees that data.csv is flushed and closed before the
# next cell reads it back with pandas.
with open("data.csv", "w", newline='', encoding="utf8") as csv_file:
    save_file = csv.writer(csv_file)
    save_file.writerow(["file", "length"] + keywords)

    for file in files:
        # Search for the keyword as written: multi-word keywords such as
        # "part def" keep their inner space instead of collapsing to "partdef",
        # which would never match.
        keyword_counts = [get_number_of_keyword(file, keyword) for keyword in keywords]
        save_file.writerow([file, get_file_length(file)] + keyword_counts)

If you already have data.csv, you can start directly from here.

In [20]:
# Load the per-file statistics from data.csv
df = pd.read_csv("data.csv")

# file_short: the file name alone, i.e. whatever follows the last '/' of the path
df['file_short'] = df['file'].str.rsplit('/', n=1).str[-1]



In [26]:
# For every keyword, report in how many rows (files) its count is above zero.
for keyword in keywords:
    files_with_keyword = int((df[keyword] > 0).sum())
    print("num of occurence of", keyword, ":", files_with_keyword)

num of occurence of import  : 60
num of occurence of calc  : 8
num of occurence of package  : 86
num of occurence of action  : 31
num of occurence of in  : 41
num of occurence of out  : 19
num of occurence of part  : 72
num of occurence of part def  : 54
num of occurence of attribute   : 0
num of occurence of port  : 64
num of occurence of port def  : 16
num of occurence of interface def  : 5
num of occurence of interface  : 11
num of occurence of connect  : 14
num of occurence of allocation def  : 1
num of occurence of allocation  : 1
num of occurence of allocate  : 1
num of occurence of case  : 6
num of occurence of actor  : 1
num of occurence of subject  : 10
num of occurence of objective  : 7
num of occurence of assert  : 9
num of occurence of namespace  : 0
num of occurence of requirement def  : 8
num of occurence of stakeholder  : 1
num of occurence of analysis def  : 4
num of occurence of satisfy  : 3


In [27]:
# Keep only the keywords that appear in at least MIN_FILES_PER_KEYWORD files.
MIN_FILES_PER_KEYWORD = 9

# Only keywords that are still columns of df are considered, so re-running this
# cell after the rare columns were already dropped does not raise a KeyError.
rare_keywords = [
    keyword for keyword in keywords
    if keyword in df.columns and (df[keyword] > 0).sum() < MIN_FILES_PER_KEYWORD
]
df = df.drop(columns=rare_keywords)

df

Unnamed: 0,file,length,import,package,action,in,out,part,part def,port,port def,interface,connect,subject,assert,file_short
0,./data/examples/Analysis Examples/AnalysisAnno...,26,3,1,1,7,3,0,0,3,0,0,0,0,0,AnalysisAnnotation.sysml
1,./data/examples/Analysis Examples/Dynamics.sysml,91,2,1,4,27,12,0,0,2,0,0,0,0,1,Dynamics.sysml
2,./data/examples/Analysis Examples/Turbojet Sta...,110,4,4,5,11,2,4,2,4,0,0,0,0,0,Turbojet Stage Analysis.sysml
3,./data/examples/Analysis Examples/Vehicle Anal...,286,14,6,3,36,7,8,3,21,1,0,0,4,2,Vehicle Analysis Demo.sysml
4,./data/examples/Arrowhead Framework Example/AH...,55,3,1,1,0,0,0,0,11,4,0,0,0,0,AHFCoreLib.sysml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,./data/examples/v1 Spec Examples/D.4.7.8 Dynam...,88,3,1,0,2,0,0,0,3,0,0,0,0,0,HSUVDynamics.sysml
85,./data/examples/Variability Examples/VehicleVa...,165,6,10,14,0,0,31,6,13,1,0,0,0,3,VehicleVariabilityModel.sysml
86,./data/examples/Vehicle Example/VehicleDefinit...,54,5,1,0,2,1,7,6,12,3,3,0,0,0,VehicleDefinitions.sysml
87,./data/examples/Vehicle Example/VehicleIndivid...,111,5,4,0,2,0,10,5,5,0,0,0,0,0,VehicleIndividuals.sysml


In [28]:
# Persist the current frame without its index.
# NOTE(review): this overwrites the same data.csv that the generation cell writes
# and the loading cell reads; when the cells are run top to bottom the file then
# holds the filtered columns plus file_short — confirm this is intended.
df.to_csv("data.csv", index=False)

In [29]:
# Bar chart of the number of lines in each file, on a logarithmic y axis.
# The bars use the same '#009999' as the attribute plot: the previous '#2e1b11'
# was nearly indistinguishable from the dark background.
fig = px.bar(df, x='file_short', y='length', log_y=True,
             title="Length of files, in number of lines",
             color_discrete_sequence=['#009999'])
fig.update_layout(
    bargap=0.2,                     # Add spacing between bars
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028',        # Background of the entire figure
    font=dict(color='#FFFFFF')      # White text, as in the other figures; the default dark text is unreadable here
)
fig.show()

In [37]:
# Number of files observed for each distinct file length, ordered by length
counts = df['length'].value_counts().sort_index()

# The same counts as a two-column table (length, count)
counts_df = counts.reset_index().set_axis(['length', 'count'], axis=1)

# Horizontal histogram of the file lengths, grouped into at most 50 bins
fig = px.histogram(df, y='length', nbins=50, orientation='h',
                   title='Distribution of File Lengths')

# Dark theme with white text; nested layout properties are addressed with
# plotly's underscore notation (xaxis_title_text == xaxis=dict(title_text=...)).
fig.update_layout(
    bargap=0.2,
    plot_bgcolor='#000028',
    paper_bgcolor='#000028',
    font=dict(color='#FFFFFF'),
    title_font=dict(size=20, color='#FFFFFF'),
    xaxis_title_text='Frequency',
    xaxis_title_font=dict(size=16, color='#FFFFFF'),
    xaxis_tickfont=dict(size=12, color='#FFFFFF'),
    yaxis_title_text='Length',
    yaxis_title_font=dict(size=16, color='#FFFFFF'),
    yaxis_tickfont=dict(size=12, color='#FFFFFF'),
)

fig.show()



In [45]:
# NOTE(review): copy_of_df was not defined in any earlier cell, so this cell
# raised a NameError on a fresh kernel. It is rebuilt here as the numeric columns
# of df minus the line-count column — assumed to be exactly the keyword count
# columns; TODO confirm this matches the frame originally used.
copy_of_df = df.select_dtypes(include='number').drop(columns=['length'], errors='ignore')

# Sum of every column over all files, smallest total first
attribute_counts = copy_of_df.sum().sort_values()

# Long format (one row per attribute) so that the plot columns are named explicitly
counts_df = attribute_counts.reset_index()
counts_df.columns = ['Attribute', 'Count']

# A single figure is built; the earlier duplicate px.bar call whose result was
# immediately overwritten has been removed.
fig = px.bar(
    counts_df,
    x='Attribute',
    y='Count',
    title='Sum of Occurrences per Attribute',
    color_discrete_sequence=['#009999']
)

# Dark theme with white text for contrast
fig.update_layout(
    bargap=0.2,                     # Add spacing between bars
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028',        # Background of the entire figure
    font=dict(color='#FFFFFF'),     # White text for contrast with the dark background
    title_font=dict(size=20, color='#FFFFFF'),  # Title font settings
    xaxis=dict(
        title_text='Attribute',
        title_font=dict(size=16, color='#FFFFFF'),
        tickfont=dict(size=12, color='#FFFFFF')
    ),
    yaxis=dict(
        title_text='Count',
        title_font=dict(size=16, color='#FFFFFF'),
        tickfont=dict(size=12, color='#FFFFFF')
    )
)
fig.show()

In [None]:
# Folder that directly contains each file: the second-to-last '/'-separated
# component of the path
df['folder'] = df['file'].str.split('/').str[-2]

# Pie chart of the number of files per folder
fig = px.pie(df, names='folder', title='Number of files per folder')
fig.update_layout(
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028',        # Background of the entire figure
    font=dict(color='#FFFFFF'),     # White text for contrast with the dark background
    title_font=dict(size=20, color='#FFFFFF'),  # Title font settings
    # No axis settings here: a pie chart has no x axis, and the former
    # 'Attribute' x-axis block was a leftover from the bar-chart cell.
)
fig.show()

In [120]:
# Summary statistics (count, mean, std, quartiles, min/max) of the per-file line count
df['length'].describe()

count      90.000000
mean       77.188889
std       169.910412
min        12.000000
25%        24.250000
50%        39.500000
75%        88.000000
max      1580.000000
Name: length, dtype: float64

In [121]:
# Summary statistics of the per-file values stored in the "import" column
df["import"].describe()

count    90.000000
mean      2.933333
std       6.612552
min       0.000000
25%       0.000000
50%       1.000000
75%       3.000000
max      58.000000
Name: import, dtype: float64

In [122]:
# Summary statistics of the per-file values stored in the "package" column
df["package"].describe()

count    90.000000
mean      2.344444
std       6.052506
min       0.000000
25%       1.000000
50%       1.000000
75%       2.000000
max      57.000000
Name: package, dtype: float64