The goal of this notebook is to make statistics on the data that is provided in the OMG implementation.

In [2]:
import sys
import os

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the parent directory to sys.path
sys.path.append(parent_dir)

In [3]:

import csv
import pandas as pd
import plotly.express as px
from utilities import read_files
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [4]:
def get_file_length(file):
    with open(file, 'r', encoding="utf8") as f:
        return len(f.readlines())
    
def get_number_of_keyword(file, keyword):
    with open(file, 'r', encoding="utf8") as f:
        return len([line for line in f.readlines() if keyword.lower() in line.lower()])

The following cell doesn't have to be ran everytime, it can only be ran once to generate the data.csv file.

In [7]:
files = read_files("../data/examples")

# Keywords is a file of all the SysMLV2 components we are interested on.
keywords = open("keywords.txt", "r").read().split("\n")[:-1]

save_file = csv.writer(open("data.csv", "w", newline=''))
save_file.writerow(["file", "length"] + keywords)

for file in files:
    save_file.writerow([file, get_file_length(file)] + [get_number_of_keyword(file, keyword.replace(" ", "")) for keyword in keywords])

If you already have data.csv, you can move here directly

In [8]:
df = pd.read_csv("data.csv")

# file_short is the name of the file without the path
df['file_short'] = df['file'].str.split('/').str[-1]


In [9]:
for keyword in keywords:
    print("num of occurence of", keyword, ":", len(df[df[keyword] > 0]))

num of occurence of import  : 61
num of occurence of calc  : 12
num of occurence of package  : 86
num of occurence of action  : 32
num of occurence of in  : 76
num of occurence of out  : 24
num of occurence of part  : 72
num of occurence of part def  : 1
num of occurence of attribute   : 49
num of occurence of port  : 65
num of occurence of port def  : 0
num of occurence of interface def  : 0
num of occurence of interface  : 11
num of occurence of connect  : 18
num of occurence of allocation def  : 0
num of occurence of allocation  : 1
num of occurence of allocate  : 1
num of occurence of case  : 7
num of occurence of actor  : 4
num of occurence of subject  : 10
num of occurence of objective  : 7
num of occurence of assert  : 10
num of occurence of namespace  : 0
num of occurence of requirement def  : 0
num of occurence of stakeholder  : 1
num of occurence of analysis def  : 0
num of occurence of satisfy  : 3


In [10]:
for keyword in keywords:
    if len(df[df[keyword] > 0]) < 9:
        df = df.drop(columns=[keyword])
        
df

Unnamed: 0,file,length,import,calc,package,action,in,out,part,attribute,port,interface,connect,subject,assert,file_short
0,../data/examples/Analysis Examples/AnalysisAnn...,26,3,0,1,1,8,3,0,0,3,0,0,0,0,AnalysisAnnotation.sysml
1,../data/examples/Analysis Examples/Dynamics.sysml,91,2,7,1,5,41,12,0,22,2,0,0,0,1,Dynamics.sysml
2,../data/examples/Analysis Examples/Turbojet St...,110,4,8,4,5,30,2,4,14,4,0,0,0,0,Turbojet Stage Analysis.sysml
3,../data/examples/Analysis Examples/Vehicle Ana...,286,14,8,6,3,66,11,8,30,22,0,0,4,2,Vehicle Analysis Demo.sysml
4,../data/examples/Arrowhead Framework Example/A...,55,3,0,1,1,6,0,0,2,11,0,0,0,0,AHFCoreLib.sysml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,../data/examples/v1 Spec Examples/D.4.7.8 Dyna...,88,3,0,1,0,27,0,0,46,3,0,0,0,0,HSUVDynamics.sysml
85,../data/examples/Variability Examples/VehicleV...,165,6,0,10,20,31,2,36,6,13,0,0,0,3,VehicleVariabilityModel.sysml
86,../data/examples/Vehicle Example/VehicleDefini...,54,5,0,1,0,12,1,7,2,12,3,0,0,0,VehicleDefinitions.sysml
87,../data/examples/Vehicle Example/VehicleIndivi...,111,5,0,4,0,21,0,10,6,6,0,0,0,3,VehicleIndividuals.sysml


In [11]:
df.to_csv("data.csv", index=False)

In [12]:
fig = px.bar(df, x='file_short', y='length', log_y=True,
             title="Length of files, in number of lines",
             color_discrete_sequence=['#2e1b11'])
fig.update_layout(
    bargap=0.2,                     # Add spacing between bars
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028'         # Background of the entire figure
)
fig.show()

In [13]:
# Count occurrences of each bin
counts = df['length'].value_counts().sort_index()

# Convert to a DataFrame
counts_df = counts.reset_index()
counts_df.columns = ['length', 'count']

# Plot using a bar chart for control over sorting
fig = px.histogram(df, y='length', nbins=50, title='Distribution of File Lengths', orientation='h')
fig.update_layout(
    bargap=0.2,                     # Add spacing between bars
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028',        # Background of the entire figure
    font=dict(color='#FFFFFF'),     # Text color (white for contrast)
    title_font=dict(size=20, color='#FFFFFF'),  # Title font
    xaxis=dict(
        title_text='Frequency',        # X-axis label
        title_font=dict(size=16, color='#FFFFFF'),  # X-axis label color
        tickfont=dict(size=12, color='#FFFFFF')     # X-axis tick color
    ),
    yaxis=dict(
        title_text='Length',     # Y-axis label
        title_font=dict(size=16, color='#FFFFFF'),  # Y-axis label color
        tickfont=dict(size=12, color='#FFFFFF')     # Y-axis tick color
    )
)

fig.show()


In [14]:
attribute_counts = copy_of_df.sum()
attribute_counts = attribute_counts.sort_values()
# Create a bar plot
fig = px.bar(
    attribute_counts, 
    x=attribute_counts.index, 
    y=attribute_counts.values, 
    title='Sum of Occurrences per Attribute', 
    labels={'x': 'Attribute', 'y': 'Count'},
    color_discrete_sequence=['#009999']
)

counts_df = attribute_counts.reset_index()
counts_df.columns = ['Attribute', 'Count']

fig = px.bar(
    counts_df,
    x='Attribute',
    y='Count',
    title='Sum of Occurrences per Attribute',
    labels={'Attribute': 'Attribute', 'Count': 'Count'},
    color_discrete_sequence=['#009999']  # Set bar color to match previous plot
)

# Step 5: Update layout for better aesthetics and contrast
fig.update_layout(
    bargap=0.2,                     # Add spacing between bars
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028',        # Background of the entire figure
    font=dict(color='#FFFFFF'),     # Set text color to black for contrast
    title_font=dict(size=20, color='#FFFFFF'),  # Title font settings
    xaxis=dict(
        title_text='Attribute',
        title_font=dict(size=16, color='#FFFFFF'),
        tickfont=dict(size=12, color='#FFFFFF')
    ),
    yaxis=dict(
        title_text='Count',
        title_font=dict(size=16, color='#FFFFFF'),
        tickfont=dict(size=12, color='#FFFFFF')
    )
)
fig.show()

NameError: name 'copy_of_df' is not defined

In [15]:
# split the files per folder
df['folder'] = df['file'].str.split('/').str[-2]

# make a pie chart of the number of files per folder
fig = px.pie(df, names='folder')
fig.update_layout(
    plot_bgcolor='#000028',         # Background of the plot area
    paper_bgcolor='#000028', # Background of the entire figure
    font=dict(color='#FFFFFF'),     # Set text color to black for contrast
    title_font=dict(size=20, color='#FFFFFF'),  # Title font settings
    xaxis=dict(
        title_text='Attribute',
        title_font=dict(size=16, color='#FFFFFF'),
        tickfont=dict(size=12, color='#FFFFFF')
    ),
    
)
fig.show()

In [16]:
df['length'].describe()

count     89.000000
mean      60.303371
std       56.966179
min       12.000000
25%       24.000000
50%       39.000000
75%       88.000000
max      319.000000
Name: length, dtype: float64