In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

# Where the analysis results are read from and where the figures are written.
input_file = "/home/tim/cluster/openmp-usage-analysis/result.csv"
plot_path = "/home/tim/paper/openmp-analysis-paper/media"

# Use seaborn's white background with grid lines for the figures below.
sns.set_style(style="whitegrid")

# The first CSV column becomes the row index of the frame.
df = pd.read_csv(filepath_or_buffer=input_file, index_col=0)


In [None]:
# Cutoff for the overhead classification: rows whose `instructions_weighted`
# is below this value are flagged `large_overhead`, rows at or above it are
# flagged `small_overhead`.
OVERHEAD_THRESHOLD = 100

In [None]:
# Flag every row against the threshold. Two explicit comparisons are used
# (rather than negating one) so a missing instruction count is False in both.
df["large_overhead"] = df["instructions_weighted"].lt(OVERHEAD_THRESHOLD)
df["small_overhead"] = df["instructions_weighted"].ge(OVERHEAD_THRESHOLD)

In [None]:
# Number of distinct values in the `Code` column (a missing value counts once).
df["Code"].nunique(dropna=False)

In [None]:
# No-op placeholder cell; it has no effect on the analysis.
pass

In [None]:
# Number of parallel regions per Repo
ax = df.groupby("Code").size().value_counts().sort_index().plot.bar()
ax.set_xlabel("Number of parallel Regions")
ax.set_ylabel("number of Codes")
ax.set_title("Number of parallel regions per Repo")
plt.savefig(os.path.join(plot_path, "num_regions.pdf"))
plt.tight_layout()
plt.show()


In [None]:
# Sum of the `default_tripcount_loops` column over all rows.
df.loc[:, "default_tripcount_loops"].sum()

In [None]:
# Sum of the `known_tripcount_loops` column over all rows.
df.loc[:, "known_tripcount_loops"].sum()

In [None]:
# Sum of the `thread_dependant_trip_count_loops` column over all rows.
df.loc[:, "thread_dependant_trip_count_loops"].sum()

Fragen:
Wie viele User beachten die Aspekte, die für eine gute OpenMP-Nutzung wichtig sind?


Analyse pro Projekt

Verhältnis Overhead/Parallel-Region pro Projekt
Overhead mit 100 Instruktionen annehmen
Codegröße insgesamt
Wie viele Regionen pro Projekt, welcher Typ?

Welche Konstrukte?

In [None]:
# Histogram of `instructions_flat` per row on 50 log-spaced bin edges (1 .. 1e6).
instruction_bins = np.logspace(start=0, stop=6, num=50)
ax = df["instructions_flat"].plot.hist(bins=instruction_bins)
ax.set(
    xscale="log",
    xlabel="Instructions in Parallel region",
    title="Distribution of Instruction counts of Parallel region",
)
plt.savefig(os.path.join(plot_path, "flat_instructions_distribution.pdf"))

In [None]:
# Histogram of `instructions_weighted` per row on 50 log-spaced bin edges (1 .. 1e6).
instruction_bins = np.logspace(start=0, stop=6, num=50)
ax = df["instructions_weighted"].plot.hist(bins=instruction_bins)
ax.set(
    xscale="log",
    xlabel="Instructions in Parallel region",
    title="Distribution of Instruction counts of Parallel region",
)
plt.savefig(os.path.join(plot_path, "weighted_instructions_distribution.pdf"))

In [None]:
# Per code: number of rows in each overhead class, plus the share of
# large-overhead rows. Note that `percentage` holds a 0..1 ratio.
overhead_classes = ["large_overhead", "small_overhead"]
df_overhead_rating = df.groupby("Code")[overhead_classes].sum()
regions_total = df_overhead_rating["large_overhead"].add(df_overhead_rating["small_overhead"])
df_overhead_rating["percentage"] = df_overhead_rating["large_overhead"].div(regions_total)

In [None]:
# Plot an excerpt: overhead class counts of the first ten codes.
# Only the two count columns are stacked; the derived `percentage` ratio is on
# a different scale and must not be stacked on top of the counts.
ax = df_overhead_rating[["large_overhead", "small_overhead"]].iloc[0:10].plot.barh(stacked=True)

In [None]:
# Repos with at least one example of large overhead region
# (`large_overhead` is a per-code count, so ">= 1" means "at least one").
(df_overhead_rating["large_overhead"] >= 1).sum()

In [None]:
# Histogram over codes of the share of rows classified as large overhead.
fraction_per_code = df_overhead_rating["percentage"]
ax = fraction_per_code.plot.hist(bins=100)
ax.set(
    xlabel="Fraction of high overhead parallel regions",
    ylabel="number of Codes",
    title="Fraction of high overhead parallel regions per project",
)

#plt.savefig(os.path.join(plot_path,"plot.pdf"))

In [None]:
# One-row strip chart: every code is drawn as a unit-width segment, ordered and
# coloured by its fraction of high-overhead parallel regions.
fig, ax = plt.subplots(figsize=(10, 2))

data = df_overhead_rating['percentage'].sort_values()

# Normalize the percentage values to use as colors
norm = plt.Normalize(data.min(), data.max())
sm = plt.cm.ScalarMappable(cmap="cool", norm=norm)
colors = sm.to_rgba(data)

# Plot all horizontal bar segments in one call: segment i spans [i, i + 1]
# on the x-axis and takes the i-th colour.
ax.barh(0, 1, left=np.arange(len(data)), color=colors)

# Set the labels and title
ax.set_xlabel("Number of Codes")
ax.set_yticks([])  # Hide y-axis ticks
ax.set_xlim([0, len(data)])
ax.set_title("Fraction of high overhead parallel regions per project")

# Add a colorbar that serves as the legend for the segment colours
cbar = plt.colorbar(sm, ax=ax, orientation='horizontal', pad=0.4)
cbar.set_label('Fraction of high overhead parallel regions')
fig.tight_layout()
plt.savefig(os.path.join(plot_path,"fraction_per_project.pdf"))
plt.show()


In [None]:
#df_loops = df.groupby("Code")[["default_tripcount_loops","known_tripcount_loops","thread_dependant_trip_count_loops"]].sum()

In [None]:
# Row-wise total of the three trip-count loop columns.
loop_columns = [
    "default_tripcount_loops",
    "known_tripcount_loops",
    "thread_dependant_trip_count_loops",
]
df["loop_sum"] = df[loop_columns].sum(axis="columns")

In [None]:
# Display the frame for inspection (the cell's last expression is rendered).
df

In [None]:
# Histogram of `loop_sum` per row on 20 log-spaced bin edges (1 .. 100).
loop_bins = np.logspace(start=0, stop=2, num=20)
ax = df["loop_sum"].plot.hist(bins=loop_bins)
ax.set(
    xscale="log",
    xlabel="Loops in Parallel region",
    title="Distribution of loop counts in Parallel region",
)
#plt.savefig(os.path.join(plot_path, "loops_distribution.pdf"))

In [None]:
# Manual sorting into the bins.
# Rows are binned by `loop_sum` on 20 log-spaced bins (1 .. 100). For every bin
# and loop category we accumulate the *fraction* of the row's loops that fall
# into that category, so a row adds 1 in total to its bin across the three lists.
bins = np.logspace(start=0, stop=2, num=21)
n_bins = len(bins) - 1

regions_with_loops = df[df["loop_sum"] > 0]
loop_counts = regions_with_loops["loop_sum"].to_numpy(dtype=float)

# Bin i covers [bins[i], bins[i + 1]), matching the bars drawn from
# bins[i] with width bins[i + 1] - bins[i]. searchsorted(side="right") returns
# the insertion point just after the matching left edge, hence the "- 1".
bin_index = np.searchsorted(bins, loop_counts, side="right") - 1
# Like np.histogram, the last bin also includes its right edge.
bin_index[loop_counts == bins[-1]] = n_bins - 1
# Rows outside the bin range cannot be drawn and are left out.
in_range = (bin_index >= 0) & (bin_index < n_bins)


def _binned_fraction(column):
    """Return, per bin, the summed share of each row's loops found in `column`."""
    fraction = regions_with_loops[column].to_numpy(dtype=float) / loop_counts
    return np.bincount(
        bin_index[in_range], weights=fraction[in_range], minlength=n_bins
    ).tolist()


# One list of length len(bins) - 1 per loop category, aligned with bins[:-1].
default = _binned_fraction("default_tripcount_loops")
known = _binned_fraction("known_tripcount_loops")
thread = _binned_fraction("thread_dependant_trip_count_loops")

# Stacked bar chart of the binned loop-category shares on a log x-axis.
fig, ax = plt.subplots()

left_edges = bins[:-1]
bar_widths = np.diff(bins)
stacked_series = (
    ("default_tripcount_loops", default),
    ("known_tripcount_loops", known),
    ("thread_dependant_tripcount_loops", thread),
)

# Each series is drawn on top of the running total of the previous ones.
baseline = np.zeros(len(left_edges))
for series_label, heights in stacked_series:
    ax.bar(
        left_edges,
        heights,
        bottom=baseline,
        label=series_label,
        width=bar_widths,
        align='edge',
    )
    baseline = baseline + np.asarray(heights)

# bar coloring according to percentages of loops
ax.set(
    xscale="log",
    xlabel="Loops in Parallel region",
    ylabel="Frequency",
    title="Distribution of loop counts in Parallel region",
)
ax.legend()

plt.savefig(os.path.join(plot_path, "loops_distribution.pdf"))
plt.show()