In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
import constants as c
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import utils

In [None]:
# Flag that presumably controls whether the source data is fetched again
# instead of reusing what is already on disk.
# NOTE(review): not referenced in the cells visible here; confirm where it is consumed.
REFETCH_DATA = False

In [None]:
# Token read from the environment variable named by c.AZURE_TOKEN_ENV_KEY;
# the value is None when that variable is not set.
azure_pat = os.environ.get(c.AZURE_TOKEN_ENV_KEY)

In [None]:
# Load the raw records, one text line per entry (newline characters are kept
# by readlines() and stripped later during parsing).
with open(c.USINGS_TXT_FILE_PATH, 'r') as f:
    all_files = f.readlines()

In [None]:
# Parse every record into a {file path: line count} mapping and keep a running
# total of all line counts. Each record is comma-separated: the first field is
# a DevOps URL and the last field is the line count.
lines = 0
file_lines = {}
for line in all_files:
    record = line.strip()
    if not record:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no data and must not produce an entry.
        continue
    fields = record.split(",")
    devops_url = fields[0]
    file_path = utils.get_file_path_from_devops_url(devops_url)
    # An empty last field means no count was recorded; treat it as 0 lines.
    number = int(fields[-1]) if fields[-1] else 0
    lines += number
    file_lines[file_path] = number


In [None]:
# Report the running total accumulated while parsing the records.
print(f"The total amount of lines in analyzed files: {lines}")

In [None]:
# Mean line count per distinct file path stored in file_lines.
number_files = len(file_lines)
total_number_of_lines = sum(file_lines.values())
average_lines = total_number_of_lines / number_files
print("average number of lines: ", average_lines)

In [None]:
# (path, line count) pairs ordered from the largest file to the smallest.
sorted_files = sorted(
    file_lines.items(),
    key=lambda path_and_count: path_and_count[1],
    reverse=True,
)

In [None]:
# Display the complete ranking of (path, line count) pairs, largest first.
sorted_files

In [None]:
# Count the "large" files. The threshold is defined once so the filter and the
# printed message cannot drift apart.
# NOTE(review): get_tuples_over presumably keeps the (path, count) tuples whose
# count exceeds the threshold; confirm strict vs. inclusive comparison in utils.
line_threshold = 1000
over_files = utils.get_tuples_over(line_threshold, sorted_files)
n_over = len(over_files)
print(f"Number of files with more than {line_threshold} lines: {n_over}")

In [None]:
# Histogram of lines per file on a logarithmic x-axis. Bars are coloured from
# green (small files) to red (large files) and each bar gets a legend entry
# showing its line-count range.
data = list(file_lines.values())
# A log-scaled axis cannot place the value 0 (log10(0) is -inf), and the
# parsing step stores 0 for records without a count, so only strictly positive
# counts are binned.
positive_data = [value for value in data if value > 0]
if not positive_data:
    raise ValueError("No files with a positive line count to plot")

n_bins = 20  # Number of bins for the histogram
fontsize = 12

fig, ax = plt.subplots(figsize=(10, 6))
min_val = min(positive_data)
max_val = max(positive_data)
# n_bins bins need n_bins + 1 edges. geomspace spaces the edges evenly on a log
# scale and pins the first/last edge to min_val/max_val exactly, so rounding in
# 10 ** log10(x) cannot push the largest file outside the last bin.
bins = np.geomspace(min_val, max_val, num=n_bins + 1)

counts, bin_edges, patches = ax.hist(positive_data, bins=bins, alpha=0.7, edgecolor='black')
ax.set_xscale('log')  # Set x-axis to logarithmic scale

# Colours follow the position of each bin's lower edge on a log scale
# (not the bin's count).
norm = mcolors.LogNorm(vmin=min(bin_edges), vmax=max(bin_edges))
cmap = cm.RdYlGn_r  # Reversed colormap: green for small files, red for large

# Colour every bar and label it with its [lower, upper) line-count range.
for lower_edge, upper_edge, patch in zip(bin_edges[:-1], bin_edges[1:], patches):
    patch.set_facecolor(cmap(norm(lower_edge)))
    patch.set_label(f'{int(lower_edge):,} - {int(upper_edge):,}')

# Must come after set_xscale, which installs the default log formatter.
ax.xaxis.set_major_formatter(ScalarFormatter())  # Format x-axis ticks as plain numbers

# Set font size for x and y ticks
ax.tick_params(axis='x', labelsize=fontsize)
ax.tick_params(axis='y', labelsize=fontsize)

# Remove spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)

# Plot grid, legend and labels
ax.grid(visible=True, which='both', linestyle='--', linewidth=0.6, alpha=0.9)
ax.legend(loc='upper right')
ax.set_xlabel('Lines (in a file)', fontsize=fontsize)
ax.set_ylabel('Frequency (# of files)', fontsize=fontsize)
ax.set_title('BusinessLogic Lines of Code Distribution', fontsize=fontsize+4)
fig.savefig('../img/lines_histogram.png', dpi=300, bbox_inches='tight')