In [64]:
from pprint import pprint
from matplotlib import pyplot as plot
from gitminer import *

In [65]:
# Repo is supplied by the gitminer star import (presumably GitPython's Repo -- TODO confirm).
# Later cells call repo.iter_commits() on this object.
repo = Repo("quizzology")

In [66]:
# Keyword-argument bundles passed to the nx.draw* calls to control node labels.
little_labels = dict(with_labels=True, font_size=7)
# Was identical to little_labels (font_size=7); a larger font makes "medium" a distinct setting.
medium_labels = dict(with_labels=True, font_size=10)

In [67]:
# graph_file_to_file comes from the gitminer star import. Later cells read the graph's
# .degree and a 'count' attribute on its edges, and take its connected components.
repo_graph = graph_file_to_file(repo)

# Neighbors
## Who are the top 10 most-connected nodes?

This is basically handed to us by the degree accessor, which gives us the file and its number of edges.


In [68]:
# repo_graph.degree yields (name, count) pairs; flip each pair so sorting ranks by count first.
count_and_name = ((count, name) for (name, count) in repo_graph.degree)
x = list(count_and_name)
x.sort(reverse=True)
print(x[:10])


# Top 20

What are the top 20 most-changed files? We used to get this with a complicated git/sort/uniq script. Here it's much easier. (Note: the cells below currently print only the top 10.)

In [69]:
# Commit Graph - commits-to-files
# datetime is needed by the CommitNode annotation below, which is evaluated when the
# class is created; the notebook's only other explicit import of it is in a later cell.
from datetime import datetime
from typing import NamedTuple


class CommitNode(NamedTuple):
    """Immutable record describing one commit; used as a node in the commit graph."""
    # Despite the name, the graph-building loop stores commit.hexsha (a hex string) here.
    binsha: str
    # The commit message text.
    message: str
    # Populated from commit.committed_datetime.
    timestamp: datetime


commit_graph = nx.DiGraph()
for commit in repo.iter_commits():
    # One CommitNode per commit, with a directed edge to every file listed in its stats.
    item = CommitNode(commit.hexsha, commit.message, commit.committed_datetime)
    commit_graph.add_edges_from((item, path) for path in commit.stats.files)


In [70]:
# Most committed file
# Every edge is (commit, file), so tallying the edge targets counts commits per file.
commits_per_file = Counter(path for (_, path) in commit_graph.edges)

print("The most frequently committed files are:")
for (file, count) in commits_per_file.most_common(10):
    print(f'    {file}: {count}')

# Create a plot 

# Hard-coded sample points, unrelated to the repository data.
x, y = [5, 2, 9, 4, 7], [10, 5, 8, 4, 2]
plot.plot(x, y)
plot.show()

# Draw a network plot of some kind

# Side-by-side comparison: a well-known sample graph on the left, the repository graph on the right.
graph = nx.petersen_graph()  # well known sample data
subax = plot.subplot(121)
nx.draw(graph, ax=subax)  # draw onto the left axes explicitly instead of relying on the current axes

repo_plot = plot.subplot(122)
nx.draw(repo_graph, ax=repo_plot, **little_labels)

# The leftover help(nx.draw) call was removed: it dumped the full docstring into the cell output.


In [71]:

# Pretty-print every connected component of the file-to-file graph (one set of nodes each).
# NOTE(review): this prints the whole list; output may be long for a big repository.
pprint(list(nx.connected_components(repo_graph)))


In [72]:
# Perhaps we should trim the data set so that we don't plot every single 
# edge and node in one ugly graph.

busiest_graph = nx.Graph()
# Keep only the edges whose 'count' attribute exceeds 10; a missing count is treated as 0.
busiest_graph.add_edges_from(
    (left, right)
    for (left, right, count) in repo_graph.edges(data='count', default=0)
    if count > 10
)


In [73]:
from networkx import spring_layout

# spring_layout starts from random positions; a fixed seed keeps the picture
# identical across kernel restarts so the figure is reproducible.
layout = spring_layout(busiest_graph, k=0.8, iterations=50, seed=42)
nx.draw(busiest_graph, layout, **little_labels, node_size=50)


In [74]:
# Draw the same trimmed graph again, this time via networkx's draw_random helper.
# NOTE(review): presumably the node positions differ on every run -- confirm if reproducibility matters.
nx.draw_random(busiest_graph, **little_labels)


# Concept

If we combine *most committed*, *most connected*, and some *complexity* measure, we should end up with a basis for a ranked "refactoring candidate" list.

## Hypothesis: TARGET SELECTION
Imagine pointing a tool at a codebase, and it tells you where to concentrate your efforts on code renewal, and maybe you re-run it monthly or weekly? Would this make a significant difference in fluidity, fluency, speed, and quality? 

## Notes

- This needs to be time-limited or it will be based on all-time, not recent activity.
- When you first rework a file, it will still show up in the list
- Maybe when we refactor a file, we can drop it from the candidate list. If it is still a problem, it will reappear later, when that refactoring commit "ages out."
- If this only works with "conventional commits," that's probably okay. We could also support Belshee's cryptic commit trickery.


In [75]:
# Show only the last connected component in the order networkx yields them.
print(list(nx.connected_components(repo_graph))[-1])

Interesting ideas
* Average commit size
* Tests included in commits?
* Connected groups
* Most edited files
* Commit frequency
* Defects vs Features vs Refactors vs Other (conventional commits?)
* Ticket numbers and clustering of same

In [76]:
# Average commit size
repo = Repo('quizzology')
# One (author name, authored datetime, number of files touched) tuple per commit.
commits = [
    (entry.author.name, entry.authored_datetime, len(entry.stats.files))
    for entry in repo.iter_commits()
]
pprint(commits[:3])

In [77]:
# How to break into separate components? 

In [78]:
# Large canvas so the labels of the trimmed graph have room to spread out.
fig, ax = plot.subplots(figsize=(24, 24))
nx.draw_kamada_kawai(busiest_graph, ax=ax, **medium_labels)

In [79]:
# What if we remove the most heavily connected items instead of the least heavily connected?


In [80]:
from statistics import mean, stdev, variance, multimode


def print_stats(filecounts):
    """Print summary statistics for a collection of per-commit file counts.

    Args:
        filecounts: iterable of numbers, one per commit (files touched by that commit).

    Prints the maximum, mean, sample standard deviation, sample variance and
    multimode. An empty collection prints a short notice instead of raising,
    and a single value reports the spread measures as undefined, because
    ``stdev`` and ``variance`` need at least two data points.
    """
    # Materialize once so generators can be passed and so len() is available.
    filecounts = list(filecounts)
    if not filecounts:
        print('  No commits to summarize')
        return

    print(f'  Largest commit is {max(filecounts)} files')
    print(f'  Average commit is {mean(filecounts)} files')
    if len(filecounts) > 1:
        print(f'  Standard Deviation is {stdev(filecounts)}')
        print(f'  Variance is {variance(filecounts)}')
    else:
        print('  Standard Deviation is undefined for a single commit')
        print('  Variance is undefined for a single commit')
    print(f'  Multimode is {multimode(filecounts)}')

In [81]:
print("FULL repository Data Set")
# The third field of each commits tuple is that commit's file count.
all_file_counts = [file_count for (_, _, file_count) in commits]
print_stats(all_file_counts)

In [82]:
# Maybe a multigraph is wise?
repo = Repo("quizzology")
repo_multigraph = nx.MultiGraph()
for commit in repo.iter_commits():
    # Every pair of files touched by the same commit gets its own edge, so
    # files that change together often end up joined by several edges.
    repo_multigraph.add_edges_from(combinations(commit.stats.files, 2))

# Activity Graph Concept
A graph of the number of commits per day might be useful - a stacked bar graph if you can tell what kinds of commits.
I imagine coloring the stack by 'conventional commit' topics: fixes on bottom, then feature, then whatever else.
'Commit Cadence'

Can we discern the kinds of commits we're looking at via ticket names, conventional commits, or Belshee commits?

Can we eliminate trivial commits (less than one line of change, less than 10 characters, etc.)?

In [83]:
# This is probably the wrong visualization.
# Draw each connected component in its own figure. The loop used to redraw the ENTIRE
# multigraph once per component, which is why the same files (kittens, quizzical cats)
# showed up more than once and the picture carried too much information.
for topic in nx.connected_components(repo_multigraph):
    plot.figure()
    nx.draw(repo_multigraph.subgraph(topic), **little_labels)
    plot.show()

In [84]:
from datetime import datetime, timedelta


def within_dates(start_date, end_date, records=None):
    """Return the file counts of the commits dated between two dates, inclusive.

    Args:
        start_date: first calendar date to include (compared against
            ``timestamp.date()``, so pass a ``datetime.date``).
        end_date: last calendar date to include.
        records: optional iterable of ``(author, timestamp, count)`` tuples.
            When omitted, the notebook-level ``commits`` list is used, which
            is the original behavior.

    Returns:
        A list with the ``count`` of every record whose timestamp's date
        falls in ``[start_date, end_date]``, in the records' original order.
    """
    # `is None` (not truthiness) so an explicitly passed empty list is respected.
    source = commits if records is None else records
    return [count
            for (_, timestamp, count) in source
            if start_date <= timestamp.date() <= end_date
            ]


# Summarize only the commits from the 400 days ending on today's local date.
window = timedelta(days=400)
end_date = datetime.today().astimezone().date()
start_date = end_date - window
print(f'From {start_date} to {end_date}')

dataset = within_dates(start_date, end_date)
print(f'commits: {len(dataset)}')
print_stats(dataset)

# Rolling graph of just the past N days

Use the timestamp from the repo_graph to filter down the commits

Graph the commits as of a certain day, with a window of N days prior
Graph repeatedly, show maybe bar graphs and such? 

