In [5]:
!pip install pandas transformers torch radon



In [6]:
import pandas as pd
df = pd.read_csv("/content/bugfix_commits_with_llm_final_2.csv")

In [7]:
print(df.columns)

Index(['Hash', 'Message', 'Filename', 'Source code before',
       'Source code current', 'Diff', 'llm_inference', 'rectified_message'],
      dtype='object')


In [8]:
total_number_of_commits= df['Hash'].nunique()
total_number_of_files= df['Filename'].nunique()
avg_files_modified_per_commit= df.groupby('Hash')['Filename'].nunique().mean()
fix_type_distribution= df['llm_inference'].value_counts()
file_extensions = df['Filename'].apply(lambda x: x.split('.')[-1]).value_counts()

print(f"Total number of commits: {total_number_of_commits}")
print(f"Total number of files: {total_number_of_files}")
print(f"Avg modified files per commit: {avg_files_modified_per_commit}")
print("Fix type distribution:")
print(fix_type_distribution)
print("Most frequent file extensions:")
print(file_extensions.head())

Total number of commits: 725
Total number of files: 201
Avg modified files per commit: 2.3586206896551722
Fix type distribution:
llm_inference
add failing test for                            89
fix failing test                                46
add test for                                    43
add more tests                                  41
add missing docstrings                          30
                                                ..
add test for internal foreign key references     1
actor_from_request docstring                     1
update link.rb                                   1
add missing styles to p.zero-results             1
add v2.0.0-beta.1                                1
Name: count, Length: 1023, dtype: int64
Most frequent file extensions:
Filename
py      1171
rst      274
html     112
yml       45
md        21
Name: count, dtype: int64


In [9]:
from radon.metrics import mi_visit, h_visit
from radon.complexity import cc_visit

In [10]:
def compute_metrics(code):
  if not isinstance(code, str) or code.strip() == "":
    return {'mi': 0, 'cc': 0, 'loc': 0}

  mi_score = mi_visit(code, True)
  cc_scores = cc_visit(code)
  total_cc = sum([block.complexity for block in cc_scores])
  loc = len(code.splitlines())
  return {'mi': mi_score, 'cc': total_cc, 'loc': loc}

In [11]:
import ast

def is_valid_python_code(code):
  if not isinstance(code, str):
    return False
  try:
    ast.parse(code)
    return True
  except SyntaxError:
    return False

In [12]:
df = df[df['Filename'].str.endswith('.py')]

df = df[df['Source code before'].apply(is_valid_python_code)]
df = df[df['Source code current'].apply(is_valid_python_code)]



In [13]:
metrics_before = df['Source code before'].apply(compute_metrics)
metrics_after = df['Source code current'].apply(compute_metrics)

df['MI_Before']= metrics_before.apply(lambda x: x['mi'])
df['CC_Before']= metrics_before.apply(lambda x: x['cc'])
df['LOC_Before']= metrics_before.apply(lambda x: x['loc'])

df['MI_After']= metrics_after.apply(lambda x: x['mi'])
df['CC_After']= metrics_after.apply(lambda x: x['cc'])
df['LOC_After']= metrics_after.apply(lambda x: x['loc'])

df['MI_Change']= df['MI_After'] - df['MI_Before']
df['CC_Change']= df['CC_After'] - df['CC_Before']
df['LOC_Change']= df['LOC_After'] - df['LOC_Before']



In [14]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
!pip install sacrebleu
import sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [15]:
tokenizer= AutoTokenizer.from_pretrained("microsoft/codebert-base")
model= AutoModel.from_pretrained("microsoft/codebert-base")
model.eval()

def embed_code(code):
  inputs= tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
  with torch.no_grad():
    outputs= model(**inputs)
  embeddings = outputs.last_hidden_state[:, 0, :].numpy()
  return embeddings

def semantic_similarity(code_before, code_after):
  if not code_before or not code_after:
    return 0.0
  try:
    emb_before = embed_code(code_before)
    emb_after = embed_code(code_after)
    sim = cosine_similarity(emb_before, emb_after)[0][0]
    return float(sim)
  except Exception as e:
    print(f"Error in semantic similarity: {e}")
    return 0.0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [16]:
def tokenize_code(code):
  return code.split()

def token_similarity(code_before, code_after):
  if not code_before or not code_after:
    return 0.0
  reference = [tokenize_code(code_before)]
  hypothesis = tokenize_code(code_after)
  bleu = sacrebleu.sentence_bleu(hypothesis, reference)
  return bleu.score / 100.0

In [17]:
df['Semantic_Similarity'] = df.apply(lambda row: semantic_similarity(row['Source code before'], row['Source code current']), axis=1)
df['Token_Similarity'] = df.apply(lambda row: token_similarity(row['Source code before'], row['Source code current']), axis=1)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

TypeError: BLEU: The argument `hyp` should be a string.

In [1]:
import pandas as pd
df= pd.read_csv("/content/metrics.csv")

In [2]:
df.drop(columns=['MI_Before', 'MI_After', 'CC_Before', 'CC_After', 'LOC_Before', 'LOC_After'], inplace=True)

In [3]:
semantic_tresh= 0.80
token_tresh= 0.75
df['Semantic_class'] = df['Semantic_Similarity'].apply(lambda x: 'Minor' if x >= semantic_tresh else 'Major')
df['Token_class'] = df['Token_Similarity'].apply(lambda x: 'Minor' if x >= token_tresh else 'Major')

In [4]:
df['Classes_Agree'] = df.apply(lambda row: 'YES' if row['Semantic_class'] == row['Token_class'] else 'NO', axis=1)

In [5]:
df.head()

Unnamed: 0,Hash,Message,Filename,Source code before,Source code current,Diff,llm_inference,rectified_message,MI_Change,CC_Change,LOC_Change,Semantic_Similarity,Token_Similarity,Semantic_class,Token_class,Classes_Agree
0,f571b19d8a5fd3a19fdf679421fd55a8edbf5295,sqlerrors() decorator catching and returning u...,app.py,from sanic import Sanic\nfrom sanic import res...,from sanic import Sanic\nfrom sanic import res...,"@@ -2,6 +2,7 @@ from sanic import Sanic\n from...",add more sanic routes,@app.route('/<table:[a-zA-Z0-9].*>.json')\n+@s...,-5.492447,1,17,0.998624,0.795017,Minor,Minor,YES
1,6a0c5de6154893eb4269dc7b1f160726ec6395f2,ensure_build_metadata() function for metadata\...,app.py,from sanic import Sanic\nfrom sanic import res...,from sanic import Sanic\nfrom sanic import res...,"@@ -2,8 +2,55 @@ from sanic import Sanic\n fro...",add test for build_metadata,......................... write ( str (. read (,-9.744388,10,48,0.995581,0.503184,Minor,Major,NO
2,b2372605d63248f422b6e67cb5c392236a3aa612,Implemented multi-db support plus initial URL ...,app.py,from sanic import Sanic\nfrom sanic import res...,from sanic import Sanic\nfrom sanic import res...,"@@ -1,5 +1,6 @@\n from sanic import Sanic\n fr...",add docs for sanic build metadata,"@app.sqlerrors(fn):\n @wraps (fn)-76,32 +8...",-3.010328,8,63,0.998422,0.584143,Minor,Major,NO
3,12f7e1dc5624d14f644abead18bd90b420b6d97e,Hashed URLs now have far-future cache expiry\n...,app.py,from sanic import Sanic\nfrom sanic import res...,from sanic import Sanic\nfrom sanic import res...,"@@ -75,22 +75,31 @@ class BaseView(HTTPMethodV...",add more documentation to base view,". add_route(DatabaseView.as_view(), '/<db_name...",-1.78109,1,12,1.0,0.916062,Minor,Minor,YES
4,b20d7119e4f6506cdb9d5d11322e28130823adfd,Implemented template inheritance and brought b...,app.py,from sanic import Sanic\nfrom sanic import res...,from sanic import Sanic\nfrom sanic import res...,"@@ -86,7 +86,13 @@ class BaseView(HTTPMethodVi...",add support for sql errors in views,"file:\n""Implemented template inheritance and b...",0.523468,1,-8,1.0,0.959698,Minor,Minor,YES


In [None]:
df.to_csv("metrics.csv", index=False)