In [None]:
import os
import subprocess
import pandas as pd
import ast

In [None]:
# Directory that clone_repository() clones repositories into.
repos_dir = "./java_repos"
# Directory where CK output is written and where the *class.csv files are looked up.
results_dir = "./ck_results"
# Path of the CK jar that run_ck() passes to `java -jar`.
ck_jar_path = "./ck/target/ck-0.7.1-SNAPSHOT-jar-with-dependencies.jar"

In [None]:
# Load the repository list; later cells read its 'name' and 'owner' columns.
df = pd.read_csv("top_1000_java_repos.csv")
df

In [None]:
def _extract_owner_login(owner):
    """Return the owner's login from a raw ``owner`` cell value.

    The value may be a dict, the string representation of a dict, or an
    already-extracted login string.

    Args:
        owner: Raw cell value from the ``owner`` column.

    Returns:
        ``owner['login']`` when the value is (or parses to) a dict that has a
        'login' key; a parsed dict without 'login' is returned as that dict;
        anything else is returned unchanged.
    """
    if isinstance(owner, str):
        try:
            parsed = ast.literal_eval(owner)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. a plain login left by a previous run
            # of this cell): keep the string as-is so the cell can be re-run.
            return owner
        if not isinstance(parsed, dict):
            # Literals such as "123" must not silently change type.
            return owner
        owner = parsed
    if isinstance(owner, dict) and 'login' in owner:
        return owner['login']
    return owner


# Normalize the 'owner' column to the plain login string.
df['owner'] = df['owner'].apply(_extract_owner_login)
df["owner"]

In [None]:
# Resume from a previous run when a summary CSV exists; otherwise start from an
# empty frame holding the repository columns plus one column per metric statistic.
summary_results_path = "summarized_results.csv"

if not os.path.exists(summary_results_path):
    metric_columns = [
        f"{metric}_{stat}"
        for metric in ("cbo", "dit", "lcom")
        for stat in ("mean", "median", "std")
    ]
    results_df = pd.DataFrame(columns=list(df.columns) + metric_columns)
else:
    results_df = pd.read_csv(summary_results_path)


In [None]:
def sanitize_repo_name(repo_name):
    """Return ``repo_name`` with every '/' and '-' replaced by '_'."""
    return repo_name.translate(str.maketrans("/-", "__"))

In [None]:
def clone_repository(repo_url, repo_name, depth=1):
    """Clone the repository into ``repos_dir`` unless its directory already exists.

    Args:
        repo_url: URL handed to ``git clone``.
        repo_name: Repository name; '/' is replaced by '_' to build the
            directory name under ``repos_dir``.
        depth: Number of commits to fetch (``git clone --depth``). Defaults to
            1 because only the checked-out sources are analysed; pass ``None``
            to clone the full history.

    Returns:
        Path of the local clone (whether freshly cloned or already present).

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with a non-zero status.
    """
    repo_path = os.path.join(repos_dir, repo_name.replace('/', '_'))
    if not os.path.exists(repo_path):
        command = ["git", "clone"]
        if depth is not None:
            command += ["--depth", str(depth)]
        command += [repo_url, repo_path]
        subprocess.run(command, check=True)
    return repo_path

In [None]:
def run_ck(project_dir, output_dir):
    """Run the CK metrics tool on a project via ``java -jar``.

    Args:
        project_dir: Passed to CK as its first positional argument.
        output_dir: Passed to CK as its last positional argument.

    Raises:
        subprocess.CalledProcessError: if the Java process exits with a non-zero status.
    """
    # NOTE(review): these literals sit in the positions the CK README describes as
    # use-jars / max-files-per-partition / variables-and-fields-metrics --
    # confirm against the jar version in use.
    ck_options = ["true", "0", "true"]
    command = ["java", "-jar", ck_jar_path, project_dir, *ck_options, output_dir]
    subprocess.run(command, check=True)

In [None]:
# Show the summaries loaded so far (an empty frame when no summary CSV existed).
results_df

In [None]:
# Process every repository: summarize existing CK output when present,
# otherwise clone the repository and run CK first.

def _summarize_repo(row, class_csv_path):
    """Return a copy of ``row`` extended with CK metric statistics.

    Args:
        row: The repository's row from ``df`` (a pandas Series).
        class_csv_path: Path to the CK class-level CSV for that repository.

    Returns:
        A copy of ``row`` with '<metric>_mean', '<metric>_median' and
        '<metric>_std' added for each of cbo, dit and lcom (in that order).

    Raises:
        pandas.errors.EmptyDataError: if the CSV file has no content.
        KeyError: if one of the metric columns is missing from the CSV.
    """
    df_ck = pd.read_csv(class_csv_path)
    repo_summary = row.copy()
    for metric in ('cbo', 'dit', 'lcom'):
        values = df_ck[metric]
        repo_summary[f'{metric}_mean'] = values.mean()
        repo_summary[f'{metric}_median'] = values.median()
        repo_summary[f'{metric}_std'] = values.std()
    return repo_summary


summary_results = []

# Names already present in the persisted summary; built once instead of
# scanning the column on every iteration.
processed_names = set(results_df['name'])

# NOTE(review): repositories are keyed by name only (not owner/name), so two
# repositories sharing a name would share a clone directory and a CK output
# file -- confirm this cannot happen in the input list.
for _, row in df.iterrows():
    repo_name = row['name']
    repo_owner = row['owner']

    if repo_name in processed_names:
        print(f"⚠️ Repository {repo_name} already processed. Skipping...")
        continue

    # The CSV is looked up at <results_dir>/<sanitized name>class.csv (no path
    # separator) while run_ck receives <results_dir>/<sanitized name> as its
    # output argument; presumably CK uses that argument as a file-name prefix
    # -- confirm against the CK version in use.
    class_csv_path = os.path.join(results_dir, f"{sanitize_repo_name(repo_name)}class.csv")

    if os.path.exists(class_csv_path):
        print(f"⚠️ CK results for {repo_name} already exist. Summarizing data...")
    else:
        repo_url = f"https://github.com/{repo_owner}/{repo_name}.git"

        print(f"🔄 Cloning {repo_owner}/{repo_name}...")
        try:
            repo_path = clone_repository(repo_url, repo_name)

            repo_results_dir = os.path.join(results_dir, sanitize_repo_name(repo_name))
            os.makedirs(repo_results_dir, exist_ok=True)

            print(f"⚙️ Running CK analysis for {repo_name}...")
            run_ck(repo_path, repo_results_dir)
            print(f"✅ Analysis completed for {repo_name}!")
        except Exception as e:
            # Best effort: one failing repository must not stop the batch.
            print(f"⚠️ Error processing {repo_owner}/{repo_name}: {e}")
            continue

        if not os.path.exists(class_csv_path):
            print(f"⚠️ CK output missing for {repo_name}, skipping analysis.")
            continue

    try:
        summary_results.append(_summarize_repo(row, class_csv_path))
    except pd.errors.EmptyDataError:
        print(f"⚠️ Empty CK file for {repo_name}, skipping analysis.")
    except Exception as e:
        # Malformed CSV or missing metric column: report and move on.
        # KeyboardInterrupt is not an Exception, so the run can still be stopped.
        print(f"⚠️ Error processing {repo_owner}/{repo_name}: {e}")

In [None]:
# Append this run's summaries to the persisted results and write them back.
# NOTE: results_df is rebound to the concatenation, so running this cell twice
# in the same session appends the same rows again.
new_results_df = pd.DataFrame(summary_results)

if new_results_df.empty:
    print("⚠️ No new repositories processed.")
else:
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)
    results_df.to_csv(summary_results_path, index=False)
    print("✅ Summarized results updated in summarized_results.csv")

In [None]:
def clean_unnecessary_ck_files(output_dir):
    """Delete every regular file in ``output_dir`` whose name does not end in 'class.csv'.

    Sub-directories are left untouched and the directory is not traversed
    recursively.

    Args:
        output_dir: Directory to clean.
    """
    for entry_name in os.listdir(output_dir):
        if entry_name.endswith("class.csv"):
            continue
        entry_path = os.path.join(output_dir, entry_name)
        if not os.path.isfile(entry_path):
            continue
        os.remove(entry_path)

In [None]:
# Remove every non-class.csv file from the CK results directory.
# Reuses results_dir from the configuration cell so the cleanup always targets
# the directory the analysis wrote to; ck_results is kept as an alias.
ck_results = results_dir

clean_unnecessary_ck_files(ck_results)
