In [37]:
import os
import subprocess
import pandas as pd
import ast

In [38]:
# Directory under which clone_repository places the cloned repositories.
repos_dir = "./java_repos"
# Directory where CK output is written and later read back from.
results_dir = "./ck_results"
# Path of the CK jar that run_ck hands to `java -jar`.
ck_jar_path = "./ck/target/ck-0.7.1-SNAPSHOT-jar-with-dependencies.jar"

In [39]:
# Load the repository list (one row per repository) and display it.
df = pd.read_csv("top_1000_java_repos.csv")
df

Unnamed: 0,name,owner,createdAt,updatedAt,stargazers,releases
0,JavaGuide,{'login': 'Snailclimb'},2018-05-07T13:27:00Z,2025-03-15T14:00:54Z,148698,0
1,hello-algo,{'login': 'krahets'},2022-11-04T11:08:34Z,2025-03-15T18:28:44Z,110186,9
2,java-design-patterns,{'login': 'iluwatar'},2014-08-09T16:45:18Z,2025-03-15T18:26:47Z,90999,0
3,mall,{'login': 'macrozheng'},2018-04-04T01:11:44Z,2025-03-15T14:56:03Z,79471,3
4,advanced-java,{'login': 'doocs'},2018-10-06T11:38:30Z,2025-03-15T16:17:07Z,77236,1
...,...,...,...,...,...,...
995,concurrency-limits,{'login': 'Netflix'},2017-12-11T19:32:41Z,2025-03-13T01:02:42Z,3307,102
996,SuperTextView,{'login': 'chenBingX'},2017-04-17T11:32:41Z,2025-03-11T21:59:26Z,3306,33
997,99-problems,{'login': 'shekhargulati'},2015-11-22T05:56:33Z,2025-03-04T17:04:16Z,3304,0
998,PRDownloader,{'login': 'amitshekhariitbhu'},2017-11-11T19:29:38Z,2025-03-14T06:37:04Z,3303,8


In [40]:
def _extract_owner_login(value):
    """Return the owner's login from a raw `owner` cell.

    The CSV stores the owner as the text of a Python dict, e.g.
    "{'login': 'Snailclimb'}". Strings are parsed with `ast.literal_eval`;
    if the result is a dict containing 'login', that login is returned.

    Strings that are not Python literals are returned unchanged. This is what
    makes the cell safe to re-run: after the first run the column already
    holds bare logins such as "Snailclimb", and `ast.literal_eval` raises
    ValueError on those.
    """
    parsed = value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            # Not a literal (most likely an already-extracted login): keep as is.
            return value
    if isinstance(parsed, dict) and 'login' in parsed:
        return parsed['login']
    return parsed


df['owner'] = df['owner'].apply(_extract_owner_login)
df["owner"]

# TODO: I am going to move this to the other lab

0             Snailclimb
1                krahets
2               iluwatar
3             macrozheng
4                  doocs
             ...        
995              Netflix
996            chenBingX
997        shekhargulati
998    amitshekhariitbhu
999           trojan-gfw
Name: owner, Length: 1000, dtype: object

In [41]:
# Resume from a previously saved summary if there is one; otherwise start with
# an empty frame that has the repository columns plus one mean/median/std
# column per CK metric (cbo, dit, lcom).
summary_results_path = "summarized_results.csv"
if not os.path.exists(summary_results_path):
    results_df = pd.DataFrame(columns=[
        *df.columns,
        *(
            f"{metric}_{stat}"
            for metric in ("cbo", "dit", "lcom")
            for stat in ("mean", "median", "std")
        ),
    ])
else:
    results_df = pd.read_csv(summary_results_path)


In [42]:
def sanitize_repo_name(repo_name):
    """Return `repo_name` with every '/' and '-' turned into '_'."""
    underscore_map = str.maketrans("/-", "__")
    return repo_name.translate(underscore_map)

In [43]:
def clone_repository(repo_url, repo_name):
    """Clone the repository unless a local copy is already present.

    The clone lives at `<repos_dir>/<repo_name with '/' replaced by '_'>`.
    If that path already exists nothing is cloned.

    Args:
        repo_url: URL passed to `git clone`.
        repo_name: Repository name used to derive the local directory name.

    Returns:
        The local path of the repository.

    Raises:
        subprocess.CalledProcessError: if `git clone` exits with a non-zero status.
    """
    local_dir_name = repo_name.replace('/', '_')
    repo_path = os.path.join(repos_dir, local_dir_name)

    # Existing path: treat it as an earlier clone and reuse it.
    if os.path.exists(repo_path):
        return repo_path

    clone_command = ["git", "clone", repo_url, repo_path]
    subprocess.run(clone_command, check=True)
    return repo_path

In [44]:
def run_ck(project_dir, output_dir):
    """Run the CK metrics tool on a project.

    Invokes `java -jar <ck_jar_path> <project_dir> true 0 true <output_dir>`.

    Args:
        project_dir: Directory of the project to analyse.
        output_dir: Output location handed to CK as its last argument.

    Raises:
        subprocess.CalledProcessError: if the Java process exits with a non-zero status.
    """
    # NOTE(review): per the CK README these three positional flags are
    # presumably "use jars", "max files per partition (0 = automatic)" and
    # "variables and fields metrics" - confirm against the CK version in use.
    ck_flags = ["true", "0", "true"]
    command = ["java", "-jar", ck_jar_path, project_dir, *ck_flags, output_dir]
    subprocess.run(command, check=True)

In [45]:
# Display the summary table as it currently stands.
results_df

Unnamed: 0,name,owner,createdAt,updatedAt,stargazers,releases,cbo_mean,cbo_median,cbo_std,dit_mean,dit_median,dit_std,lcom_mean,lcom_median,lcom_std
0,JavaGuide,Snailclimb,2018-05-07T13:27:00Z,2025-03-15T14:00:54Z,148698,0,,,,,,,,,
1,hello-algo,krahets,2022-11-04T11:08:34Z,2025-03-15T18:28:44Z,110186,9,1.874419,2.0,1.512579,1.000000,1.0,0.000000,4.706977,1.0,10.457868
2,java-design-patterns,iluwatar,2014-08-09T16:45:18Z,2025-03-15T18:26:47Z,90999,0,3.363537,3.0,2.606119,1.203057,1.0,0.533152,1.876092,0.0,6.555601
3,mall,macrozheng,2018-04-04T01:11:44Z,2025-03-15T14:56:03Z,79471,3,3.934211,3.0,4.966914,1.239474,1.0,0.439211,1110.803947,13.0,8155.555246
4,advanced-java,doocs,2018-10-06T11:38:30Z,2025-03-15T16:17:07Z,77236,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,cat,dianping,2012-12-05T02:31:48Z,2025-03-15T15:05:21Z,18804,3,5.527484,4.0,6.049049,1.434461,1.0,0.708367,23.035941,1.0,104.205827
78,DSA-Bootcamp-Java,kunal-kushwaha,2021-07-30T12:23:25Z,2025-03-15T19:11:14Z,18533,0,1.268775,1.0,1.240397,1.083004,1.0,0.340733,3.501976,1.0,11.010682
79,SpringCloudLearning,forezp,2017-04-08T10:06:56Z,2025-03-14T11:02:27Z,17924,0,4.552448,4.0,2.366665,1.048951,1.0,0.216524,0.545455,0.0,1.860362
80,Algorithms,williamfiset,2017-03-18T02:50:46Z,2025-03-15T17:59:45Z,17747,0,2.611231,2.0,1.791044,1.053996,1.0,0.226254,16.196544,3.0,38.507016


In [None]:
# Process one batch of repositories: for each one, reuse an existing CK class
# CSV when present, otherwise clone the repository and run CK, then summarize
# the class-level metrics. New summaries are collected in `summary_results`.

def _summarize_ck_class_metrics(class_csv_path):
    """Summarize CBO, DIT and LCOM from a CK class-level CSV.

    Args:
        class_csv_path: Path of the CK `...class.csv` file.

    Returns:
        Dict with `<metric>_mean`, `<metric>_median` and `<metric>_std` for
        each of cbo, dit and lcom (in that order).

    Raises:
        pandas.errors.EmptyDataError: if the file has no content to parse.
        KeyError: if one of the metric columns is missing.
    """
    df_ck = pd.read_csv(class_csv_path)
    summary = {}
    for metric in ('cbo', 'dit', 'lcom'):
        values = df_ck[metric]
        summary[f'{metric}_mean'] = values.mean()
        summary[f'{metric}_median'] = values.median()
        summary[f'{metric}_std'] = values.std()
    return summary


# Positional slice of `df` handled by this run; adjust to process another batch.
batch_start, batch_end = 125, 200

summary_results = []
# `results_df` is not modified inside the loop, so collect the names once.
processed_names = set(results_df['name'])

for index, row in df.iloc[batch_start:batch_end].iterrows():
    repo_name = row['name']
    repo_owner = row['owner']

    # The class CSV is looked up as "<results_dir>/<sanitized name>class.csv",
    # i.e. the value passed to run_ck as output location acts as a filename
    # prefix. NOTE(review): this relies on how CK builds its output paths.
    class_csv_path = os.path.join(results_dir, f"{sanitize_repo_name(repo_name)}class.csv")

    if repo_name in processed_names:
        print(f"⚠️ Repository {repo_name} already processed. Skipping...")
        continue

    try:
        if os.path.exists(class_csv_path):
            print(f"⚠️ CK results for {repo_name} already exist. Summarizing data...")
        else:
            repo_url = f"https://github.com/{repo_owner}/{repo_name}.git"

            print(f"🔄 Cloning {repo_owner}/{repo_name}...")
            repo_path = clone_repository(repo_url, repo_name)
            if repo_path is None:
                continue

            repo_results_dir = os.path.join(results_dir, sanitize_repo_name(repo_name))
            os.makedirs(repo_results_dir, exist_ok=True)

            print(f"⚙️ Running CK analysis for {repo_name}...")
            run_ck(repo_path, repo_results_dir)
            print(f"✅ Analysis completed for {repo_name}!")

            if not os.path.exists(class_csv_path):
                print(f"⚠️ CK output missing for {repo_name}, skipping analysis.")
                continue

        summary = _summarize_ck_class_metrics(class_csv_path)

    except pd.errors.EmptyDataError:
        # CK produced a file with nothing to parse; report it the same way for
        # cached and freshly generated results.
        print(f"⚠️ Empty CK file for {repo_name}, skipping analysis.")
        continue
    except Exception as e:
        # Clone/CK failures and malformed CSVs skip this repository only.
        # KeyboardInterrupt is deliberately not caught so the batch can be stopped.
        print(f"⚠️ Error processing {repo_owner}/{repo_name}: {e}")
        continue

    # Repository metadata plus the metric summary form one result row.
    repo_summary = row.copy()
    for key, value in summary.items():
        repo_summary[key] = value

    summary_results.append(repo_summary)

⚠️ Repository awesome-system-design-resources already processed. Skipping...
⚠️ Repository eladmin already processed. Skipping...
🔄 Cloning iBotPeaches/Apktool...
⚙️ Running CK analysis for Apktool...
✅ Analysis completed for Apktool!
⚠️ CK results for graal already exist. Summarizing data...
⚠️ Empty CK file for graal, skipping analysis.
⚠️ Repository Chat2DB already processed. Skipping...
⚠️ CK results for jdk already exist. Summarizing data...
⚠️ Empty CK file for jdk, skipping analysis.
⚠️ Repository GSYVideoPlayer already processed. Skipping...
⚠️ Repository conductor already processed. Skipping...
⚠️ Repository HikariCP already processed. Skipping...
⚠️ Repository DoKit already processed. Skipping...
🔄 Cloning apache/shardingsphere...
⚠️ Error processing apache/shardingsphere: Command '['git', 'clone', 'https://github.com/apache/shardingsphere.git', './java_repos\\shardingsphere']' returned non-zero exit status 128.
⚠️ Repository mybatis-3 already processed. Skipping...
⚠️ Reposi

In [47]:
# Append the summaries collected by the processing loop to `results_df` and
# persist the table to `summary_results_path`.
new_results_df = pd.DataFrame(summary_results)

if not new_results_df.empty:
    # Keep only repositories not yet in `results_df`. Without this, re-running
    # the cell (without re-running the loop) would append the same rows again
    # and write duplicates to the CSV. Matching is by name, like the loop's
    # "already processed" check.
    already_saved = new_results_df['name'].isin(results_df['name'])
    new_results_df = new_results_df[~already_saved]

if not new_results_df.empty:
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)
    results_df.to_csv(summary_results_path, index=False)
    print("✅ Summarized results updated in summarized_results.csv")
else:
    print("⚠️ No new repositories processed.")

✅ Summarized results updated in summarized_results.csv


In [48]:
def clean_unnecessary_ck_files(output_dir):
    """Delete unneeded CK files, keeping only those ending in "class.csv".

    Only regular files directly inside `output_dir` are considered;
    sub-directories are left untouched.
    """
    for entry_name in os.listdir(output_dir):
        entry_path = os.path.join(output_dir, entry_name)
        if not os.path.isfile(entry_path):
            continue
        if entry_name.endswith("class.csv"):
            continue
        os.remove(entry_path)

In [49]:
# Same path as `results_dir` defined near the top of the notebook.
ck_results = "./ck_results"

# Remove every regular file in that directory whose name does not end in "class.csv".
clean_unnecessary_ck_files(ck_results)
