In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the repository table. The listed columns get explicit dtypes; later
# cells compare `sourcemeter_result` against string codes such as '2', so it
# must be read as text rather than inferred as a number.
repos = pd.read_csv(
    'repo_table_metrics_collected.csv',
    dtype={
        'repo_path': str,
        'status': str,
        'time_taken': int,
        'dotnet_versions': str,
        'sourcemeter_result': str,
    },
)
repos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790 entries, 0 to 789
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                790 non-null    object 
 1   html_url            790 non-null    object 
 2   git_url             790 non-null    object 
 3   forks               790 non-null    int64  
 4   stars               790 non-null    int64  
 5   language            790 non-null    object 
 6   size                790 non-null    int64  
 7   csharp_percent      790 non-null    float64
 8   csharp_lines        790 non-null    int64  
 9   commit_count        790 non-null    int64  
 10  default_branch      790 non-null    object 
 11  repo_path           779 non-null    object 
 12  status              59 non-null     object 
 13  time_taken          790 non-null    int32  
 14  dotnet_versions     554 non-null    object 
 15  sourcemeter_result  284 non-null    object 
dtypes: float

In [3]:
# Preview the first rows of the loaded table as a sanity check.
repos.head()

Unnamed: 0,name,html_url,git_url,forks,stars,language,size,csharp_percent,csharp_lines,commit_count,default_branch,repo_path,status,time_taken,dotnet_versions,sourcemeter_result
0,PeterWaher/IoTGateway,https://github.com/PeterWaher/IoTGateway,git://github.com/PeterWaher/IoTGateway.git,72,206,C#,10241832,0.985052,28756660,6455,master,D:\DataCollection\repos\PeterWaher_IoTGateway,,0,framework/v4.6.2;core/netstandard1.3;core/netc...,
1,allenai/ai2thor,https://github.com/allenai/ai2thor,git://github.com/allenai/ai2thor.git,197,892,C#,9647416,0.782026,4151121,7023,main,D:\DataCollection\repos\allenai_ai2thor,,0,,
2,Realm667/WolfenDoom,https://github.com/Realm667/WolfenDoom,git://github.com/Realm667/WolfenDoom.git,26,247,C#,7436583,0.952794,3013725,11780,master,D:\DataCollection\repos\Realm667_WolfenDoom,,0,,0.0
3,Unity-Technologies/Graphics,https://github.com/Unity-Technologies/Graphics,git://github.com/Unity-Technologies/Graphics.git,738,2240,C#,6433544,0.81252,33793180,26239,master,D:\DataCollection\repos\Unity-Technologies_Gra...,,0,,
4,UnityTechnologies/open-project-1,https://github.com/UnityTechnologies/open-proj...,git://github.com/UnityTechnologies/open-projec...,1999,5410,C#,4876097,0.899314,706695,1962,main,D:\DataCollection\repos\UnityTechnologies_open...,,0,,


In [4]:
# Number of repositories whose stored result code is '2' (the code the
# analysis loop writes when SourceMeter exits with 0 and its CSVs are found).
(repos['sourcemeter_result'] == '2').sum()

118

In [5]:
sourcemeter_path = 'D:\\SourceMeter-10.2.0-x64-Windows\\CSharp\\AnalyzerCSharp.exe'

def call_sourcemeter(sln_path, args: dict):
    """Run the SourceMeter C# analyzer on one solution file.

    Args:
        sln_path: Path of the ``.sln`` file; passed as ``-input=<sln_path>``.
        args: Extra analyzer options; every ``key: value`` pair is passed as
            one ``-key=value`` argument.

    Returns:
        int: The analyzer's exit status. Non-zero when the analyzer reports a
        failure or when the executable could not be started at all.
    """
    # Local import so the function does not depend on another notebook cell.
    import subprocess

    # An argument list (no shell) keeps paths and values containing spaces or
    # shell metacharacters intact; each element reaches the analyzer verbatim.
    command = [sourcemeter_path, f'-input={sln_path}']
    command.extend(f'-{key}={value}' for key, value in args.items())
    try:
        return subprocess.run(command).returncode
    except OSError:
        # The previous shell-based call reported an unlaunchable executable as
        # a non-zero status rather than an exception; keep that contract.
        return 127

def verify_sourcemeter_result(args: dict):
    """Check that an analysis produced both class-level and method-level CSVs.

    Looks below ``<resultsDir>/<projectName>`` (searching all subfolders) for
    at least one ``*Class.csv`` and at least one ``*Method.csv`` file.

    Args:
        args: Analyzer options. ``args['projectName']`` is required;
            ``args['resultsDir']`` is used when present, otherwise the
            folder ``.\\Results`` is assumed.

    Returns:
        bool: True only if the project folder exists and both kinds of CSV
        were found; False otherwise.
    """
    # Use the same results folder that is handed to the analyzer, so the check
    # cannot drift from where the output was actually written.
    results_dir = args.get('resultsDir', '.\\Results')
    project_dir = os.path.join(results_dir, args['projectName'])
    if not os.path.isdir(project_dir):
        return False
    return all(
        glob(os.path.join(project_dir, '**', pattern), recursive=True)
        for pattern in ('*Class.csv', '*Method.csv')
    )

In [8]:
# Options shared by every SourceMeter run; `projectName` is a placeholder
# that is overwritten for each repository before the analyzer is called.
args = dict(
    projectName='test',
    platform='AnyCPU',
    configuration='Debug',
    resultsDir='Results',
    runFxCop='false',
    runUDM='false',
    runRoslynAnalyzers='false',
    runMetricHunter='false',
)

In [9]:
def clear_dotnet():
    """Force-terminate all processes named ``dotnet.exe`` via ``taskkill``.

    The command's exit status is ignored and nothing is returned.
    """
    kill_command = 'taskkill /f /im dotnet.exe'
    os.system(kill_command)

In [8]:
# Kill any dotnet.exe processes that are currently running.
clear_dotnet()

In [11]:
from tqdm import tqdm
from glob import glob

def save_table(repo_table):
    """Write the repository table to disk as CSV and as an HTML copy.

    Both files are written to the current working directory and overwrite
    any previous version:
    ``repo_table_metrics_collected.csv`` (without the index column) and
    ``repo_table_metrics_collected.html``.

    Args:
        repo_table: The pandas DataFrame to persist.
    """
    repo_table.to_csv('repo_table_metrics_collected.csv', index=False)
    # Explicit UTF-8: with the platform default encoding (e.g. cp1252 on
    # Windows) any character outside that code page raises UnicodeEncodeError.
    with open('repo_table_metrics_collected.html', 'w', encoding='utf-8') as f:
        f.write(repo_table.to_html())

# Run SourceMeter on every repository that has no stored result yet.
# Codes written to `sourcemeter_result`:
#   '2' - analyzer exit status 0 and the result CSVs were found
#   '1' - analyzer exit status non-zero, but the result CSVs were found
#   '0' - the result CSVs were not found after the run
# Rows that already carry a value are only counted, never re-analyzed.
success_count = 0
semi_success_count = 0
is_null = repos['sourcemeter_result'].isna()
for i, row in repos.iterrows():
    print(f'{i}. Analyzing {row["repo_path"]}:', end=' ')
    if not is_null[i]:
        if row['sourcemeter_result'] == '1':
            print('Already semi-successfully analyzed', end=' ')
            semi_success_count += 1
        elif row['sourcemeter_result'] == '2':
            print('Already successfully analyzed', end=' ')
            success_count += 1
        else:
            # Any other stored value is treated as a recorded failure.
            print('Already failed to analyze at verification step', end=' ')
        print(f'| S[{success_count}/{len(repos)}] sS[{semi_success_count}/{len(repos)}]')
        continue
    try:
        repo_path = row['repo_path']
        # `repo_path` is missing (NaN) for some rows; treat those as having no
        # solution file instead of letting os.path.join fail on a float.
        if pd.isna(repo_path):
            sln_paths = []
        else:
            sln_paths = glob(os.path.join(repo_path, '**', '*.sln'), recursive=True)
        # Only repositories with exactly one solution file are analyzed.
        if len(sln_paths) == 1:
            args['projectName'] = row['name'].replace('/', '_')
            result = call_sourcemeter(sln_paths[0], args)
            if not verify_sourcemeter_result(args):
                print('Failed to analyze at verification step', end=' ')
                repos.loc[i, 'sourcemeter_result'] = '0'
            elif result == 0:
                print('Successfully analyzed!', end=' ')
                repos.loc[i, 'sourcemeter_result'] = '2'
                success_count += 1
            else:
                print('Semi-successfully analyzed!', end=' ')
                repos.loc[i, 'sourcemeter_result'] = '1'
                semi_success_count += 1
            # Persist after every analyzed repository so progress survives an
            # interrupted run.
            save_table(repos)
        elif len(sln_paths) > 1:
            print('More than one solution file found', end=' ')
        else:
            print('No solution file found', end=' ')
    except Exception as e:
        # Best effort: report the problem and continue with the next repository.
        print(f'Exception {e}', end=' ')
    print(f'| S[{success_count}/{len(repos)}] sS[{semi_success_count}/{len(repos)}]')
    # Periodically kill dotnet.exe processes (presumably ones left behind by
    # earlier analyzer runs - TODO confirm).
    if i % 20 == 0:
        clear_dotnet()

save_table(repos)
print(f'\nSuccessfully analyzed {success_count}/{len(repos)} repos')
print(f'Semi-successfully analyzed {semi_success_count}/{len(repos)} repos')

0. Analyzing D:\DataCollection\repos\PeterWaher_IoTGateway: More than one solution file found | S[0/790] sS[0/790]
1. Analyzing D:\DataCollection\repos\allenai_ai2thor: No solution file found | S[0/790] sS[0/790]
2. Analyzing D:\DataCollection\repos\Realm667_WolfenDoom: Already failed to analyze at verification step | S[0/790] sS[0/790]
3. Analyzing D:\DataCollection\repos\Unity-Technologies_Graphics: No solution file found | S[0/790] sS[0/790]
4. Analyzing D:\DataCollection\repos\UnityTechnologies_open-project-1: No solution file found | S[0/790] sS[0/790]
5. Analyzing D:\DataCollection\repos\Azure_azure-sdk-for-net: More than one solution file found | S[0/790] sS[0/790]
6. Analyzing D:\DataCollection\repos\msawczyn_EFDesigner: More than one solution file found | S[0/790] sS[0/790]
7. Analyzing D:\DataCollection\repos\egordorichev_BurningKnight: Already successfully analyzed | S[1/790] sS[0/790]
8. Analyzing D:\DataCollection\repos\AitorSimona_Traverser: No solution file found | S[1/7

In [None]:
# Kill any dotnet.exe processes that are still running.
clear_dotnet()

In [27]:
# Number of repositories currently marked '2' (analyzer exit status 0 and
# result CSVs found).
(repos['sourcemeter_result'] == '2').sum()

118

In [11]:
# verify again all successful repos
from tqdm import tqdm
from glob import glob

# Re-check on disk every repository whose stored result code is '2'.
successfull_repos = repos.loc[repos['sourcemeter_result'].eq('2')]
success_count = 0
for _, repo in tqdm(successfull_repos.iterrows(), total=len(successfull_repos)):
    project_name = repo['name'].replace('/', '_')
    args['projectName'] = project_name
    if not verify_sourcemeter_result(args):
        print(f'Failed to verify {project_name}')
        continue
    success_count += 1

clear_dotnet()
print(f'Successfully verified {success_count}/{len(successfull_repos)} repos')

100%|██████████| 118/118 [00:00<00:00, 388.14it/s]

Successfully verified 118/118 repos





In [23]:
def get_souremeter_results_path(project_name, class_pattern='*Class.csv', method_pattern='*Method.csv', results_dir='.\\Results'):
    """Find the class-level and method-level CSV files of one project.

    Searches ``<results_dir>/<project_name>`` and all of its subfolders.

    Args:
        project_name: Name of the project's folder inside ``results_dir``.
        class_pattern: Glob pattern for the class-level CSV file name.
        method_pattern: Glob pattern for the method-level CSV file name.
        results_dir: Folder that holds one subfolder per project.

    Returns:
        tuple[list[str], list[str]]: ``(class_csv, method_csv)`` - the
        matching paths for each pattern. Both lists are empty when the
        project folder does not exist, so the result can always be unpacked
        into two values.
    """
    project_dir = os.path.join(results_dir, project_name)
    if not os.path.exists(project_dir):
        # Return the same shape as the normal case; a bare False cannot be
        # unpacked by callers.
        return [], []
    class_csv = glob(os.path.join(project_dir, '**', class_pattern), recursive=True)
    method_csv = glob(os.path.join(project_dir, '**', method_pattern), recursive=True)
    return class_csv, method_csv

Copy the most recent `*Class.csv` and `*Method.csv` of every successfully analyzed repository into its own folder under `CollectedMetrics`.

In [26]:
import shutil
# Collect, for every successfully analyzed repository, its newest class-level
# and method-level CSV into `<root_dir>/<project_name>/`.
root_dir = '.\\CollectedMetrics'
# exist_ok: re-running this cell must not fail on folders created earlier.
os.makedirs(root_dir, exist_ok=True)

for i, row in tqdm(successfull_repos.iterrows(), total=len(successfull_repos)):
    project_name = row['name'].replace('/', '_')
    target_dir = os.path.join(root_dir, project_name)
    os.makedirs(target_dir, exist_ok=True)
    found = get_souremeter_results_path(project_name, class_pattern=f'{project_name}-Class.csv', method_pattern=f'{project_name}-Method.csv')
    # Guard against the helper reporting a missing results folder with a falsy
    # value instead of a pair of lists.
    class_csv, method_csv = found if found else ([], [])
    if len(class_csv) > 0 and len(method_csv) > 0:
        # Several matches may exist; order by modification time and take the latest.
        latest_class = sorted(class_csv, key=os.path.getmtime)[-1]
        latest_method = sorted(method_csv, key=os.path.getmtime)[-1]
        shutil.copy(latest_class, os.path.join(target_dir, f'{project_name}-Class.csv'))
        shutil.copy(latest_method, os.path.join(target_dir, f'{project_name}-Method.csv'))
    else:
        print(f'Failed to find {project_name}')

100%|██████████| 118/118 [00:02<00:00, 54.81it/s]
