In [89]:
import pandas as pd
import os
from datetime import datetime
from pathlib import Path
from research_questions.configs.configs import Configs

## Auxiliary Functions

In [90]:
def merge_csv_results(folder_split):
    """Merge every CSV file found in ``folder_split`` into a single DataFrame.

    Each file is read with ``pd.read_csv``; its path and row count are printed
    as a quick sanity check. The parts are concatenated in (lexicographic)
    file-name order and the result is also written to
    ``repo_info_<folder name>.csv`` in the current working directory.

    Parameters
    ----------
    folder_split : str or os.PathLike
        Directory holding the partial ``.csv`` files.

    Returns
    -------
    pandas.DataFrame
        All rows of the partial files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` file.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the merged frame (and of the written CSV) is the same on every run.
    csv_files = sorted(
        file for file in os.listdir(folder_split) if file.endswith('.csv')
    )
    if not csv_files:
        # pd.concat would raise ValueError("No objects to concatenate") here;
        # keep the exception type but say which folder was empty.
        raise ValueError(f"No .csv files found in '{folder_split}'")

    dataframes = []

    for csv_file in csv_files:
        file_path = os.path.join(folder_split, csv_file)
        df = pd.read_csv(file_path)
        print(file_path)
        print(len(df))
        dataframes.append(df)

    merged_df = pd.concat(dataframes, ignore_index=True)

    # Name the output after the last path component only, so a nested or
    # trailing-slash folder ('data/SE', 'SE/') still yields a valid file name.
    split_name = os.path.basename(os.path.normpath(folder_split))
    merged_df.to_csv(f'repo_info_{split_name}.csv', index=False)

    return merged_df

In [91]:
def get_statistics(df, name):
    """Build a one-row summary of popularity metrics for one split.

    Collects the mean and median of the ``stargazers``, ``watchers``,
    ``forks`` and ``open_issues`` columns of ``df`` plus the number of rows,
    and returns them as a dict labelled with ``name`` under the ``'split'``
    key. Nothing is printed; the caller decides how to display the result.
    """
    # (key for the mean, key for the median, source column), in output order
    metrics = (
        ('avg_stars', 'median_stars', 'stargazers'),
        ('avg_watchers', 'median_watchers', 'watchers'),
        ('avg_forks', 'median_forks', 'forks'),
        ('avg_issues', 'median_issues', 'open_issues'),
    )
    described = df.describe()

    summary = {'split': name}
    # All means first, then all medians, so the dict keeps its key order
    # (and therefore the column order once it becomes a DataFrame row).
    for mean_key, _, column in metrics:
        summary[mean_key] = described.loc['mean', column]
    for _, median_key, column in metrics:
        summary[median_key] = df[column].median()
    summary['total_repos'] = len(df)
    return summary

def get_summary(df_list):
    """Assemble the per-split statistics into a single comparison table.

    ``df_list`` is an iterable of dicts, each with a ``'data'`` entry (the
    DataFrame of one split) and a ``'split_name'`` entry (its label). Every
    entry is passed through ``get_statistics`` and the resulting dicts become
    the rows of the returned DataFrame, in input order.
    """
    rows = [
        get_statistics(entry['data'], entry['split_name'])
        for entry in df_list
    ]
    return pd.DataFrame(rows)

## Analysis for the SE purpose repositories (notebooks):

`count` is the total number of observations (here, the number of repositories).

In [92]:
# Merge the partial CSV files in SE/ into one frame (merge_csv_results also
# writes repo_info_SE.csv), then display the total number of rows.
df_SE = merge_csv_results('SE')
len(df_SE)

SE/repo_info_SE_0_to_100.csv
100
SE/repo_info_SE_100_to_300.csv
199
SE/repo_info_SE_300_to_376.csv
76


375

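Note: in the GitHub REST API the `watchers` / `watchers_count` fields return the same value as `stargazers_count`; the number of users actually watching a repository is exposed as `subscribers_count`.
https://github.com/orgs/community/discussions/24795

Open question: the `watchers` column analysed here differs from `stargazers`, so it presumably already holds the subscriber count — should we rename it to `subscribers`?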

In [93]:
# Distribution of star counts across the SE notebook repositories.
df_SE['stargazers'].describe()

count      375.000000
mean      3337.874667
std       6929.030706
min        603.000000
25%        910.000000
50%       1377.000000
75%       3107.500000
max      76738.000000
Name: stargazers, dtype: float64

In [94]:
# Distribution of the `watchers` column across the SE notebook repositories.
df_SE['watchers'].describe()

count     375.000000
mean       87.488000
std       174.675587
min         7.000000
25%        26.000000
50%        44.000000
75%        83.000000
max      2741.000000
Name: watchers, dtype: float64

In [95]:
# Distribution of fork counts across the SE notebook repositories.
df_SE['forks'].describe()

count      375.000000
mean       774.456000
std       2622.426783
min         15.000000
25%        173.500000
50%        295.000000
75%        559.500000
max      45856.000000
Name: forks, dtype: float64

In [96]:
# Distribution of open-issue counts across the SE notebook repositories.
df_SE['open_issues'].describe()

count     375.000000
mean       91.608000
std       244.751764
min         0.000000
25%        13.000000
50%        33.000000
75%        76.500000
max      3457.000000
Name: open_issues, dtype: float64

## Analysis for the non SE purpose repositories (notebooks):

`count` is the total number of observations (here, the number of repositories).

In [97]:
# Merge the partial CSV files in non_SE/ into one frame (merge_csv_results
# also writes repo_info_non_SE.csv), then display the total number of rows.
df_non_SE = merge_csv_results('non_SE')
len(df_non_SE)

non_SE/repo_info_non_SE_0_to_250.csv
249
non_SE/repo_info_non_SE_250_to_526.csv
275


524

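Note: in the GitHub REST API the `watchers` / `watchers_count` fields return the same value as `stargazers_count`; the number of users actually watching a repository is exposed as `subscribers_count`.
https://github.com/orgs/community/discussions/24795

Open question: the `watchers` column analysed here differs from `stargazers`, so it presumably already holds the subscriber count — should we rename it to `subscribers`?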

In [98]:
# Distribution of star counts across the non-SE notebook repositories.
df_non_SE['stargazers'].describe()

count      524.000000
mean      3810.137405
std       6563.381568
min         93.000000
25%        909.500000
50%       1587.500000
75%       3641.250000
max      67708.000000
Name: stargazers, dtype: float64

In [99]:
# Distribution of the `watchers` column across the non-SE notebook repositories.
df_non_SE['watchers'].describe()

count     524.000000
mean      156.557252
std       258.470332
min         2.000000
25%        37.000000
50%        69.000000
75%       151.750000
max      2709.000000
Name: watchers, dtype: float64

In [100]:
# Distribution of fork counts across the non-SE notebook repositories.
df_non_SE['forks'].describe()

count      524.000000
mean      1596.656489
std       4244.250459
min          0.000000
25%        331.000000
50%        639.000000
75%       1534.250000
max      83856.000000
Name: forks, dtype: float64

In [101]:
# Distribution of open-issue counts across the non-SE notebook repositories.
df_non_SE['open_issues'].describe()

count     524.000000
mean       38.494275
std       165.071460
min         0.000000
25%         2.000000
50%         9.000000
75%        26.000000
max      2935.000000
Name: open_issues, dtype: float64

## Analysis for the non SE purpose repositories (notebooks):

`count` is the total number of observations (here, the number of repositories).

In [102]:
# NOTE(review): this repeats the earlier merge_csv_results('non_SE') cell; it
# reloads the same folder and rewrites repo_info_non_SE.csv. Candidate for removal.
df_non_SE = merge_csv_results('non_SE')
len(df_non_SE)

non_SE/repo_info_non_SE_0_to_250.csv
249
non_SE/repo_info_non_SE_250_to_526.csv
275


524

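Note: in the GitHub REST API the `watchers` / `watchers_count` fields return the same value as `stargazers_count`; the number of users actually watching a repository is exposed as `subscribers_count`.
https://github.com/orgs/community/discussions/24795

Open question: the `watchers` column analysed here differs from `stargazers`, so it presumably already holds the subscriber count — should we rename it to `subscribers`?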

In [103]:
# NOTE(review): same expression as the earlier non-SE stargazers cell; the output is identical.
df_non_SE['stargazers'].describe()

count      524.000000
mean      3810.137405
std       6563.381568
min         93.000000
25%        909.500000
50%       1587.500000
75%       3641.250000
max      67708.000000
Name: stargazers, dtype: float64

In [104]:
# NOTE(review): same expression as the earlier non-SE watchers cell; the output is identical.
df_non_SE['watchers'].describe()

count     524.000000
mean      156.557252
std       258.470332
min         2.000000
25%        37.000000
50%        69.000000
75%       151.750000
max      2709.000000
Name: watchers, dtype: float64

In [105]:
# NOTE(review): same expression as the earlier non-SE forks cell; the output is identical.
df_non_SE['forks'].describe()

count      524.000000
mean      1596.656489
std       4244.250459
min          0.000000
25%        331.000000
50%        639.000000
75%       1534.250000
max      83856.000000
Name: forks, dtype: float64

In [106]:
# NOTE(review): same expression as the earlier non-SE open_issues cell; the output is identical.
df_non_SE['open_issues'].describe()

count     524.000000
mean       38.494275
std       165.071460
min         0.000000
25%         2.000000
50%         9.000000
75%        26.000000
max      2935.000000
Name: open_issues, dtype: float64

## Analysis for the SE purpose repositories (python):

`count` is the total number of observations (here, the number of repositories).

In [107]:
# Merge the partial CSV files in SE_py/ into one frame (merge_csv_results also
# writes repo_info_SE_py.csv), then display the total number of rows.
df_SE_py = merge_csv_results('SE_py')
len(df_SE_py)

SE_py/repo_info_SE_py_0_to_300.csv
300
SE_py/repo_info_SE_py_300_to_608.csv
308


608

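Note: in the GitHub REST API the `watchers` / `watchers_count` fields return the same value as `stargazers_count`; the number of users actually watching a repository is exposed as `subscribers_count`.
https://github.com/orgs/community/discussions/24795

Open question: the `watchers` column analysed here differs from `stargazers`, so it presumably already holds the subscriber count — should we rename it to `subscribers`?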

In [108]:
# Distribution of star counts across the SE Python repositories.
df_SE_py['stargazers'].describe()

count      608.000000
mean      1973.425987
std       1646.158042
min        216.000000
25%        870.750000
50%       1524.000000
75%       2799.000000
max      11839.000000
Name: stargazers, dtype: float64

In [109]:
# Distribution of the `watchers` column across the SE Python repositories.
df_SE_py['watchers'].describe()

count    608.000000
mean      60.422697
std       54.023705
min        2.000000
25%       25.000000
50%       44.000000
75%       76.000000
max      342.000000
Name: watchers, dtype: float64

In [110]:
# Distribution of fork counts across the SE Python repositories.
df_SE_py['forks'].describe()

count     608.000000
mean      339.601974
std       330.834225
min         9.000000
25%       123.000000
50%       239.000000
75%       429.750000
max      2629.000000
Name: forks, dtype: float64

In [111]:
# Distribution of open-issue counts across the SE Python repositories.
df_SE_py['open_issues'].describe()

count     608.000000
mean       69.156250
std       105.983852
min         0.000000
25%        13.750000
50%        36.000000
75%        80.250000
max      1078.000000
Name: open_issues, dtype: float64

## Analysis for the non SE purpose repositories (python):

`count` is the total number of observations (here, the number of repositories).

In [112]:
# Merge the CSV files in non_SE_py/ into one frame (merge_csv_results also
# writes repo_info_non_SE_py.csv), then display the total number of rows.
df_non_SE_py = merge_csv_results('non_SE_py')
len(df_non_SE_py)

non_SE_py/repo_info_non_SE_py_0_to_110.csv
110


110

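Note: in the GitHub REST API the `watchers` / `watchers_count` fields return the same value as `stargazers_count`; the number of users actually watching a repository is exposed as `subscribers_count`.
https://github.com/orgs/community/discussions/24795

Open question: the `watchers` column analysed here differs from `stargazers`, so it presumably already holds the subscriber count — should we rename it to `subscribers`?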

In [113]:
# Distribution of star counts across the non-SE Python repositories.
df_non_SE_py['stargazers'].describe()

count     110.000000
mean     1583.072727
std      1179.049024
min       224.000000
25%       776.250000
50%      1177.500000
75%      2258.000000
max      6309.000000
Name: stargazers, dtype: float64

In [114]:
# Distribution of the `watchers` column across the non-SE Python repositories.
df_non_SE_py['watchers'].describe()

count    110.000000
mean      73.818182
std       78.117200
min        3.000000
25%       26.000000
50%       53.000000
75%       90.250000
max      494.000000
Name: watchers, dtype: float64

In [115]:
# Distribution of fork counts across the non-SE Python repositories.
df_non_SE_py['forks'].describe()

count     110.000000
mean      431.190909
std       375.813600
min         9.000000
25%       186.000000
50%       353.500000
75%       584.000000
max      2223.000000
Name: forks, dtype: float64

In [116]:
# Distribution of open-issue counts across the non-SE Python repositories.
df_non_SE_py['open_issues'].describe()

count    110.000000
mean      25.745455
std       64.507085
min        0.000000
25%        2.250000
50%        7.000000
75%       24.000000
max      480.000000
Name: open_issues, dtype: float64

## Summary of results

In [117]:
# Pair each split label with its merged frame, in the order the rows should
# appear in the summary table.
dataframes_info = [
    {'data': frame, 'split_name': label}
    for label, frame in (
        ('SE_nb', df_SE),
        ('Non_SE_nb', df_non_SE),
        ('SE_py', df_SE_py),
        ('non_SE_py', df_non_SE_py),
    )
]

# Last expression of the cell: renders the comparison table.
get_summary(dataframes_info)

Unnamed: 0,split,avg_stars,avg_watchers,avg_forks,avg_issues,median_stars,median_watchers,median_forks,median_issues,total_repos
0,SE_nb,3337.874667,87.488,774.456,91.608,1377.0,44.0,295.0,33.0,375
1,Non_SE_nb,3810.137405,156.557252,1596.656489,38.494275,1587.5,69.0,639.0,9.0,524
2,SE_py,1973.425987,60.422697,339.601974,69.15625,1524.0,44.0,239.0,36.0,608
3,non_SE_py,1583.072727,73.818182,431.190909,25.745455,1177.5,53.0,353.5,7.0,110
