In [60]:
import pandas as pd
import os
from datetime import datetime
from pathlib import Path
from research_questions.configs.configs import Configs

from scipy.stats import pearsonr
from scipy.stats import ttest_ind
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, FuncFormatter, LogFormatter
import pingouin as pg
from scipy.stats import mannwhitneyu
import numpy as np

## Auxiliar Functions

In [61]:
def merge_csv_results(folder_split):


    csv_files = [file for file in os.listdir(folder_split) if file.endswith('.csv')]

    dataframes = []

    for csv_file in csv_files:
        file_path = os.path.join(folder_split, csv_file)
        df = pd.read_csv(file_path)
        print(file_path)
        print(len(df))
        dataframes.append(df)

    merged_df = pd.concat(dataframes, ignore_index=True)

    merged_df.to_csv(f'contributors_data_{folder_split}.csv', index=False)

    return merged_df



def get_mann_test_and_cohen_d(df_1, df_2, column_name):
    """Applies Mann-Whitney hipothesis testing to 2 group sets
    passed as input (2 dataframe columns passed as parameter).
    Also gets Cohen D coefficient between these 2 groups,
    that is a effect size measure if two groups have similar 
    standard deviations and are of the same size 
    """
    df_1 = df_1.dropna(subset=[column_name])
    df_2 = df_2.dropna(subset=[column_name])

    statistic, p_value = mannwhitneyu(df_1[column_name], df_2[column_name])
    print(f"Mann-Whitney U statistic: {statistic}")
    # printing more decimals:
    formatted_p_value = "{:.10f}".format(p_value)
    print(f"P-value: {formatted_p_value}")
    
    # returning the absolute value of cohen d:
    cohen_d = pg.compute_effsize(df_1[column_name], df_2[column_name], eftype='cohen')
    cohen_d = np.abs(cohen_d)
    print(f"Absolute value of cohen d: {cohen_d}")
    if cohen_d <= 0.2:
        print(f"Very small difference between Notebooks vs Python (cohen D <= 0.2)")
    if cohen_d > 0.2 and cohen_d <= 0.5:
        print(f"Small difference between Notebooks vs Python ( 0.2 > cohen D <= 0.5)")
    if cohen_d > 0.5 and cohen_d <= 0.8:
        print(f"Medium difference between Notebooks vs Python ( 0.5 > cohen D <= 0.8)")
    if cohen_d > 0.8:
        print(f"Big difference between Notebooks vs Python (cohen D > 0.8)")



Choosing the current datetime to be 04/27/2024 (as set in configs.json file):

In [62]:
current_time = '2024-04-27'

In [63]:
def get_account_age(account_creation_datetime, current_time):
    
    account_creation_date = datetime.strptime(
        account_creation_datetime, '%Y-%m-%dT%H:%M:%SZ')
    
    current_datetime = datetime.strptime(
        current_time, '%Y-%m-%d')
    
    elapsed_years = current_datetime.year - account_creation_date.year
    
    
    return elapsed_years

In [64]:
def get_number_contributors_per_repo(df, split, print_results=True):

    distinct_user_counts = df.groupby('repo_url')['user_id'].nunique()
    
    # Convert the series to a DataFrame
    df_num_contributors_per_repo = distinct_user_counts.reset_index()

    # Rename the columns for clarity
    df_num_contributors_per_repo.columns = ['repo_url', 'num_contributors']
    df_num_contributors_per_repo['split'] = split

    average_distinct_users = distinct_user_counts.mean()
    max_distinct_users = distinct_user_counts.max()
    min_distinct_users = distinct_user_counts.min()
    median_distinct_users = distinct_user_counts.median()

    max_repo_url = distinct_user_counts.idxmax()

    if print_results:
        print(f"Average number of contributors per repository: {average_distinct_users}")
        print(f"Maximum number of contributors per repository: {max_distinct_users} ({max_repo_url})")
        print(f"Minimum number of contributors per repository: {min_distinct_users}")
        print(f"Median number of contributors per repository: {median_distinct_users}")

    contributors_info = {
        "avg_unique_contributors": average_distinct_users,
        "median_contributors": median_distinct_users,
        "max_contributors":  max_distinct_users,
        "min_contributors": min_distinct_users,
        "total_contributors": len(df)
        }


    return contributors_info, df_num_contributors_per_repo

Functions to generate data summary:

In [65]:
def get_statistics_age(df, name):

    
    desc = df.describe()
    stats = {
        'split' : name,
        'avg_age': desc.loc['mean', 'account_age'],
        'median_age': df['account_age'].median(),
        'max_age': df['account_age'].max(),
        'min_age': df['account_age'].min(),
        'total_contributors': desc.loc['count', 'account_age'],
    }
    return stats

def get_summary_age(df_list): 

    all_stats = []

    for item in df_list:
        df = item['data']
        name = item['split_name']
        stats = get_statistics_age(df,name)
        all_stats.append(stats)


    combined_stats = pd.DataFrame(all_stats)

    return combined_stats

In [66]:
def get_statistics_contributors(df, name):

    
    stats, _ = get_number_contributors_per_repo(df=df, print_results=False, split='optional')
    stats['split']=name
    
    return stats

def get_summary_contributors(df_list): 
    
    all_stats = []

    for item in df_list:
        
        df = item['data']
        name = item['split_name']
        
        stats = get_statistics_contributors(df,name)
        
        all_stats.append(stats)


    combined_stats = pd.DataFrame(all_stats)

    return combined_stats

## Analysis of the SE split:

In [67]:
df_contributors_SE = merge_csv_results('SE')

SE/contributors_SE_250_to_377.csv
4723
SE/contributors_SE_0_to_250.csv
6191


Total number of contributors of the Notebook data split for Software Engineering purposes:

In [68]:
len(df_contributors_SE)

10914

Number of unique repository urls in the Notebook SE split:

In [69]:
df_contributors_SE['repo_url'].nunique()

375

In [70]:
df_contributors_SE['account_age'] = df_contributors_SE['created_at'].apply(
    lambda x: get_account_age(x, current_time))

### Average year account of contributors for the SE split:

Statistics (mean, median, quartiles, standard deviation, etc) for the ages of experience of developers for the Notebook dataset split for Software Engineering purposes:

('count', below, is the number of unqiue contributors of the Notebook dataset split for Software Engineering purposes)

In [71]:
df_contributors_SE['account_age'].describe()

count    10914.000000
mean         8.969947
std          3.448408
min          0.000000
25%          6.000000
50%          9.000000
75%         12.000000
max         16.000000
Name: account_age, dtype: float64

### Unique contributors per repository for the Notebook dataset split for Software Engineering purposes:


In [72]:
_, contributors_per_repo_SE = get_number_contributors_per_repo(df_contributors_SE, split='Notebook (SE)')

Average number of contributors per repository: 29.104
Maximum number of contributors per repository: 406 (https://github.com/GoogleCloudPlatform/python-docs-samples)
Minimum number of contributors per repository: 1
Median number of contributors per repository: 8.0


## Location of contributors  for the Notebook dataset split for Software Engineering purposes:



('count', below, is the number of unqiue contributors of the Notebook dataset split for Software Engineering purposes)

In [73]:
df_SE_location = pd.read_csv("country_contributors_SE.csv")
len(df_SE_location)

10914

Saving the number of contributors by location, on a txt file, for the Notebook dataset split for Software Engineering purposes

In [74]:
with open('location_SE.txt', 'w') as f:
    for key,item in df_SE_location['location'].value_counts().items():
        print(key, item, file=f)
        

Number of contributors, by country, for the Notebook dataset split for Software Engineering purposes

In [75]:
df_SE_location['country'].value_counts()

country
absent location in GitHub    4893
United States                2415
Germany                       373
United Kingdom                356
India                         318
                             ... 
Croatia                         1
Costa Rica                      1
Sudan                           1
Panama                          1
North Macedonia                 1
Name: count, Length: 107, dtype: int64

Saving the number of contributors by country for the Notebook dataset split for Software Engineering purposes:

In [76]:
with open('country_SE.txt', 'w') as f:
    for key,item in df_SE_location['country'].value_counts().items():
        print(key, item, file=f)
        

# Analysis of the Notebook data split for educational purposes:

In [77]:
df_contributors_non_SE = merge_csv_results('non_SE')

non_SE/contributors_non_SE_250_to_526.csv
5868
non_SE/contributors_non_SE_0_to_250.csv
5544


Number of unique contributors of the Notebook dataset split for educational purposes:

In [78]:
len(df_contributors_non_SE)

11412

Number of unique urls of the Notebook dataset split for educational purposes:

In [79]:
df_contributors_non_SE['repo_url'].nunique()

517

In [80]:
df_contributors_non_SE['account_age'] = df_contributors_non_SE['created_at'].apply(
    lambda x: get_account_age(x, current_time))

### Average year account of contributors for the  Notebook dataset split for educational purposes:

('count', below, is the number of unqiue contributors of the Notebook dataset split for educational purposes):

In [81]:
df_contributors_non_SE['account_age'].describe()

count    11412.000000
mean         7.682878
std          3.440256
min          0.000000
25%          5.000000
50%          7.000000
75%         10.000000
max         16.000000
Name: account_age, dtype: float64

### Contributors per repository for the Notebook dataset split for educational purposes:

In [82]:
_,contributors_per_repo_non_SE = get_number_contributors_per_repo(df_contributors_non_SE, split="Notebook (Edu)")

Average number of contributors per repository: 22.073500967117987
Maximum number of contributors per repository: 477 (https://github.com/ossamamehmood/Hacktoberfest2022)
Minimum number of contributors per repository: 1
Median number of contributors per repository: 5.0


## Location of contributors for the Notebook dataset split for educational purposes:



Number of unique contributors for the  Notebook dataset split for educational purposes:

In [83]:
df_non_SE_location = pd.read_csv("country_contributors_non_SE.csv")
len(df_non_SE_location)

11412

Location of contributors  for the Notebook dataset split for educational purposes:

In [84]:
df_non_SE_location['location'].value_counts()

location
India              334
London              86
Seattle, WA         79
San Francisco       73
Bangalore           63
                  ... 
Denizli, Turkey      1
Ankara / TURKEY      1
Izmir,TURKEY         1
TURKEY               1
Sanremo              1
Name: count, Length: 2316, dtype: int64

Saving the number of contributors by location for the Notebook dataset split for educational  purposes:

In [85]:
with open('location_non_SE.txt', 'w') as f:
    for key,item in df_non_SE_location['location'].value_counts().items():
        print(key, item, file=f)
        

Number of contributors by country for the Notebook dataset split for educational purposes

In [86]:
df_non_SE_location['country'].value_counts()

country
absent location in GitHub    4820
United States                1685
India                        1445
China                         402
Germany                       273
                             ... 
Guatemala                       1
Iceland                         1
Cuba                            1
Jamaica                         1
Lebanon                         1
Name: count, Length: 113, dtype: int64

Saving the number of contributors by country for the Notebook dataset split for educational purposes

In [87]:
with open('country_non_SE.txt', 'w') as f:
    for key,item in df_non_SE_location['country'].value_counts().items():
        print(key, item, file=f)
        

## Analysis of the SE split for python repositories:

In [88]:
df_contributors_SE_py = merge_csv_results('SE_py')

SE_py/contributors_SE_py_500_to_608.csv
4281
SE_py/contributors_SE_py_250_to_500.csv
8343
SE_py/contributors_SE_py_0_to_250.csv
8571


Total number of unique contributors of the Python dataset split for Software Engineering purposes

In [89]:
len(df_contributors_SE_py)

21195

Number of unique urls of the Python dataset split for Software Engineering purposes:

In [90]:
df_contributors_SE_py['repo_url'].nunique()

606

In [91]:
df_contributors_SE_py['account_age'] = df_contributors_SE_py['created_at'].apply(
    lambda x: get_account_age(x, current_time))

### Average year account of contributors for the SE split for python repositories:

Statistics (mean, median, quartiles, standard deviation, etc) for the ages of experience of developers for the Python dataset split for Software Engineering purposes:

In [92]:
df_contributors_SE_py['account_age'].describe()

count    21195.000000
mean        10.122906
std          3.572810
min          0.000000
25%          8.000000
50%         11.000000
75%         13.000000
max         16.000000
Name: account_age, dtype: float64

### Unique Contributors per repository for the Python dataset split for Software Engineering purposes:

In [93]:
_, contributors_per_repo_SE_py = get_number_contributors_per_repo(df_contributors_SE_py, 
                                                                  split='Python (SE)')

Average number of contributors per repository: 34.975247524752476
Maximum number of contributors per repository: 426 (https://github.com/duneanalytics/spellbook)
Minimum number of contributors per repository: 1
Median number of contributors per repository: 13.0


## Location of contributors for the Python dataset split for Software Engineering purposes:

In [94]:
df_SE_py_location = pd.read_csv("country_contributors_SE_py.csv")
len(df_SE_py_location)

21195

Number of contributors by location for the Python dataset split for Software Engineering purposes

In [95]:
df_SE_py_location['location'].value_counts()

location
Germany                      245
San Francisco, CA            161
France                       159
London, UK                   144
London                       139
                            ... 
Derby, UK                      1
<script>alert(1)</script>      1
Lost in the Web                1
Québec City                    1
Shaanxi Province               1
Name: count, Length: 3545, dtype: int64

Saving the number of contributors by location, on a txt file, for the Python dataset split for Software Engineering purposes

In [96]:
with open('location_SE_py.txt', 'w') as f:
    for key,item in df_SE_py_location['location'].value_counts().items():
        print(key, item, file=f)
        

Number of contributors by country for the Python dataset split for Software Engineering purposes

In [97]:
df_SE_py_location['country'].value_counts()

country
absent location in GitHub    8625
United States                3527
Germany                       962
China                         721
United Kingdom                721
                             ... 
Malta                           1
Yemen                           1
Angola                          1
Sierra Leone                    1
Brunei                          1
Name: count, Length: 129, dtype: int64

Saving the number of contributors by country for the Python dataset split for Software Engineering purposes

In [98]:
with open('country_SE_py.txt', 'w') as f:
    for key,item in df_SE_py_location['country'].value_counts().items():
        print(key, item, file=f)
        

## Analysis of the non SE split for python repositories:

In [99]:
df_contributors_non_SE_py = merge_csv_results('non_SE_py')

non_SE_py/contributors_non_SE_py_0_to_110.csv
2034


Unique contributors per repository for the Python dataset split for educational purposes

In [100]:
len(df_contributors_non_SE_py)

2034

Number of GitHub urls for the Python dataset split for educational purposes

In [101]:
df_contributors_non_SE_py['repo_url'].nunique()

109

In [102]:
df_contributors_non_SE_py['account_age'] = df_contributors_non_SE_py['created_at'].apply(
    lambda x: get_account_age(x, current_time))

### Average year account of contributors for the Python dataset split for educational purposes

Statistics (mean, median, quartiles, standard deviation, etc) for the ages of experience of developers for the Python dataset split for educational purposes

In [103]:
df_contributors_non_SE_py['account_age'].describe()

count    2034.000000
mean        8.546214
std         3.553827
min         0.000000
25%         6.000000
50%         8.000000
75%        11.000000
max        16.000000
Name: account_age, dtype: float64

### Contributors per repository for the Python dataset split for educational purposes:

In [104]:
_, contributors_per_repo_non_SE_py = get_number_contributors_per_repo(df_contributors_non_SE_py,
                                                                      split='Python (Educational)')

Average number of contributors per repository: 18.660550458715598
Maximum number of contributors per repository: 318 (https://github.com/Ishaan28malik/Hacktoberfest-2023)
Minimum number of contributors per repository: 1
Median number of contributors per repository: 4.0


## Location of contributors for the Python dataset split for educational purposes:

In [105]:
df_non_SE_py_location = pd.read_csv("country_contributors_non_SE_py.csv")
len(df_non_SE_py_location)

2034

 Location of contributors for the Python dataset split for educational purposes

In [106]:
df_non_SE_py_location['location'].value_counts()

location
India                           44
Seattle, WA                     16
Beijing, China                  15
France                          15
United States                   13
                                ..
Herndon, VA                      1
Seattle, Washington              1
NL                               1
London, UK / Porto, Portugal     1
Peking                           1
Name: count, Length: 720, dtype: int64

Saving the number of contributors by location in a txt file for the Python dataset split for educational purposes

In [107]:
with open('location_non_SE_py.txt', 'w') as f:
    for key,item in df_non_SE_py_location['location'].value_counts().items():
        print(key, item, file=f)
        

Number of contributors by country in a txt file for the Python dataset split for educational purposes

In [108]:
df_non_SE_py_location['country'].value_counts()

country
absent location in GitHub    763
United States                299
India                        177
China                        110
United Kingdom                66
                            ... 
Luxembourg                     1
Bangladesh                     1
Côte d'Ivoire                  1
Montenegro                     1
Malawi                         1
Name: count, Length: 78, dtype: int64

 Saving the number of contributors by location in a txt file for the Python dataset split for educational purposes

In [109]:
with open('country_non_SE_py.txt', 'w') as f:
    for key,item in df_non_SE_py_location['country'].value_counts().items():
        print(key, item, file=f)
        

## Summary

In [110]:
dataframes_info = [
    {'data': df_contributors_SE, 'split_name': 'SE_nb'},
    {'data': df_contributors_non_SE, 'split_name': 'non_SE_nb'},
    {'data': df_contributors_SE_py, 'split_name': 'SE_py'},
    {'data': df_contributors_non_SE_py, 'split_name': 'non_SE_py'},
]



In [111]:
get_summary_age(dataframes_info)

Unnamed: 0,split,avg_age,median_age,max_age,min_age,total_contributors
0,SE_nb,8.969947,9.0,16,0,10914.0
1,non_SE_nb,7.682878,7.0,16,0,11412.0
2,SE_py,10.122906,11.0,16,0,21195.0
3,non_SE_py,8.546214,8.0,16,0,2034.0


In [112]:
get_summary_contributors(dataframes_info)

Unnamed: 0,avg_unique_contributors,median_contributors,max_contributors,min_contributors,total_contributors,split
0,29.104,8.0,406,1,10914,SE_nb
1,22.073501,5.0,477,1,11412,non_SE_nb
2,34.975248,13.0,426,1,21195,SE_py
3,18.66055,4.0,318,1,2034,non_SE_py


In [113]:
dev_experience_notebook = pd.concat([df_contributors_SE,
                                 df_contributors_non_SE])

dev_experience_python = pd.concat([df_contributors_SE_py,
                               df_contributors_non_SE_py])

num_contributors_notebook = pd.concat([contributors_per_repo_SE,
                                      contributors_per_repo_non_SE])


num_contributors_python = pd.concat([contributors_per_repo_SE_py,
                                      contributors_per_repo_non_SE_py])

## Statistical Tests

Using Mann-Whitney test to observe if there are any statistical significance between Notebook vs Python repositories

Statistical significance in developers years of experience between Notebooks vs Python repositories:

In [114]:
get_mann_test_and_cohen_d(dev_experience_notebook, dev_experience_python, 'account_age')

Mann-Whitney U statistic: 189044472.0
P-value: 0.0000000000
Absolute value of cohen d: 0.4708736422264135
Small difference between Notebooks vs Python ( 0.2 > cohen D <= 0.5)


Statistical significance in number of unique contributors (team size) between Notebooks vs Python repositories:

In [115]:
get_mann_test_and_cohen_d(num_contributors_notebook, num_contributors_python, 'num_contributors')

Mann-Whitney U statistic: 262574.5
P-value: 0.0000000010
Absolute value of cohen d: 0.12742558505136758
Very small difference between Notebooks vs Python (cohen D <= 0.2)
