In [10]:
import os

import pandas as pd
from github import Auth
from github import Github

In [11]:
# Read the GitHub personal access token from the environment instead of
# embedding it in the notebook. The token that used to be hardcoded here has
# been exposed to anyone with access to this file and should be revoked.
access_token = os.environ.get('GITHUB_TOKEN')
if not access_token:
    raise RuntimeError('Set the GITHUB_TOKEN environment variable to a GitHub personal access token')

auth = Auth.Token(access_token)
g = Github(auth=auth)
# Fetching the authenticated user's login also verifies that the token works.
login = g.get_user().login

In [12]:
# CSV file that collected repository rows are read from and appended to.
# Set PROJECT_DATA_FILE to point the notebook at a different location; the
# original machine-specific path is kept as the default.
data_file = os.environ.get(
    'PROJECT_DATA_FILE',
    '/Users/yelderiny/Projects/Dissertation/Data/project-data3.csv',
)

In [13]:
# Search GitHub with the "license:mit" qualifier, then walk the paginated
# result once so the repositories can be indexed and sliced as a plain list.
mit = g.search_repositories(query="license:mit")
repos = [found_repo for found_repo in mit]

In [14]:
# Load the rows collected by previous runs so already-processed repositories
# can be skipped. A missing file (first run) is treated the same as an empty
# one: start from an empty DataFrame.
try:
    existing_data = pd.read_csv(data_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    existing_data = pd.DataFrame()

In [15]:
# Three alternative weightings of the pull-request features. Each split is a
# list of (attribute name, weight) pairs.
equal_weights_pr = [
    (feature, 0.25)
    for feature in ('commits', 'additions', 'deletions', 'changed_files')
]
weight_split1 = list({
    'commits': 0.2,
    'additions': 0.4,
    'deletions': 0.3,
    'changed_files': 0.1,
}.items())
weight_split2 = list({
    'commits': 0.1,
    'additions': 0.3,
    'deletions': 0.2,
    'changed_files': 0.4,
}.items())

# Order matters: downstream code unpacks the scores positionally.
pr_weight_splits = [equal_weights_pr, weight_split1, weight_split2]

In [16]:
# Three alternative weightings of the contributor features. Each split is a
# list of (attribute name, weight) pairs.
# Note: the "equal" split uses 0.333 per feature, so it sums to 0.999, not 1.
equal_weights_xp = [
    (feature, 0.333)
    for feature in ('contributions', 'followers', 'public_repos')
]
weight_split3 = list({
    'contributions': 0.4,
    'followers': 0.3,
    'public_repos': 0.3,
}.items())
weight_split4 = list({
    'contributions': 0.5,
    'followers': 0.35,
    'public_repos': 0.15,
}.items())

# Order matters: downstream code unpacks the scores positionally.
con_xp_splits = [equal_weights_xp, weight_split3, weight_split4]

In [17]:
def normalize_features(raw_feature, min_value, max_value):
    """Min-max scale ``raw_feature`` relative to ``[min_value, max_value]``.

    Returns 0.5 when the range is degenerate (``min_value == max_value``),
    which also avoids dividing by zero; otherwise returns
    ``(raw_feature - min_value) / (max_value - min_value)``.
    """
    if min_value == max_value:
        return 0.5
    offset = raw_feature - min_value
    return offset / (max_value - min_value)


def assign_points(obj, dict, pr: bool):
    """Score one object under every configured weight split.

    Args:
        obj: Object whose attributes are read with ``getattr`` for each
            feature name in the weight splits.
        dict: Mapping of feature name to a ``(min, max)`` pair used to
            normalize that feature. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)
        pr: When true, score with ``pr_weight_splits``; otherwise with
            ``con_xp_splits``.

    Returns:
        A list with one weighted sum of normalized features per weight split.
        If anything goes wrong while scoring, the error is printed and a list
        of zeros with one entry per split is returned instead.
    """
    weight_splits = pr_weight_splits if pr else con_xp_splits
    try:
        return [
            sum(weight * normalize_features(getattr(obj, key), dict[key][0], dict[key][1])
                for key, weight in split)
            for split in weight_splits
        ]
    except Exception as e:
        # The label is built outside the f-string: reusing the same quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        label = 'pull request' if pr else 'object'
        # Fall back to a placeholder so a missing ``url`` attribute cannot
        # make the error handler itself raise AttributeError.
        url = getattr(obj, 'url', '<unknown url>')
        print(f'Error processing {label} {url}: {e}')
        return [0] * len(weight_splits)


def get_points(objs, my_dict, pr: bool):
    """Total the per-object scores for each of the three weight splits.

    Args:
        objs: Iterable of objects to score with ``assign_points``.
        my_dict: Mapping of feature name to ``(min, max)``, forwarded to
            ``assign_points``.
        pr: Forwarded to ``assign_points`` to select the weight splits.

    Returns:
        A list of three totals, one per weight split, each rounded to three
        decimal places. For empty ``objs`` every total is 0.
    """
    # Score every object exactly once. Scoring inside the per-split loop
    # repeated the work (and any error output) three times per object and
    # could not handle one-shot iterables.
    scored = [assign_points(obj, my_dict, pr) for obj in objs]
    return [round(sum(points[i] for points in scored), 3) for i in range(3)]

In [18]:
# --- Collection parameters -------------------------------------------------
START_INDEX = 200          # position in `repos` at which this run starts
MIN_PULL_REQUESTS = 5      # repos with fewer usable pull requests are skipped
SUPPORTED_LANGUAGES = ('Java', 'Python', 'C++', 'Go', 'JavaScript', 'TypeScript', 'Swift')
PR_FEATURE_KEYS = ('commits', 'additions', 'deletions', 'changed_files')
CONTRIBUTOR_FEATURE_KEYS = ('contributions', 'followers', 'public_repos')


def _feature_ranges(objs, keys):
    """Return ``{key: (min, max)}`` of each attribute in ``keys`` over ``objs``.

    Every range starts at ``(inf, -inf)``, so that is what an empty ``objs``
    yields.
    """
    ranges = {key: (float('inf'), float('-inf')) for key in keys}
    for obj in objs:
        for key, (low, high) in ranges.items():
            value = getattr(obj, key)
            ranges[key] = (min(value, low), max(value, high))
    return ranges


# For each repository: skip it if already recorded or in an unsupported
# language, otherwise score its merged pull requests and its contributors and
# append one row to the CSV. Any error is printed and the loop moves on to the
# next repository.
for repo in repos[START_INDEX:]:
    try:
        if not existing_data.empty and repo.full_name in existing_data['name'].values:
            print(f'Skipping processing for repo: {repo.full_name}')
            print('-' * 30)
            continue

        if repo.language not in SUPPORTED_LANGUAGES:
            print('Foreign language')
            print(f'Skipping processing for repo: {repo.full_name}')
            print('-' * 30)
            continue

        # Get the repo pull requests
        print(f'Processing {repo.full_name}')

        # NOTE(review): 'desc' looks like a `direction` value rather than a
        # `sort` value for the pulls endpoint - confirm what ordering is
        # actually returned, because 'age' below depends on which pull
        # request ends up last in the list.
        # NOTE(review): base='master' only matches pull requests targeting a
        # branch named "master"; repositories whose main branch has another
        # name will report zero pull requests - confirm this is intended.
        pr_pages = repo.get_pulls(state='closed', base='master', sort='desc')
        contributor_pages = repo.get_contributors()

        print(f'Number of pull request: {pr_pages.totalCount}')
        # Keep the count so the row below does not need a second API request.
        contributor_count = contributor_pages.totalCount
        print(f'Number of contributors: {contributor_count}')

        if pr_pages.totalCount == 0:
            print('No pull requests in this repo')
            print('-' * 30)
            continue

        all_pull_requests = list(pr_pages)
        contributors = list(contributor_pages)

        # Keep only merged pull requests that added or deleted at least one line.
        pull_requests = [pr for pr in all_pull_requests if pr.merged and (pr.additions or pr.deletions)]

        pr_features = _feature_ranges(pull_requests, PR_FEATURE_KEYS)

        print(pr_features)
        print(f'Filtered pull requests: {len(pull_requests)}')

        if len(pull_requests) < MIN_PULL_REQUESTS:
            print('Not enough pull requests to process')
            print('-' * 30)
            continue

        con_features = _feature_ranges(contributors, CONTRIBUTOR_FEATURE_KEYS)

        print(con_features)

        pr_points1, pr_points2, pr_points3 = get_points(pull_requests, pr_features, pr=True)
        xp_points1, xp_points2, xp_points3 = get_points(contributors, con_features, pr=False)

        print(f'Pull Request Points: {pr_points1}, {pr_points2}, {pr_points3}')
        print(f'Contributor Points: {xp_points1}, {xp_points2}, {xp_points3}')

        print('Adding data to dataframe')
        new_data = {
            'name': repo.full_name,
            'language': repo.language,
            'pull_requests': len(pull_requests),
            'size': repo.size,
            'contributors': contributor_count,
            # Days between the repository's creation and the merge time of
            # the last pull request in the filtered list.
            'age': (pull_requests[-1].merged_at - repo.created_at).days,
            'contributor_xp1': xp_points1,
            'contributor_xp2': xp_points2,
            'contributor_xp3': xp_points3,
            'pr_points1': pr_points1,
            'pr_points2': pr_points2,
            'pr_points3': pr_points3
        }

        # Append immediately so progress survives an interrupted run. The
        # header is written only while no rows are known yet.
        # NOTE(review): without index=False each appended row also carries the
        # DataFrame index as a leading column; left unchanged so new rows keep
        # the same layout as rows already in the file.
        new_entry_df = pd.DataFrame([new_data])
        new_entry_df.to_csv(data_file, mode='a', header=existing_data.empty)

        # Track the new row in memory so later iterations see it as recorded
        # and no further header is written.
        existing_data = pd.concat([existing_data, new_entry_df], ignore_index=True)

        print('-' * 30)

    except Exception as e:
        # Best effort: report the failure and carry on with the next repo.
        print(f'Error processing repo {repo.full_name}: {e}')
        print('-' * 30)

Skipping processing for repo: kdn251/interviews
------------------------------
Processing xkcoding/spring-boot-demo
Number of pull request: 26
Number of contributors: 11
{'commits': (1, 4), 'additions': (2, 1103), 'deletions': (0, 3), 'changed_files': (1, 15)}
Filtered pull requests: 4
Not enough pull requests to process
------------------------------
Processing wuyouzhuguli/SpringAll
Number of pull request: 11
Number of contributors: 2
{'commits': (1, 1), 'additions': (1, 1), 'deletions': (1, 1), 'changed_files': (1, 1)}
Filtered pull requests: 1
Not enough pull requests to process
------------------------------
Skipping processing for repo: crossoverJie/JCSprout
------------------------------
Skipping processing for repo: linlinjava/litemall
------------------------------
Skipping processing for repo: shuzheng/zheng
------------------------------
Skipping processing for repo: winterbe/java8-tutorial
------------------------------
Skipping processing for repo: justauth/JustAuth
------

Request GET /repos/RyanFehr/HackerRank/pulls/227 failed with 403: Forbidden
Setting next backoff to 93.048577s


{'commits': (1, 28), 'additions': (1, 661), 'deletions': (0, 53), 'changed_files': (1, 14)}
Filtered pull requests: 141
{'contributions': (1, 514), 'followers': (2, 497), 'public_repos': (7, 284)}
Pull Request Points: 9.423, 9.683, 10.406
Contributor Points: 1.361, 1.364, 1.34
Adding data to dataframe
------------------------------
Processing proyecto26/react-native-inappbrowser
Number of pull request: 53
Number of contributors: 39
{'commits': (1, 21), 'additions': (1, 8219), 'deletions': (0, 6538), 'changed_files': (1, 28)}
Filtered pull requests: 45
{'contributions': (1, 300), 'followers': (0, 1397), 'public_repos': (0, 782)}
Pull Request Points: 3.0, 2.649, 3.049
Contributor Points: 2.859, 2.693, 2.366
Adding data to dataframe
------------------------------
Processing harjot-oberai/Croller
Number of pull request: 9
Number of contributors: 8
{'commits': (1, 10), 'additions': (1, 412), 'deletions': (0, 127), 'changed_files': (1, 17)}
Filtered pull requests: 6
{'contributions': (1, 43)