# Code to find missing GH repos for advisories that are missing

* Requires you to run ecosystem_identify_missing_vfc_adv.ipynb
    * Generates: ./data/final_data/advisories_missing_GH_repo_20221204.csv
    * Prior cell to last cell in notebook

In [7]:
import pandas as pd
import requests
import time
import glob
import json
from xml.etree import ElementTree
from bs4 import BeautifulSoup

# Script to load current GHSA DB

* Make sure you have the GHSA DB locally cloned
    * set the ghsa_db_path to your cloned location

In [15]:
ghsa_db_path = f"../advisory-database/advisories/github-reviewed/"

# get all JSON files
ghsa_files = glob.glob(f"{ghsa_db_path}*/*/*/*.json")

df_ghsa = pd.DataFrame()

# load all json files to obtain the "PACKAGE" within the references list
for index, file in enumerate(ghsa_files):
    # placeholders for GHSA Info
    id = None
    package_ecosystem = None
    package_name = None
    references_package = None
    
    with open(file, 'r') as f:
        # load JSON
        temp_file = json.load(f)
        
        # set GHSA info
        id = temp_file['id']
        package_ecosystem = temp_file["affected"][0]["package"]["ecosystem"]
        package_name = temp_file["affected"][0]["package"]["name"]
        
        # check each reference for the package (source code) url
        for temp_ref in temp_file["references"]:
            if temp_ref["type"] == "PACKAGE":
                references_package = temp_ref["url"]
        
        # append to df_ghsa
        df_ghsa = pd.concat([df_ghsa, pd.DataFrame([[id, package_ecosystem,
                                                     package_name, references_package]],
                                                   columns=["id", "package_ecosystem",
                                                            "package_name", "references_package"])])
    
        f.close()


In [16]:
df_ghsa.head(n=20)

Unnamed: 0,id,package_ecosystem,package_name,references_package
0,GHSA-9272-59x2-gwf2,npm,ripedm160,
0,GHSA-8g64-9cm2-838j,npm,bugfer-xor,
0,GHSA-8q2c-2396-hf7j,npm,appx-compiler,
0,GHSA-5mm9-55c9-p5r7,npm,mogoose,
0,GHSA-48hw-37g6-3gw4,npm,mx-nested-menu,
0,GHSA-8gc6-65mm-xr6r,npm,bp66,
0,GHSA-whv6-rj84-2vh2,npm,nextcloud-vue-collections,https://github.com/juliushaertl/nextcloud-vue-...
0,GHSA-xwqw-rf2q-xmhf,npm,buefy,https://github.com/buefy/buefy
0,GHSA-77q4-m83q-w76v,npm,browserify-hmr,https://github.com/AgentME/browserify-hmr
0,GHSA-84qj-9qf2-q92r,npm,pm-controls,


In [26]:
# create a df for advisories with missing source code link
ghsa_missing_GH_repo = df_ghsa[df_ghsa['references_package'].isna()]

print(f"Unique GHSA Advisories loaded: {df_ghsa.id.nunique()}")
print(f"GHSA with a package (source code) link: {df_ghsa[df_ghsa['references_package'].notna()].id.nunique()}")
print(f"GHSA without a package (source code) link: {ghsa_missing_GH_repo.id.nunique()}\n")
print(f"Ecosystem breakdown of advisories missing source code link: \n"
      f"{ghsa_missing_GH_repo.package_ecosystem.value_counts()}")

Unique GHSA Advisories loaded: 10783
GHSA with a package (source code) link: 6205
GHSA without a package (source code) link: 4578

Ecosystem breakdown of advisories missing source code link: 
npm          1711
Maven        1364
PyPI          467
Packagist     337
RubyGems      261
Go            238
NuGet         166
crates.io      34
Name: package_ecosystem, dtype: int64


# MAVEN Finds

* API Guide from SonaType
    * https://central.sonatype.org/search/rest-api-guide/

In [None]:
def maven_pom_scm_check(groupId, artifactId, latest_version):
    """Check for SCM in Maven POM file

    Args:
        groupId (_type_): _description_
        artifactId (_type_): _description_
        latest_version (_type_): _description_

    Returns:
        _type_: _description_
    """
    # generate pom xml filepath from Maven search API
    url = f"https://search.maven.org/remotecontent?filepath={'/'.join(groupId.split('.'))}/{artifactId}/{latest_version}/{artifactId}-{latest_version}.pom"
    response = requests.get(url)
    # response.close()
    
    # holder value for SCM repo
    scm_repo = None
    
    try:
        # parse the XML response
        tree = ElementTree.fromstring(response.content)
                
        # iterate through children in tree
        for child in tree.findall('*'):
            # if SCM appear then set the scm_repo
            if 'scm' in child.tag:
                # find the child tag
                scm_tags = tree.findall(child.tag)
                # for each child tag
                for scm_tags_repo in scm_tags:
                    # find all repos in scm child tag
                    scm_repos = scm_tags_repo.findall('*')
                    # pull the text tag for the scm tag
                    for repos in scm_repos:
                        scm_repo = repos.text
                        
        response.close()
        return scm_repo   
    
    except Exception as e:
        print(f"Failure in file request: {url} | {str(e)}")
    

In [None]:
def maven_search(package_name):
    """Search for the Maven project name to see if it exists

    Args:
        package_name (_type_): _description_

    Returns:
        _type_: _description_
    """
    
    groupId = package_name.split(':')[0]
    artifactId = package_name.split(':')[1]
    
    # set url
    url = f"https://search.maven.org/solrsearch/select?q={groupId}+AND+a:{artifactId}&rows=10&wt=json"
    
    response = requests.get(url)
    response.close()
    
    if response.status_code == 200:
        temp = response.json()
        if temp["response"]["numFound"] >= 1:
            print(temp["response"]["numFound"])
            # obtain the latestVersion so we can obtain the pom.xml file
            for temp_response in temp['response']['docs']:
                # make sure the id matches the package_name for the search
                if temp_response['id'] == package_name:
                    latestVersion = temp_response['latestVersion']
    
                    # get pom.xml file
                    temp_scm_repo = maven_pom_scm_check(groupId, artifactId, latestVersion)
                    
                    print(f"{package_name} | repo={temp_scm_repo}")
                    
                    return temp_scm_repo
        else:
            print(f"{package_name} | 0 matches in search")
    else:
        print(f"{url} | Non-200 response")
        return None
    
    

In [None]:
maven_missing = ghsa_missing_GH_repo[ghsa_missing_GH_repo['package_ecosystem']=="Maven"]

maven_missing = maven_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

maven_missing = pd.read_csv(f"./data/final_data/missing_GH_info/advisories_maven_20221204.csv")

# testing = maven_missing[maven_missing['id']=='GHSA-r2pp-x4mm-4999'].reset_index(drop=True)

# maven_missing['github_repo'] = maven_missing.apply(
#     lambda x: maven_search(x['package_name']),
#     axis=1
# )

In [None]:
print(f"Found GH Repo: {maven_missing[maven_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {maven_missing[maven_missing['github_repo'].isna()].id.nunique()}")

# maven_missing.to_csv(f"./data/final_data/missing_GH_info/advisories_maven_20221204.csv", encoding='utf-8', index=False)

# NPM Finds

* We can directly hit the NPM registry at https://registry.npmjs.org/{package_name}
    * The response is a JSON that we can parse for the repository url

In [29]:
def npm_registry_source_code(package_name: str) -> str:
    """Obtain the repository git link

    Args:
        package_name (str): Target package name

    Returns:
        str: Git repo link
    """
    # set URL
    url = f"https://registry.npmjs.org/{package_name}"
    
    response = requests.get(url)
    response.close()
    
    if response.status_code == 200:
        temp = response.json()
        try:
            repo = temp['repository']['url']
            return repo
        except Exception as e:
            print(f"{url} | {str(e)}")
            return None
    else:
        print(f"{url} | Non-200 response")
        return None


In [30]:
npm_missing = ghsa_missing_GH_repo[ghsa_missing_GH_repo['package_ecosystem']=="npm"]

npm_missing = npm_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

print(f"Missing NPM: {npm_missing.id.nunique()}")

npm_missing['github_repo'] = npm_missing.apply(
    lambda x: npm_registry_source_code(x['package_name']),
    axis=1
)

Missing NPM: 1711
https://registry.npmjs.org/ripedm160 | 'repository'
https://registry.npmjs.org/myprolyz | 'repository'
https://registry.npmjs.org/serverhuwenhui | 'repository'
https://registry.npmjs.org/botbait | string indices must be integers
https://registry.npmjs.org/node-air-sdk | 'repository'
https://registry.npmjs.org/web3-eht | 'repository'
https://registry.npmjs.org/uekw1511server | 'repository'
https://registry.npmjs.org/wallet-address-validtaor | 'repository'
https://registry.npmjs.org/iie-viz | 'repository'
https://registry.npmjs.org/gfe-sass | 'repository'
https://registry.npmjs.org/dylmomo | 'repository'
https://registry.npmjs.org/sgqserve | 'repository'
https://registry.npmjs.org/oauth-validator | 'repository'
https://registry.npmjs.org/veval | 'repository'
https://registry.npmjs.org/serverlyr | 'repository'
https://registry.npmjs.org/serveryaozeyan | 'repository'
https://registry.npmjs.org/peiserver | 'repository'
https://registry.npmjs.org/wffserve | 'repository'
htt

KeyboardInterrupt: 

In [None]:
print(f"Found GH Repo: {npm_missing[npm_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {npm_missing[npm_missing['github_repo'].isna()].id.nunique()}")

# npm_missing.to_csv(f"./missing_GH_info/ghsa_npm_20230111.csv", encoding='utf-8', index=False)

# PyPI Finds

In [None]:
def get_soup(url):
    response = requests.get(url=url)  # request response from url
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


class GetPackageProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.home_page = None
        self.bug_tracker = None
        self.documentation = None
        self.source_code = None
        self.url = f"https://pypi.org/project/{self.package_name}/"

        soup = get_soup(self.url)
        table = soup.find_all("div", attrs={"class": "sidebar-section"})

        for each in table:
            for row in each.find_all("a"):
                if row.text.strip() == "Homepage":
                    self.home_page = row.get("href")
                if row.text.strip() == "Bug Tracker":
                    self.bug_tracker = row.get("href")
                if row.text.strip() == "Documentation":
                    self.documentation = row.get("href")
                if row.text.strip() == "Source":
                    self.source_code = row.get("href")

In [None]:
pypi_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="PyPI"]

pypi_missing = pypi_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

pypi_missing["links"] = pypi_missing.apply(
    lambda x: GetPackageProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
pypi_missing["source_code"] = pypi_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# set home page link
pypi_missing["home_page"] = pypi_missing.apply(
    lambda x: x['links'].home_page,
    axis=1
)

# check if github url in either source_code or home_page
pypi_missing["github_repo"] = pypi_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else None,
    axis=1
)
# check home_page for github link
pypi_missing["github_repo"] = pypi_missing.apply(
    lambda x: x['home_page'] if 'github.com/' in str(x['home_page']) else x['github_repo'],
    axis=1
)

In [None]:
print(f"Found GH Repo: {pypi_missing[pypi_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {pypi_missing[pypi_missing['github_repo'].isna()].id.nunique()}")

# Packagist Finds

In [None]:
def get_soup(url):
    response = requests.get(url=url)  # request response from url
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


class PackagistProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.canonical_repo = None
        self.home_page = None
        self.source_code = None
        self.issues = None
        self.url = f"https://packagist.org/packages/{self.package_name}"

        soup = get_soup(self.url)
        table = soup.find_all("div", attrs={"class": "row package-aside"})

        for each in table:
            for row in each.find_all("a"):
                if row.get("title") == "Canonical Repository URL":
                    self.canonical_repo = row.text.strip()
                if row.text.strip() == "Homepage":
                    self.home_page = row.get("href")
                if row.text.strip() == "Source":
                    self.source_code = row.get("href")
                if row.text.strip() == "Issues":
                    self.issues = row.get("href")
                

In [None]:
packagist_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="Packagist"]

packagist_missing = packagist_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

packagist_missing["links"] = packagist_missing.apply(
    lambda x: PackagistProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
packagist_missing["source_code"] = packagist_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# set home page link
packagist_missing["home_page"] = packagist_missing.apply(
    lambda x: x['links'].home_page,
    axis=1
)

# set canonical_repo link
packagist_missing["canonical_repo"] = packagist_missing.apply(
    lambda x: x['links'].canonical_repo,
    axis=1
)

# check if github url in either source_code or home_page
packagist_missing["github_repo"] = packagist_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else None,
    axis=1
)
# check home_page for github link
packagist_missing["github_repo"] = packagist_missing.apply(
    lambda x: x['home_page'] if 'github.com/' in str(x['home_page']) else x['github_repo'],
    axis=1
)

# check home_page for github link
packagist_missing["github_repo"] = packagist_missing.apply(
    lambda x: x['canonical_repo'] if 'github.com/' in str(x['canonical_repo']) else x['github_repo'],
    axis=1
)

In [None]:
print(f"Found GH Repo: {packagist_missing[packagist_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {packagist_missing[packagist_missing['github_repo'].isna()].id.nunique()}")

# RubyGems Finds

In [None]:
def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url=url, headers=headers)  # request response from url
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


class RubyGemsProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.home_page = None
        self.source_code = None
        self.issues = None
        self.url = f"https://rubygems.org/gems/{self.package_name}"

        soup = get_soup(self.url)
        table = soup.find_all("div", attrs={"class": "gem__aside l-col--r--pad"})

        for each in table:
            for row in each.find_all("a"):
                if row.text.strip() == "Homepage":
                    self.home_page = row.get("href")
                    if 'github.com/' in str(self.home_page):
                        # some link cleaning
                        temp_home = self.home_page.split("github.com/")[-1].split('/')[:2]
                        self.home_page = f"https://github.com/{'/'.join(temp_home)}"
                        
                if row.text.strip() == "Source Code":
                    self.source_code = row.get("href")
                    if 'github.com/' in str(self.source_code):
                        # some link cleaning
                        temp_source = self.source_code.split("github.com/")[-1].split('/')[:2]
                        self.source_code = f"https://github.com/{'/'.join(temp_source)}"
                        
                if row.text.strip() == "Bug Tracker":
                    self.issues = row.get("href")
                    if 'github.com/' in str(self.issues):
                        # some link cleaning
                        temp_issues = self.issues.split("github.com/")[-1].split('/')[:2]
                        self.issues = f"https://github.com/{'/'.join(temp_issues)}"
    

In [None]:
rubygems_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="RubyGems"]

rubygems_missing = rubygems_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

rubygems_missing["links"] = rubygems_missing.apply(
    lambda x: RubyGemsProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
rubygems_missing["source_code"] = rubygems_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# set home page link
rubygems_missing["home_page"] = rubygems_missing.apply(
    lambda x: x['links'].home_page,
    axis=1
)

# set issues link
rubygems_missing["issues"] = rubygems_missing.apply(
    lambda x: x['links'].issues,
    axis=1
)

# check home_page for github link
rubygems_missing["github_repo"] = rubygems_missing.apply(
    lambda x: x['home_page'] if 'github.com/' in str(x['home_page']) else None,
    axis=1
)

# check home_page for github link
rubygems_missing["github_repo"] = rubygems_missing.apply(
    lambda x: x['issues'] if 'github.com/' in str(x['issues']) else x['github_repo'],
    axis=1
)

# check if github url in either source_code or home_page
rubygems_missing["github_repo"] = rubygems_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else x['github_repo'],
    axis=1
)

In [None]:
print(f"Found GH Repo: {rubygems_missing[rubygems_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {rubygems_missing[rubygems_missing['github_repo'].isna()].id.nunique()}")

rubygems_missing.to_csv(f"./data/final_data/missing_GH_info/advisories_rubygems_20221204.csv", encoding='utf-8', index=False)

# NuGet Finds

In [3]:
def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url=url, headers=headers)  # request response from url
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


class NuGetProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.home_page = None
        self.source_code = None
        self.url = f"https://www.nuget.org/packages/{self.package_name}"

        soup = get_soup(self.url)
        
        if "Rate limit is exceeded" in soup.text:
            print(soup.text)
            print(f"Sleeping for 60 seconds...")
            time.sleep(60)
            # get soup again
            soup = get_soup(self.url)
            
        table = soup.find_all("div", attrs={"class": "sidebar-section"})
        
        for each in table:
            for row in each.find_all("a"):
                if row.text.strip() == "Project website":
                    self.home_page = row.get("href")
                        
                if row.text.strip() == "Source repository":
                    self.source_code = row.get("href")

In [4]:
nuget_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="NuGet"]

nuget_missing = nuget_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

nuget_missing["links"] = nuget_missing.apply(
    lambda x: NuGetProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
nuget_missing["source_code"] = nuget_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# set home page link
nuget_missing["home_page"] = nuget_missing.apply(
    lambda x: x['links'].home_page,
    axis=1
)

# check home_page for github link
nuget_missing["github_repo"] = nuget_missing.apply(
    lambda x: x['home_page'] if 'github.com/' in str(x['home_page']) else None,
    axis=1
)

# check if github url in either source_code or home_page
nuget_missing["github_repo"] = nuget_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else x['github_repo'],
    axis=1
)

{ "statusCode": 429, "message": "Rate limit is exceeded. Try again in 31 seconds." }
Sleeping for 60 seconds...
{ "statusCode": 429, "message": "Rate limit is exceeded. Try again in 34 seconds." }
Sleeping for 60 seconds...


In [5]:
print(f"Found GH Repo: {nuget_missing[nuget_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {nuget_missing[nuget_missing['github_repo'].isna()].id.nunique()}")

nuget_missing.to_csv(f"./data/final_data/missing_GH_info/advisories_nuget_20221204.csv", encoding='utf-8', index=False)

Found GH Repo: 90
No GH Repo: 21


# Go Finds

In [10]:
def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url=url, headers=headers)  # request response from url
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


class GoProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.source_code = None
        self.url = f"https://pkg.go.dev/{self.package_name}"

        soup = get_soup(self.url)
            
        table = soup.find_all("div", attrs={"class": "UnitMeta-repo"})
        
        for each in table:
            for row in each.find_all("a"):
                self.source_code = row.get("href")

In [16]:
go_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="Go"]

go_missing = go_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

go_missing["links"] = go_missing.apply(
    lambda x: GoProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
go_missing["source_code"] = go_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# check if github url in either source_code or home_page
go_missing["github_repo"] = go_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else None,
    axis=1
)

In [17]:
print(f"Found GH Repo: {go_missing[go_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {go_missing[go_missing['github_repo'].isna()].id.nunique()}")

go_missing.to_csv(f"./data/final_data/missing_GH_info/advisories_go_20221204.csv", encoding='utf-8', index=False)

Found GH Repo: 106
No GH Repo: 5


# Crates.IO Finds

In [23]:
class CratesProjectLinks():
    def __init__(self, package_name):
        self.package_name = package_name
        self.source_code = None
        
        # we can use a direct API here
        self.url = f"https://crates.io/api/v1/crates/{self.package_name}"

        response = requests.get(url=self.url)
        
        if response.status_code == 200:
            self.source_code = response.json()["crate"]["repository"] 

In [24]:
crates_missing = advisories_missing_GH_repo[advisories_missing_GH_repo['ecosystem']=="crates.io"]

crates_missing = crates_missing[['id', 'package_name']].drop_duplicates().reset_index(drop=True)

crates_missing["links"] = crates_missing.apply(
    lambda x: CratesProjectLinks(package_name=x['package_name']),
    axis=1
)

# set source code
crates_missing["source_code"] = crates_missing.apply(
    lambda x: x['links'].source_code,
    axis=1
)

# check if github url in either source_code or home_page
crates_missing["github_repo"] = crates_missing.apply(
    lambda x: x['source_code'] if 'github.com/' in str(x['source_code']) else None,
    axis=1
)

In [25]:
print(f"Found GH Repo: {crates_missing[crates_missing['github_repo'].notna()].id.nunique()}")
print(f"No GH Repo: {crates_missing[crates_missing['github_repo'].isna()].id.nunique()}")

crates_missing.to_csv(f"./data/final_data/missing_GH_info/advisories_crates_20221204.csv", encoding='utf-8', index=False)

Found GH Repo: 17
No GH Repo: 9
