### Get Vulnerabilities from OSV

In [2]:
import os
import glob
import json
import collections.abc
import pandas as pd
import sys
sys.path.append('../utils')
import database as db

VUL_PATH = "vuls.json"
SFP_PATH = "sfp.csv"

In [6]:
# transform all vulnerabilities into a single json file
def data_transform(in_path, out_path):
    files = glob.iglob(in_path+"/*")
    result = list()
    for file in files:
        with open(file, 'r') as infile:
            result.append(json.load(infile))
    with open(out_path, 'w') as output_file:
        json.dump(result, output_file, indent=4)

if not os.path.exists(VUL_PATH):
    os.system("curl https://osv-vulnerabilities.storage.googleapis.com/crates.io/all.zip -o all.zip")
    os.system("unzip all.zip -d rust_vuls")
    data_transform("./rust_vuls", VUL_PATH)
    os.system("rm all.zip")
    os.system("rm -r rust_vuls")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2261k  100 2261k    0     0  2049k      0  0:00:01  0:00:01 --:--:-- 2050k


Archive:  all.zip
  inflating: rust_vuls/GHSA-2226-4v3c-cff8.json  
  inflating: rust_vuls/GHSA-22q8-ghmq-63vf.json  
  inflating: rust_vuls/GHSA-2326-pfpj-vx3h.json  
  inflating: rust_vuls/GHSA-2367-c296-3mp2.json  
  inflating: rust_vuls/GHSA-23rx-c3g5-hv9w.json  
  inflating: rust_vuls/GHSA-24g6-5rx7-58wj.json  
  inflating: rust_vuls/GHSA-253q-9q78-63x4.json  
  inflating: rust_vuls/GHSA-255r-3prx-mf99.json  
  inflating: rust_vuls/GHSA-25mx-8f3v-8wh7.json  
  inflating: rust_vuls/GHSA-275g-g844-73jh.json  
  inflating: rust_vuls/GHSA-27vq-hv74-7cqp.json  
  inflating: rust_vuls/GHSA-27wg-99g8-2v4v.json  
  inflating: rust_vuls/GHSA-286m-6pg9-v42v.json  
  inflating: rust_vuls/GHSA-287x-9rff-qvcg.json  
  inflating: rust_vuls/GHSA-28m8-9j7v-x499.json  
  inflating: rust_vuls/GHSA-28p5-7rg4-8v99.json  
  inflating: rust_vuls/GHSA-28ph-f7gx-fqj8.json  
  inflating: rust_vuls/GHSA-28r9-pq4c-wp3c.json  
  inflating: rust_vuls/GHSA-2969-8hh9-57jc.json  
  inflating: rust_vuls/GHSA-29c2

### Get SFP mappings from cwe.mitre.org

In [5]:
if not os.path.exists(SFP_PATH):
    os.system("curl https://cwe.mitre.org/data/csv/888.csv.zip -o sfp.zip")
    os.system("unzip -q -p sfp.zip > sfp.csv")
    os.system("rm sfp.zip")

### Get  metadata of Rust packages from crates.io

In [None]:
# Download https://static.crates.io/db-dump.tar.gz
# expect ~1 GB download
if not os.path.exists("crates_io_db"):
    os.system("wget https://static.crates.io/db-dump.tar.gz")
    os.system("tar -xzf db-dump.tar.gz")
    os.system("rm db-dump.tar.gz")
    # and extract its only directory as create_io_db/
    # usually this directory is named as the timestamp of download
    os.system("mv 20*/ crates_io_db")

df_crates = pd.read_csv("crates_io_db/data/crates.csv")
df_crates.to_sql(name="crates", con=db.conn, if_exists="replace", index=False)

df_categories = pd.read_csv("crates_io_db/data/categories.csv")
df_categories.to_sql(name="categories", con=db.conn, if_exists="replace", index=False)

df_crates_categories = pd.read_csv("crates_io_db/data/crates_categories.csv")
df_crates_categories.to_sql(name="crates_categories", con=db.conn, if_exists="replace", index=False)

df_versions = pd.read_csv("crates_io_db/data/versions.csv")
df_versions.to_sql(name="versions", con=db.conn, if_exists="replace", index=False)


[0] Downloading 'https://static.crates.io/db-dump.tar.gz' ...
HTTP response 307  [https://static.crates.io/db-dump.tar.gz]
Adding URL: https://cloudfront-static.crates.io/db-dump.tar.gz
[0] Downloading 'https://cloudfront-static.crates.io/db-dump.tar.gz' ...
Saving 'db-dump.tar.gz'
HTTP response 200  [https://cloudfront-static.crates.io/db-dump.tar.gz]


1960530

### Format the vulnerabilities and merge vulnerability info

In [9]:
df_cve = pd.read_json(VUL_PATH)
df_sfp = pd.read_csv(SFP_PATH)
df_crates = pd.read_sql("SELECT name, repository FROM crates", con=db.conn)

In [10]:
def get_sfps_from_cwes():
    taxonomy = df_sfp["Affected Resources"].apply(lambda x: None if type(x)==float else (x.split("::TAXONOMY NAME:")))
    primary_cluster = {"SFP1":"Risky Values", "SFP2":"Unused Entities", "SFP3":"API", "SFP4":"Exception Management", "SFP5":"Exception Management", "SFP6":"Exception Management", \
                        "SFP7":"Memory Access", "SFP8":"Memory Access", "SFP9":"Memory Access", "SFP10":"Memory Access", "SFP11":"Memory Access", \
                        "SFP12":"Memory Management", "SFP13":"Resource Management", "SFP14":"Resource Management", "SFP15":"Resource Management", \
                        "SFP16":"Path Resolution", "SFP17":"Path Resolution", "SFP18":"Path Resolution", \
                        "SFP19":"Synchronization", "SFP20":"Synchronization", "SFP21":"Synchronization", "SFP22":"Synchronization",\
                        "SFP23":"Information Leak", "SFP24":"Tainted Input", "SFP25":"Tainted Input", "SFP26":"Tainted Input", "SFP27":"Tainted Input", \
                        "SFP28":"Entry Points", "SFP29":"Authentication", "SFP30":"Authentication", "SFP31":"Authentication", "SFP32":"Authentication", "SFP33":"Authentication", "SFP34":"Authentication",\
                        "SFP35":"Access Control", "SFP36":"Privilege", "SFP37":"Faulty Resource Release", "SFP38":"Failure to Release Memory"}
    def getSFP(x):
        if x is None:
            return x
        for t in x:
            if 'Software Fault Patterns' in t:
                sfp = t.split(':')[2]
                return primary_cluster[sfp] if sfp and 'SFP' in sfp else None
        return None
    taxonomy = taxonomy.apply(lambda x: getSFP(x) )
    # add expections
    taxonomy[908] = taxonomy[909] = "Exception Management"
    taxonomy[131] = taxonomy[787] = taxonomy[824]= taxonomy[119]= taxonomy[125] = "Memory Access"
    taxonomy[758] = "API"
    taxonomy[843] = "Risky Values"
    taxonomy[770] = taxonomy[772] = taxonomy[789] = "Resource Management"
    taxonomy[706] = "Path Resolution"
    taxonomy[362] = "Synchronization"
    taxonomy[668] = taxonomy[200] = taxonomy[203]= taxonomy[208]= taxonomy[377] = "Information Leak"
    taxonomy[129] = taxonomy[427] = taxonomy[172]= taxonomy[444]= taxonomy[198] = taxonomy[94] = taxonomy[351] = "Tainted Input"
    taxonomy[295] = "Authentication"
    taxonomy[279] = "Access Control"
    taxonomy[269] = "Privilege"
    taxonomy[327] = taxonomy[1240] = taxonomy[347] = "Cryptography"
    taxonomy[330] = taxonomy[338] = taxonomy[340] = "Predictability"
    taxonomy[657] = taxonomy[670] = taxonomy[682]= taxonomy[697]= taxonomy[188] = taxonomy[193] = taxonomy[835] = "Other"

    return taxonomy

In [11]:
from cvss import CVSS3, CVSS4

def cvss_version_autodetect(v):
    if not v:
        return None
    elif type(v) is str and str.lower(v) in ['low', 'medium', 'high', 'critical', 'moderate']:
        if str.lower(v) == 'moderate':
            return 'Medium'
        else:
            return str.capitalize(v)
    elif v.startswith('CVSS:3'):
        return CVSS3(v).severities()[0]
    elif v.startswith('CVSS:4'):
        return CVSS4(v).severities()[0]
    else:
        raise TypeError()

def get_vul_severity(data):
    temp = data.apply(lambda x:
        (
            x['affected'][0]['database_specific']['cvss']
            if 'cvss' in x['affected'][0]['database_specific']
            else (
                x['database_specific']['severity']
                if 'severity' in x['database_specific']
                else None
            )
        )
        if type(x['severity'])==float
        else x['severity'][0]['score']
        , axis=1)
    return temp.apply(cvss_version_autodetect)
    
def get_vul_package(data):
    return pd.Series(list(d[0]['package']['name'] for d in data['affected']))

def get_vul_repo_url(data):
    return data['package'].apply(lambda x: df_crates['repository'][df_crates['name']==x].values[0] if len(df_crates['repository'][df_crates['name']==x].values)!=0 else None)

def get_vul_sfp_id(data):
    temp = data.apply(lambda x:
        x['database_specific']['cwe_ids']
        if 'cwe_ids' in x.get('database_specific', {})
        else
            x['affected'][0]['database_specific'].get('categories', [])
        , axis=1)
    taxonomy = get_sfps_from_cwes()
    res = ""
    for t in taxonomy[taxonomy=='Failure to Release Memory'].index:
        res = res + "CWE-"+str(t)+", "
    print(res)
    no_cat = 0
    cwe = 0
    category = 0

    for t in temp:
        if len(t)==0:
            no_cat= no_cat+1
        elif 'CWE' in t[0]:
            cwe = cwe+1
        else:
            category = category+1

    def get_sfp_from_cwe(x):
        cat2sfp = {"memory-exposure":"Memory Access", "memory-corruption":"Memory Management", "denial-of-service":"Resource Management", "file-disclosure":"Path Resolution",\
                    "thread-safety":"Synchronization", "format-injection":"Tainted Input", "privilege-escalation":"Privilege", "crypto-failure":"Cryptography", "code-execution":"Other"
                  }
        res = list()
        for cwe in x:
            if "CWE" in cwe:
                id = int(cwe.split('-')[1])
                if id in taxonomy.index and taxonomy[id]:
                    res.append(str(taxonomy[id])) 
            else:
                res.append(str(cat2sfp[cwe]))
        return res
    return temp.apply(lambda x: str(get_sfp_from_cwe(x)))

def get_vul_version(data):
    def get_version(affects):
        versions = list()
        for affect in affects:
            if "ranges" in affect:
                introduced = affect["ranges"][0]["events"][0]["introduced"] if "introduced" in affect["ranges"][0]["events"][0] else None
                fixed = affect["ranges"][0]["events"][1]["fixed"] if len(affect["ranges"][0]["events"])>1 and "fixed" in affect["ranges"][0]["events"][1] else None
                versions.append((introduced, fixed))
        return str(versions)

    return data["affected"].apply(lambda x:get_version(x) )

def get_vul_reference(data):
    references_list = data.pop('references')
    return pd.Series(
        str(list(
            (
                ref['url']
                for ref in d
            )
            if isinstance(d, collections.abc.Iterable)
            else []))
        for d in references_list)


In [12]:
# execute this cell only once per session
ordered_cve_columns = ['id', 'package', 'repo_url', 'sfp_id', 'modified', 'published', 'vul_version', 'summary', 'details', 'severity', 'references']
df_cve["severity"] = get_vul_severity(df_cve)
df_cve['package'] = get_vul_package(df_cve)
df_cve['repo_url'] = get_vul_repo_url(df_cve)
df_cve['sfp_id'] = get_vul_sfp_id(df_cve)
df_cve['vul_version'] = get_vul_version(df_cve)
df_cve['references'] = get_vul_reference(df_cve)
df_cve.drop(['aliases', 'database_specific', 'affected', 'schema_version', 'related'], axis=1, inplace=True)
df_cve = df_cve[ordered_cve_columns]
print(df_cve.head())
df_cve = df_cve.applymap(str)
df_cve.to_sql('cve_dup', db.conn, if_exists='replace', index=False)


                    id        package  \
0    RUSTSEC-2023-0048       intaglio   
1  GHSA-xqxc-x6p3-w683           deno   
2  GHSA-fmj9-77q8-g6c4  apollo-router   
3  GHSA-v3j6-xf77-8r9c     actix-http   
4  GHSA-mmjf-f5jw-w72q    openssl-src   

                                         repo_url                   sfp_id  \
0           https://github.com/artichoke/intaglio                       []   
1                https://github.com/denoland/deno                       []   
2        https://github.com/apollographql/router/  ['Resource Management']   
3              https://github.com/actix/actix-web  ['Resource Management']   
4  https://github.com/alexcrichton/openssl-src-rs                ['Other']   

                          modified             published  \
0 2023-11-08 04:19:27.824697+00:00  2023-07-26T12:00:00Z   
1        2025-07-02 18:29:57+00:00  2025-06-04T21:13:44Z   
2        2024-09-13 13:35:59+00:00  2024-08-27T18:14:12Z   
3 2023-11-08 04:03:38.366013+00:00  2021-08

  df_cve = df_cve.applymap(str)


2020

### Remove duplicate vulnerabilities

In [13]:
import pandas as pd
import ast

df_res = list()
df_master = df_cve
# df_master = pd.read_csv("cve.csv")
packages = df_master.groupby('package').groups
print(len(df_master))
cnt = 0
for key, values in packages.items():
    # find duplicate vulnerability
    df_tmp = df_master.iloc[values]
    # get vulnerability related references
    ref = ast.literal_eval(df_tmp.iloc[0]['references'])
    ref = list(filter(lambda tmp: "nvd" in tmp or "rustsec" in tmp, ref))
    references = [ref]
    vuls = [[0]]
    for i in range(1, len(values)):
        flag = True
        tmp = ast.literal_eval(df_tmp.iloc[i]['references'])
        tmp = list(filter(lambda ref: "nvd" in ref or "rustsec" in ref, tmp))
        for idx, ref in enumerate(references):
            # same vulnerability or not
            if set(ref) & set(tmp):
                references[idx].extend(tmp)
                vuls[idx].append(i)
                flag = False
        if flag:
            vuls.append([i])
            references.append(tmp)

    # merge duplicate data and append it to new dataframe
    for vul_idx in vuls:
        # print(df_tmp.iloc[vul_idx])
        vul = {
            'id': [],
            'package': df_tmp.iloc[vul_idx[0]]['package'],
            'repo_url': df_tmp.iloc[vul_idx[0]]['repo_url'],
            'sfp_id': [],
            'modified': df_tmp.iloc[vul_idx[0]]['modified'],
            'published': df_tmp.iloc[vul_idx[0]]['published'],
            'vul_version': [],
            'summary': "",
            'details': "",
            'severity': "",
            'references': []
        }
        for vvul in vul_idx:
            vul['id'].append(df_tmp.iloc[vvul]['id'])
            vul['published'] = min(vul['published'], df_tmp.iloc[vvul]['published'])
            vul['sfp_id'].extend(ast.literal_eval(df_tmp.iloc[vvul]['sfp_id']))
            vul['references'].extend(ast.literal_eval(df_tmp.iloc[vvul]['references']))
            vul['vul_version'].extend(ast.literal_eval(df_tmp.iloc[vvul]['vul_version']))
            vul['sfp_id'] = list(set(vul['sfp_id']))
            vul['references'] = list(set(vul['references']))
            vul['vul_version'] = list(set(vul['vul_version']))

            vul['summary'] = df_tmp.iloc[vvul]['summary'] + '\n' + vul['summary']
            vul['details'] = df_tmp.iloc[vvul]['details'] + '\n' + vul['details']
            if df_tmp.iloc[vvul]['severity'] != 'nan':
                vul['severity'] = df_tmp.iloc[vvul]['severity']
        
        df_res.append(vul)
        
df_res = pd.DataFrame(df_res)
print(len(df_res))

2020
1236


In [14]:
# remove vulnerabilities that report unmaintained projects
df_res = df_res[~df_res['summary'].str.contains("unmaint", case=False)]
print(len(df_res))

1075


### Dump vulnerabilities to database

In [15]:
df_res = df_res.map(str)
df_res.to_sql(name="cve", con=db.conn, if_exists="replace", index=False)

1075