In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start the notebook's Spark session, or reuse one that is already running
# (`getOrCreate` returns the existing session when there is one).
# NOTE(review): the app name mentions JSON, but this notebook loads a Parquet
# file; the name is left untouched here.
spark = (
    SparkSession.builder
    .appName("Read JSON Files")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the cached vulnerability table as a Spark DataFrame.
df = spark.read.format("parquet").load("../.viper_cache/vulnerabilities.parquet")

# pandas-on-Spark view of the same data, used here only to preview the first
# rows with a pandas-style `head()`.
pdf = df.pandas_api()
pdf.head()

In [None]:
# Spot check: fetch one advisory row for a single (package, version) pair.
package = "transformers"
version = "4.37.2"

(
    df.where(F.col("package_name") == package)
    .where(F.col("versions") == version)
    .select("severity_score", "severity", "summary")
    .take(1)
)

In [None]:
# Spot check: fetch one advisory row for a package, regardless of version.
package = "gradio"

(
    df.where(F.col("package_name") == package)
    .select("severity_score", "severity", "summary")
    .take(1)
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType


# Pinned requirements, one `name==version` pin per line (pip-freeze layout).
# Splitting on whitespace turns the block into a list of pin strings.
packages = """
anyio==4.4.0
asttokens==2.4.1
certifi==2024.7.4
charset-normalizer==3.3.2
comm==0.2.2
coverage==7.5.4
cvss==3.1
debugpy==1.8.2
decorator==5.1.1
duckdb==1.0.0
einops==0.8.0
executing==2.0.1
filelock==3.15.4
flash-attn==2.5.9.post1
fsspec==2024.6.1
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.23.4
idna==3.7
iniconfig==2.0.0
ipykernel==6.29.5
ipython==8.26.0
jedi==0.19.1
Jinja2==3.1.4
jupyter_client==8.6.2
jupyter_core==5.7.2
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
mpmath==1.3.0
nest-asyncio==1.6.0
networkx==3.3
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.5.82
nvidia-nvtx-cu12==12.1.105
packaging==24.1
pandas==2.2.2
parso==0.8.4
pexpect==4.9.0
platformdirs==4.2.2
pluggy==1.5.0
prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.2
py4j==0.10.9.7
pyarrow==16.1.0
Pygments==2.18.0
pyspark==3.5.1
pytest==8.2.2
pytest-cov==5.0.0
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
pyzmq==26.0.3
regex==2024.5.15
requests==2.32.3
ruff==0.5.0
safetensors==0.4.3
six==1.16.0
sniffio==1.3.1
stack-data==0.6.3
sympy==1.12.1
tokenizers==0.15.2
torch==2.3.1
tornado==6.4.1
tqdm==4.66.4
traitlets==5.14.3
transformers==4.36.0
triton==2.3.1
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
""".split()


# Break every pin into a (package_name, version) pair; each pin is split once
# and the first two pieces are kept.
packages_tuples = [
    (parts[0], parts[1])
    for parts in (pin.split("==") for pin in packages)
]

# Two nullable string columns, named to line up with the columns used as join
# keys against the vulnerability table.
schema = (
    StructType()
    .add("package_name", StringType(), True)
    .add("versions", StringType(), True)
)

# Materialise the (package_name, version) pairs as a Spark DataFrame.
packages_df = spark.createDataFrame(data=packages_tuples, schema=schema)

# Preview the first rows.
packages_df.show()


In [None]:
# Keep only the advisories that match one of the pinned (package, version)
# pairs from `packages_df`.
#
# `packages_df` is built from a short in-memory list, so it is wrapped in an
# explicit broadcast hint: a DataFrame created from local Python data may not
# carry size statistics the planner can use to auto-broadcast it, and without
# the hint the (much larger) vulnerability table can end up being shuffled.
# The hint changes only the physical plan, not the joined rows.
#
# NOTE(review): the match is exact string equality on both keys. This assumes
# the vulnerability table stores one version string per row and spells package
# names the same way as the pins (case, `-` vs `_`) — confirm against the data.
joined_df = df.join(
    F.broadcast(packages_df),
    on=["package_name", "versions"],
    how="inner",
)

# Show the result
joined_df.show()
