# Measure Code Duplication

In [1]:
from os import chdir
from pathlib import Path
from time import monotonic, sleep

import docker
import git
import pandas as pd
from requests import RequestException, get, post

from benchmark.environment import IMPROVEMENTS, STATES
from benchmark.process import run


# Connection settings used for every request to the SonarQube instance.
SONAR_PASSWORD = "password"
SONAR_PORT = 9999

# Each path below is only derived when its name is not defined yet, so
# re-running this cell (or presetting a path) keeps the existing value.
if "PROJECT_DIR" not in globals():
    # The project root is the parent of the current working directory; it
    # also becomes the new working directory.
    PROJECT_DIR = Path(".").resolve().parent
    chdir(PROJECT_DIR)

if "OUT_DIR" not in globals():
    # Directory that receives the generated CSV files.
    OUT_DIR = PROJECT_DIR.joinpath("code-duplication", "out")
    OUT_DIR.mkdir(parents=True, exist_ok=True)

if "REPO_DIR" not in globals():
    # Location of the fda-services checkout.
    REPO_DIR = PROJECT_DIR.joinpath("fda-services")

To analyze the code quality of DBRepo, we use SonarQube running in a Docker container. The following cells create a Docker client and start a SonarQube instance:

In [2]:
# Docker client configured from the environment; used below to start the
# SonarQube server and the scanner containers.
docker_client = docker.from_env()

In [3]:
# Launch SonarQube in the background from a digest-pinned image and publish
# the container's port 9000 on the host port SONAR_PORT.
sonarqube_container = docker_client.containers.run(
    image="sonarqube@sha256:72e9feec71242af83faf65f95a40d5e3bb2822a6c3b2cda8568790f3d31aecde",
    name="sonarqube",
    ports={"9000": SONAR_PORT},
    environment={"SONAR_ES_BOOTSTRAP_CHECKS_DISABLE": "true"},
    detach=True,
)

# Show the container handle as the cell output.
sonarqube_container

<Container: 74e8a5c9e831>

To analyze DBRepo's code, we clone the repository and initialize an instance of the repo.

In [4]:
REPO_URL = "https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services.git"

# Re-use an existing checkout; otherwise clone explicitly into REPO_DIR.
# Passing the target path (instead of letting git derive the directory name
# from the URL) keeps the clone location, the existence check and the opened
# repository in sync even when REPO_DIR has been preset to another location.
if REPO_DIR.exists():
    repo = git.Repo(REPO_DIR)
else:
    repo = git.Repo.clone_from(REPO_URL, REPO_DIR)

Once the container has started, the repository has been cloned, and the web interface is accessible at [http://localhost:9999](http://localhost:9999), we first have to change the password of the default admin user.

In [5]:
# SonarQube needs a while to boot after its container has been started, and
# requests sent before that fail. Poll the status endpoint until the server
# reports "UP" so that this cell also works under "Restart & Run All".
SONAR_STARTUP_TIMEOUT = 600  # seconds to wait for SonarQube to come up
SONAR_POLL_INTERVAL = 5  # seconds between two status checks

status_url = f"http://localhost:{SONAR_PORT}/api/system/status"
deadline = monotonic() + SONAR_STARTUP_TIMEOUT
while True:
    try:
        status = get(status_url, timeout=10).json().get("status")
    except (RequestException, ValueError, AttributeError):
        # Connection refused or a non-JSON answer: the server is not ready yet.
        status = None

    if status == "UP":
        break
    if monotonic() >= deadline:
        raise TimeoutError(
            f"SonarQube did not report status UP within {SONAR_STARTUP_TIMEOUT} "
            f"seconds (last status: {status!r})"
        )
    sleep(SONAR_POLL_INTERVAL)

# Replace the default admin password ("admin") with SONAR_PASSWORD.
response = post(
    f"http://localhost:{SONAR_PORT}/api/users/change_password",
    auth=("admin", "admin"),
    data={"login": "admin", "password": SONAR_PASSWORD, "previousPassword": "admin"},
)
response.raise_for_status()

Next, we generate a global analysis token for the admin user and store it in the variable `sonar_token`, so that the scanner can authenticate when analyzing the code quality of DBRepo.

In [6]:
# Form fields describing the token to be generated for the admin user.
token_request = {
    "login": "admin",
    "name": "Global Analysis Token",
    "type": "GLOBAL_ANALYSIS_TOKEN",
}
token_url = f"http://localhost:{SONAR_PORT}/api/user_tokens/generate"

response = post(token_url, data=token_request, auth=("admin", SONAR_PASSWORD))
response.raise_for_status()

# The token is read from the "token" field of the JSON response; the
# scanner runs below pass it as sonar.token.
sonar_token = response.json()["token"]

Then, we create a branch for the state of the repository before and after each improvement, based on the commit hashes specified in [STATES.md](../STATES.md).

In [7]:
# Branch name ("<improvement>_<state>") -> commit the branch points to.
BRANCH_COMMITS = {
    "environment-independence_before": "8bb23619997f1d6f85d85718eb5eb018f68bd80f",
    "environment-independence_after": "7cf0c76094c285a02ad3341685969733d6836164",
    "service-merge_before": "82cd375098246e38cf1da9ad34ee981a637433b7",
    "service-merge_after": "683d2096bf20fe2c9703aab181c932e17550c3e7",
}

# `git branch <name> <commit>` fails when the branch already exists, which
# happens whenever this cell is re-run or an existing checkout is re-used.
# Existing branches are therefore skipped and left as they are.
existing_branches = {head.name for head in repo.heads}
for branch_name, commit_hash in BRANCH_COMMITS.items():
    if branch_name not in existing_branches:
        repo.git.branch(branch_name, commit_hash)

''

Now we're ready to start the analysis. The following cell analyzes the code quality of DBRepo before and after each improvement. We create a separate project for each combination of improvement and state, because the SonarQube Community Edition does not support analyzing multiple branches of a single project.

In [None]:
# Values shared by every analysis run.
sonar_url = f"http://localhost:{SONAR_PORT}"
scanner_image = "sonarsource/sonar-scanner-cli@sha256:605ea9a44a12ec328ad59a9b98c740cf0672a467b57ba2ae63cb85cc5831287e"

for improvement in IMPROVEMENTS:
    for state in STATES:
        # One SonarQube project per improvement/state pair; the key doubles
        # as the name of the git branch to analyze.
        project_key = f"{improvement}_{state}"

        create_response = post(
            f"{sonar_url}/api/projects/create",
            auth=("admin", SONAR_PASSWORD),
            data={
                "creationMode": "manual",
                "monorepo": "false",
                "project": project_key,
                "name": project_key,
                "mainBranch": "main",
            },
        )
        create_response.raise_for_status()

        repo.git.checkout(project_key)

        # Drop build output of the previously analyzed branch, then rebuild.
        run("rm -rf ./*/*/target/", cwd=REPO_DIR, shell=True)
        # Activating pyenv is necessary in the author's environment; if it is
        # not needed in yours, run plain `make build-backend` instead.
        run(
            'eval "$(pyenv init -)" && make build-backend', cwd=REPO_DIR, shell=True
        )

        # Compiled class directories, relative to the repository root, which
        # is mounted at /usr/src inside the scanner container.
        class_dirs = [
            str(classes_dir.relative_to(REPO_DIR))
            for classes_dir in REPO_DIR.glob("*/*/target/classes/")
        ]
        scanner_args = [
            f"-Dsonar.projectKey={project_key}",
            f"-Dsonar.projectName={project_key}",
            f"-Dsonar.host.url={sonar_url}",
            f"-Dsonar.token={sonar_token}",
            "-Dsonar.java.binaries=" + ",".join(class_dirs),
            "-Dsonar.java.libraries=**/*.jar",
        ]

        # Host networking lets the scanner reach SonarQube via localhost.
        docker_client.containers.run(
            scanner_image,
            command=scanner_args,
            volumes={REPO_DIR: {"bind": "/usr/src"}},
            network_mode="host",
            remove=True,
        )

After the analysis has finished, we can extract the code duplication metrics from the projects and store them in CSV files.

In [15]:
# SonarQube metric key -> column name used in the CSV files.
METRIC_COLUMNS = {
    "ncloc": "lines",
    "duplicated_lines": "duplicated",
    "duplicated_lines_density": "duplicated-density",
}
OVERALL_COLUMNS = ["improvement", "state", *METRIC_COLUMNS.values()]
SERVICE_COLUMNS = ["improvement", "state", "service", *METRIC_COLUMNS.values()]

overall_data = []

for improvement in IMPROVEMENTS:
    for state in STATES:
        key = f"{improvement}_{state}"

        response = get(
            f"http://localhost:{SONAR_PORT}/api/measures/component_tree",
            auth=("admin", SONAR_PASSWORD),
            params={
                "component": key,
                "s": "qualifier,name",
                "metricKeys": ",".join(METRIC_COLUMNS),
                "strategy": "children",
            },
        )
        response.raise_for_status()
        component_tree = response.json()

        # Project-wide totals are reported on the base component. A missing
        # metric raises a KeyError here instead of silently writing a gap.
        overall_metrics = {
            measure["metric"]: measure["value"]
            for measure in component_tree["baseComponent"]["measures"]
        }
        overall_data.append(
            {
                "improvement": improvement,
                "state": state,
                **{
                    column: overall_metrics[metric]
                    for metric, column in METRIC_COLUMNS.items()
                },
            }
        )

        # One row per direct child with qualifier "DIR"; children with any
        # other qualifier are skipped.
        services = [
            {
                "improvement": improvement,
                "state": state,
                "service": component["name"],
                **{
                    METRIC_COLUMNS[measure["metric"]]: measure["value"]
                    for measure in component["measures"]
                    if measure["metric"] in METRIC_COLUMNS
                },
            }
            for component in component_tree["components"]
            if component["qualifier"] == "DIR"
        ]

        # Passing the columns explicitly fixes their order and keeps the
        # frame well-formed when there are no rows or a metric is missing
        # for every component (previously a KeyError on column selection).
        service_dataframe = pd.DataFrame(services, columns=SERVICE_COLUMNS)
        service_dataframe.to_csv(
            OUT_DIR / f"service-duplication_{improvement}_{state}.csv", index=False
        )

overall = pd.DataFrame(overall_data, columns=OVERALL_COLUMNS)
overall.to_csv(OUT_DIR / "overall-duplication.csv", index=False)

# Results

We can export the results of the code duplication analysis, previously stored in CSV files, as LaTeX tables to include them in the bachelor's thesis.

In [None]:
# Load the overall metrics and turn headers and labels into display text:
# dashes become spaces and every word is capitalized.
overall_table = pd.read_csv(OUT_DIR / "overall-duplication.csv").rename(
    columns=lambda column: column.replace("-", " ").title()
)
overall_table = overall_table.assign(
    Improvement=overall_table["Improvement"].str.replace("-", " ").str.title(),
    State=overall_table["State"].str.title(),
).set_index(["Improvement", "State"])

# Printed (rather than displayed) so the LaTeX source can be copied verbatim.
print(overall_table.to_latex())

\begin{tabular}{llrrr}
\toprule
 &  & Lines & Duplicated & Duplicated Density \\
Improvement & State &  &  &  \\
\midrule
\multirow[t]{2}{*}{Environment Independence} & Before & 60417 & 3977 & 5.300000 \\
 & After & 59232 & 4300 & 5.900000 \\
\cline{1-5}
\multirow[t]{2}{*}{Service Merge} & Before & 71913 & 6135 & 7.000000 \\
 & After & 57515 & 2245 & 3.200000 \\
\cline{1-5}
\bottomrule
\end{tabular}



In [None]:
# Print one LaTeX table per improvement/state pair from the per-service CSVs.
for improvement in IMPROVEMENTS:
    for state in STATES:
        csv_path = OUT_DIR / f"service-duplication_{improvement}_{state}.csv"
        service_table = pd.read_csv(csv_path)
        service_table.columns = [
            column.replace("-", " ").title() for column in service_table.columns
        ]
        service_table = service_table.drop(columns=["Improvement", "State"])

        # Turn the component names into readable service names, step by step.
        names = service_table["Service"].str.replace("dbrepo-", "")
        names = names.str.replace("-", " ")
        names = names.str.replace("db", "database")
        names = names.str.title()
        # Whole-value replacement (not a substring one): only a name that is
        # exactly "Ui" is changed.
        names = names.replace("Ui", "UI Service")
        service_table["Service"] = names

        print(service_table.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Service & Lines & Duplicated & Duplicated Density \\
\midrule
Analyse Service & 550 & 0 & 0.000000 \\
Authentication Service & 30 & 0 & 0.000000 \\
Broker Service & 14 & 0 & 0.000000 \\
Container Service & 3710 & 269 & 5.600000 \\
Database Service & 5810 & 504 & 6.600000 \\
Identifier Service & 4517 & 389 & 6.700000 \\
Metadata Database & 11212 & 604 & 3.900000 \\
Metadata Service & 1513 & 178 & 9.600000 \\
Query Service & 9527 & 872 & 7.200000 \\
Semantics Service & 3279 & 299 & 7.300000 \\
Table Service & 4530 & 547 & 9.500000 \\
UI Service & 12343 & 141 & 1.100000 \\
User Service & 3382 & 174 & 4.000000 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrr}
\toprule
Service & Lines & Duplicated & Duplicated Density \\
\midrule
Analyse Service & 550 & 0 & 0.000000 \\
Authentication Service & 30 & 0 & 0.000000 \\
Broker Service & 14 & 0 & 0.000000 \\
Container Service & 2964 & 269 & 7.000000 \\
Database Service & 5728 & 689 & 9.200000 \\
Identifier Service 