## Read Repository

In [7]:
from pathlib import Path
import pandas as pd
from datetime import datetime
import time

# verify with shell: find . | grep -E 'jar$' | wc -l
def list_artifact(starting_directory : str, glob="*") -> list[dict]:
    """Recursively collect artifact files below a Maven-style repository root.

    The path of every matching file, taken relative to ``starting_directory``,
    is read as ``<group dirs...>/<artifact>/<version>/<file>``.

    Files that are skipped:
      * any whose path contains ``-sources.jar``;
      * any whose path contains ``SNAPSHOT`` but does not end in
        ``-SNAPSHOT.jar``;
      * any with fewer than three relative path components, because they
        cannot be split into artifact / version / file.

    Args:
        starting_directory: Root directory to scan. A trailing slash is
            optional and the separator style does not matter.
        glob: Pattern handed to ``Path.rglob`` (e.g. ``"*.jar"``).

    Returns:
        One dict per kept file with the keys ``group``, ``artifact``,
        ``version``, ``size`` (bytes), ``ctime`` (formatted local time) and
        ``jar`` (file name).
    """
    root = Path(starting_directory)
    poms = []
    for file_path in root.rglob(glob):
        if not file_path.is_file():
            continue
        p = file_path.as_posix()
        if '-sources.jar' in p:  # skip source jars
            continue
        if 'SNAPSHOT' in p and not p.endswith('-SNAPSHOT.jar'):
            # inside a SNAPSHOT path keep only the plain "-SNAPSHOT.jar" file
            continue
        # Split relative to the root with pathlib instead of stripping a string
        # prefix: this stays correct without a trailing slash, with backslashes
        # or with a relative root such as "./repo".
        parts = file_path.relative_to(root).parts
        if len(parts) < 3:
            # not enough components for <artifact>/<version>/<file>
            continue
        stat = file_path.stat()
        poms.append({
            'group': '.'.join(parts[:-3]),
            'artifact': parts[-3],
            'version': parts[-2],
            'size': stat.st_size,  # bytes
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata change on Unix, so values are platform dependent.
            'ctime': datetime.fromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S.%f'),
            'jar': parts[-1]
        })
    return poms

# Default local Maven repository (~/.m2/repository) of the current user,
# resolved at run time so the notebook is not tied to one machine's profile
# directory. The value keeps the shape of the literal it replaces: forward
# slashes and a trailing "/".
artifact_home = (Path.home() / ".m2" / "repository").as_posix() + "/"

# Scan the repository for jar files and report how many were kept.
artifacts = list_artifact(artifact_home, "*.jar")
print(len(artifacts))

# Tabular view of the scan result, oldest files first.
s = pd.json_normalize(artifacts)
s.sort_values(by=['ctime'], ascending=True)

2592


Unnamed: 0,group,artifact,version,size,ctime,jar
1334,org.apache.maven.plugins,maven-clean-plugin,3.2.0,35678,2024-03-07 12:43:26.969265,maven-clean-plugin-3.2.0.jar
1416,org.apache.maven.shared,maven-shared-utils,3.3.4,153143,2024-03-07 12:43:36.989123,maven-shared-utils-3.3.4.jar
500,commons-io,commons-io,2.6,214788,2024-03-07 12:43:42.467580,commons-io-2.6.jar
1355,org.apache.maven.plugins,maven-resources-plugin,3.3.0,31760,2024-03-07 12:44:03.833816,maven-resources-plugin-3.3.0.jar
1336,org.apache.maven.plugins,maven-compiler-plugin,3.10.1,61899,2024-03-07 12:44:05.852205,maven-compiler-plugin-3.10.1.jar
...,...,...,...,...,...,...
2163,org.junit.platform,junit-platform-launcher,1.3.1,94645,2024-08-06 09:08:39.365935,junit-platform-launcher-1.3.1.jar
1445,org.apache.maven.surefire,surefire-junit-platform,2.22.2,65915,2024-08-06 09:08:39.866519,surefire-junit-platform-2.22.2.jar
1611,org.apiguardian,apiguardian-api,1.0.0,2164,2024-08-06 09:08:40.663393,apiguardian-api-1.0.0.jar
2234,org.opentest4j,opentest4j,1.1.1,7121,2024-08-06 09:08:49.388928,opentest4j-1.1.1.jar


In [3]:
# https://pypi.org/project/ipython-sql/

# install:
# !pip install ipython-sql

In [4]:
# Register the %sql / %%sql magics (ipython-sql extension) in this kernel.
%load_ext sql
# %reload_ext sql

In [5]:
# Open the SQLite database used by the %%sql cells below.
# NOTE(review): data/maven.db looks like a path relative to the kernel's
# working directory; presumably the data/ folder must already exist - confirm.
%sql sqlite:///data/maven.db

In [8]:
def _sql_literal(text) -> str:
    """Return ``text`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled (the standard SQL escape), so a file or
    directory name containing ``'`` can neither break the generated INSERT nor
    inject additional SQL.
    """
    return "'" + str(text).replace("'", "''") + "'"


# One "(group, artifact, version, jar, size, ctime)" tuple per scanned artifact,
# rendered as SQL text for the VALUES clause of the INSERT cell below.
values = [
    "({group}, {artifact}, {version}, {jar}, {size}, {ctime})".format(
        group=_sql_literal(entry['group']),
        artifact=_sql_literal(entry['artifact']),
        version=_sql_literal(entry['version']),
        jar=_sql_literal(entry['jar']),
        size=int(entry['size']),  # numeric column: emitted unquoted
        ctime=_sql_literal(entry['ctime'])
    )
    for entry in artifacts
]
# NOTE: an empty scan yields an empty string, which makes the INSERT invalid.
sql_values = ",".join(values)

In [13]:
%%sql
-- Storage for the scan result, one row per jar file.
CREATE TABLE IF NOT EXISTS `artifacts` (
    `group` varchar(128) NOT NULL,
    `artifact` varchar(128) NOT NULL,
    `version` varchar(128) NOT NULL,
    `jar` varchar(128)  NOT NULL,
    `size` INT NOT NULL,
    `ctime` DATETIME NOT NULL);
-- Remove rows left by a previous run so that re-running this cell does not duplicate data.
DELETE FROM `artifacts`;
-- The placeholder after VALUES is filled in from the Python variable sql_values built in the previous cell.
INSERT INTO `artifacts`(`group`,`artifact`,`version`,`jar`, `size`,`ctime`)
VALUES
{sql_values}
;

 * sqlite:///data/maven.db
Done.
2592 rows affected.
2592 rows affected.


[]

In [15]:
%%sql
-- Coordinates (group, artifact, version) that have more than one jar file stored, most files first.
SELECT `group`, artifact, `version`, COUNT(*) AS c
FROM artifacts
GROUP BY `group`, artifact, `version`
HAVING c > 1
ORDER BY c DESC;

 * sqlite:///data/maven.db
Done.


group,artifact,version,c
org.bytedeco,javacpp,1.5.6,16
org.bytedeco,artoolkitplus,2.3.1-1.5.9,13
org.bytedeco,ffmpeg,4.4-1.5.6,13
org.bytedeco.javacpp-presets,ffmpeg,4.0.2-1.4.3,12
org.bytedeco,javacpp,1.5.10,11
org.bytedeco,openblas,0.3.26-1.5.10,11
org.bytedeco,opencv,4.9.0-1.5.10,11
org.bytedeco,ffmpeg,6.1.1-1.5.10,9
org.bytedeco,leptonica,1.84.1-1.5.10,9
org.bytedeco,libdc1394,2.2.6-1.5.9,9


In [19]:
%%sql
-- All stored jars of org.bytedeco javacpp across every version, largest file first.
SELECT * FROM artifacts
WHERE `group` = 'org.bytedeco' AND artifact = 'javacpp'
ORDER BY size DESC;

 * sqlite:///data/maven.db
Done.


group,artifact,version,jar,size,ctime
org.bytedeco,javacpp,1.5.10,javacpp-1.5.10-windows-x86_64.jar,1406379,2024-07-25 18:33:21.958724
org.bytedeco,javacpp,1.5.6,javacpp-1.5.6-windows-x86_64.jar,1305252,2024-08-05 16:42:03.265707
org.bytedeco,javacpp,1.5.6,javacpp-1.5.6-windows-x86.jar,1287718,2024-08-05 16:42:08.945045
org.bytedeco,javacpp,1.5.10,javacpp-1.5.10.jar,506608,2024-07-25 18:29:23.835263
org.bytedeco,javacpp,1.5.9,javacpp-1.5.9.jar,501606,2024-07-25 18:29:22.545120
org.bytedeco,javacpp,1.5.6,javacpp-1.5.6.jar,487566,2024-08-05 16:42:03.689718
org.bytedeco,javacpp,1.4.3,javacpp-1.4.3.jar,372427,2024-08-05 16:41:52.531631
org.bytedeco,javacpp,1.5.10,javacpp-1.5.10-android-x86_64.jar,97254,2024-07-25 18:33:41.640944
org.bytedeco,javacpp,1.5.6,javacpp-1.5.6-android-x86.jar,92111,2024-08-05 16:42:04.723573
org.bytedeco,javacpp,1.5.6,javacpp-1.5.6-android-x86_64.jar,91983,2024-08-05 16:42:05.131763
