In [1]:
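# Prerequisite: download the data archive from
#   https://dataverse.harvard.edu/file.xhtml?fileId=5745840&version=2.1
# and unpack it into ../data (the cells below read ../data/rate_ssim_byQ/properties_*.json).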

import pandas as pd
import matplotlib.pyplot as plt

def load_props(path):
    """Load one properties JSON file into a flat, per-record DataFrame.

    The file is read with ``pd.read_json``. Its ``files`` column is expanded
    with ``pd.json_normalize`` into one column per key, the ``quality`` column
    of the raw frame is attached, and a ``ratio`` column is added holding
    ``filesize`` expressed as a percentage of ``rawsize``.

    Parameters
    ----------
    path
        Location of the JSON file; passed straight to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        The normalized ``files`` records followed by the ``quality`` and
        ``ratio`` columns.
    """
    raw = pd.read_json(path)
    props = pd.json_normalize(raw["files"])
    # `quality` is taken from the raw frame and aligned on the row index.
    props = props.assign(quality=raw["quality"])
    # Compressed size as a percentage of the raw size.
    return props.assign(ratio=100 * props["filesize"] / props["rawsize"])

# Translation table from the labels produced by ``describe()`` to the Polish
# column headers used when the statistics tables are rendered
# (applied via ``rename(columns=i18n)``).
i18n = {
    "mean": "średnia",
    "std": "odch. std.",
    "min": "min",
    "25%": "Q1",
    "50%": "mediana",
    "75%": "Q3",
    "max": "max",
}
import sys; sys.path.append("..")
from evaldets.postprocess import save_plot

In [2]:
# Preview the frame built from the properties_010.json file (rich display of the last expression).
load_props("../data/rate_ssim_byQ/properties_010.json")

Unnamed: 0,image,width,height,channels,origchan,filesize,origsize,rawsize,mean_squared_error,peak_signal_noise_ratio,structural_similarity,quality,ratio
0,000000000139.jpg,640,426,3,3,10045,161811,817920,135.776146,26.802569,0.730682,10,1.228115
1,000000000285.jpg,586,640,3,3,24132,335861,1125120,322.996145,23.038830,0.652713,10,2.144838
2,000000000632.jpg,640,483,3,3,15182,155667,927360,314.487165,23.154774,0.667135,10,1.637120
3,000000000724.jpg,375,500,3,3,12768,130107,562500,229.887748,24.515645,0.735628,10,2.269867
4,000000000776.jpg,428,640,3,3,16493,176410,821760,333.949392,22.893997,0.537188,10,2.007034
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,000000581317.jpg,640,354,3,3,10340,116838,679680,374.348279,22.398045,0.669900,10,1.521304
4996,000000581357.jpg,612,612,3,3,11494,139751,1123632,103.976160,27.961466,0.775341,10,1.022933
4997,000000581482.jpg,640,427,1,3,16425,158235,273280,254.763934,24.069424,0.753820,10,6.010319
4998,000000581615.jpg,478,640,3,3,8524,142905,917760,57.457339,30.537349,0.795018,10,0.928783


In [3]:
# Summary statistics of the structural_similarity column for properties_010.json.
# The file is loaded explicitly instead of going through IPython's `_`
# (previous cell output): `_` is hidden kernel state and silently points at
# something else as soon as any other cell is executed in between.
load_props('../data/rate_ssim_byQ/properties_010.json').structural_similarity.describe()

count    5000.000000
mean        0.726269
std         0.071960
min         0.339775
25%         0.683166
50%         0.732002
75%         0.774560
max         0.967155
Name: structural_similarity, dtype: float64

In [4]:
# One properties frame per quality value Q = 5, 15, ..., 95, keyed by Q.
data = dict(
    (q, load_props(f'../data/rate_ssim_byQ/properties_{q:03d}.json'))
    for q in range(5, 100, 10)
)

In [5]:
# Descriptive statistics of structural_similarity, one row per Q.
# `count` is dropped and the remaining describe() labels are translated via i18n.
stats = (
    pd.DataFrame(
        data=[df.structural_similarity.describe() for df in data.values()],
        index=data.keys(),
    )
    .drop(columns='count')
    .rename(columns=i18n)
    .rename_axis('Q')
)
stats

Unnamed: 0_level_0,średnia,odch. std.,min,Q1,mediana,Q3,max
Q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,0.627101,0.08983,0.153293,0.574096,0.630873,0.685953,0.965715
15,0.773825,0.063745,0.359304,0.736047,0.780204,0.817623,0.969382
25,0.826165,0.054568,0.390045,0.794513,0.832005,0.864267,0.97955
35,0.854855,0.048374,0.419113,0.827557,0.859963,0.888305,0.976038
45,0.873986,0.043599,0.443987,0.849793,0.878538,0.903895,0.99096
55,0.886809,0.041301,0.467388,0.864498,0.891524,0.915035,0.99309
65,0.904728,0.035758,0.504513,0.88653,0.90904,0.928686,0.988051
75,0.921144,0.0331,0.576811,0.906112,0.925593,0.942438,0.997151
85,0.956206,0.024949,0.781857,0.943239,0.959744,0.97572,0.998514
95,0.994715,0.002615,0.980783,0.993057,0.994722,0.996941,0.999606


In [6]:
# Emit the SSIM statistics as a LaTeX table with 4 decimal places.
# The Styler route is used because the earlier
# `stats.to_latex(float_format='%.4f')` call raised a FutureWarning
# recommending `DataFrame.style.to_latex` instead.
ssim_styler = stats.style.format(precision=4)
print(ssim_styler.to_latex(hrules=True))

\begin{tabular}{lrrrrrrr}
\toprule
 & średnia & odch. std. & min & Q1 & mediana & Q3 & max \\
Q &  &  &  &  &  &  &  \\
\midrule
5 & 0.6271 & 0.0898 & 0.1533 & 0.5741 & 0.6309 & 0.6860 & 0.9657 \\
15 & 0.7738 & 0.0637 & 0.3593 & 0.7360 & 0.7802 & 0.8176 & 0.9694 \\
25 & 0.8262 & 0.0546 & 0.3900 & 0.7945 & 0.8320 & 0.8643 & 0.9795 \\
35 & 0.8549 & 0.0484 & 0.4191 & 0.8276 & 0.8600 & 0.8883 & 0.9760 \\
45 & 0.8740 & 0.0436 & 0.4440 & 0.8498 & 0.8785 & 0.9039 & 0.9910 \\
55 & 0.8868 & 0.0413 & 0.4674 & 0.8645 & 0.8915 & 0.9150 & 0.9931 \\
65 & 0.9047 & 0.0358 & 0.5045 & 0.8865 & 0.9090 & 0.9287 & 0.9881 \\
75 & 0.9211 & 0.0331 & 0.5768 & 0.9061 & 0.9256 & 0.9424 & 0.9972 \\
85 & 0.9562 & 0.0249 & 0.7819 & 0.9432 & 0.9597 & 0.9757 & 0.9985 \\
95 & 0.9947 & 0.0026 & 0.9808 & 0.9931 & 0.9947 & 0.9969 & 0.9996 \\
\bottomrule
\end{tabular}



In [7]:
# Descriptive statistics of the ratio column (filesize as % of rawsize), one row per Q.
# `count` is dropped and the remaining describe() labels are translated via i18n.
rstats = (
    pd.DataFrame(
        data=[df.ratio.describe() for df in data.values()],
        index=data.keys(),
    )
    .drop(columns='count')
    .rename(columns=i18n)
    .rename_axis('Q')
)
rstats

Unnamed: 0_level_0,średnia,odch. std.,min,Q1,mediana,Q3,max
Q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,1.203417,1.178461,0.435175,0.799469,1.023688,1.322551,68.804157
15,2.321699,1.47889,0.476106,1.615865,2.093288,2.701794,69.592725
25,3.273402,1.815456,0.514844,2.292189,2.983236,3.850747,70.247243
35,4.12399,2.143494,0.571733,2.897771,3.772111,4.894388,70.835163
45,4.909963,2.447703,0.648297,3.475534,4.514777,5.837077,71.370877
55,5.545185,2.694824,0.694038,3.942193,5.10076,6.554715,71.863656
65,6.712862,3.122354,0.824429,4.837085,6.199407,7.971625,72.523175
75,7.962859,3.587871,0.967994,5.752586,7.37074,9.420905,73.549717
85,11.996273,4.859419,1.870226,8.948176,11.234212,14.177889,76.022761
95,20.46775,7.640941,3.038032,15.346671,19.497766,24.12493,83.827942


In [8]:
# Emit the ratio statistics as a LaTeX table with 2 decimal places.
# Styler-based replacement for the earlier `rstats.to_latex(float_format='%.2f')` call.
ratio_styler = rstats.style.format(precision=2)
print(ratio_styler.to_latex(hrules=True))

\begin{tabular}{lrrrrrrr}
\toprule
 & średnia & odch. std. & min & Q1 & mediana & Q3 & max \\
Q &  &  &  &  &  &  &  \\
\midrule
5 & 1.20 & 1.18 & 0.44 & 0.80 & 1.02 & 1.32 & 68.80 \\
15 & 2.32 & 1.48 & 0.48 & 1.62 & 2.09 & 2.70 & 69.59 \\
25 & 3.27 & 1.82 & 0.51 & 2.29 & 2.98 & 3.85 & 70.25 \\
35 & 4.12 & 2.14 & 0.57 & 2.90 & 3.77 & 4.89 & 70.84 \\
45 & 4.91 & 2.45 & 0.65 & 3.48 & 4.51 & 5.84 & 71.37 \\
55 & 5.55 & 2.69 & 0.69 & 3.94 & 5.10 & 6.55 & 71.86 \\
65 & 6.71 & 3.12 & 0.82 & 4.84 & 6.20 & 7.97 & 72.52 \\
75 & 7.96 & 3.59 & 0.97 & 5.75 & 7.37 & 9.42 & 73.55 \\
85 & 12.00 & 4.86 & 1.87 & 8.95 & 11.23 & 14.18 & 76.02 \\
95 & 20.47 & 7.64 & 3.04 & 15.35 & 19.50 & 24.12 & 83.83 \\
\bottomrule
\end{tabular}



In [9]:
# Stack the per-Q frames into a single frame; `ignore_index` is left at its
# default, so every frame keeps its own row labels.
bigdf = pd.concat(list(data.values()))

In [10]:
# Scatter plot of compressed-size ratio (%) against SSIM for all records in bigdf.
plt.rcParams["figure.figsize"] = (10, 10)
ax = bigdf.plot.scatter(
    x="ratio",
    y="structural_similarity",
    xlim=(0, 100),
    ylim=(0, 1),
    marker='.',
)
# Axis labels are user-facing text and stay in Polish.
ax.set_xlabel("stosunek rozmiaru skompresowanego do surowego (%)")
ax.set_ylabel("indeks podobieństwa strukturalnego")
# The axes returned by pandas belong to the figure it just created (the current figure).
save_plot(ax.figure, "11-rate-ssim-tables-size-ssim-scatter", p=0, c=1)

Saved 11-rate-ssim-tables-size-ssim-scatter.png
