## Testing fragments from different editions

In [92]:
import logging
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level="INFO")
logger = logging.getLogger("ruzicka")

from ruzicka.BDIVerifier import BDIVerifier

In [119]:
# Load the preprocessed corpus: one row per text chunk, with the metadata
# columns first and one column per word after them.
corpus_csv = '04_tests/ed1780_2k_200mfw_rfreq.csv'  # rel freq
corpus = pd.read_csv(corpus_csv)

corpus

Unnamed: 0,work,author,chunk_num,tag,de,des,et,la,les,vous,...,voit,rendre,beaucoup,seul,ici,mieux,donner,enfin,voir,cependant
0,Avis,Baudeau,0,0__Baudeau_Avis,3.45,2.20,2.50,1.40,2.35,0.65,...,0.05,0.00,0.05,0.05,0.05,0.00,0.05,0.20,0.00,0.00
1,Eclaircissemens,Baudeau,0,0__Baudeau_Eclaircissemens,4.40,2.85,2.80,2.60,2.40,1.65,...,0.05,0.00,0.00,0.05,0.00,0.00,0.05,0.00,0.00,0.00
2,Explication,Baudeau,0,0__Baudeau_Explication,4.90,2.60,3.80,3.10,4.65,0.95,...,0.00,0.00,0.05,0.10,0.10,0.00,0.05,0.05,0.05,0.00
3,Idees sur l-administration,Baudeau,0,0__Baudeau_Idees sur l-administration,5.60,1.90,2.20,3.65,1.85,0.10,...,0.00,0.00,0.10,0.10,0.00,0.05,0.05,0.10,0.00,0.05
4,Idees sur la puissance,Baudeau,0,0__Baudeau_Idees sur la puissance,4.90,1.85,2.50,2.35,2.75,0.10,...,0.00,0.10,0.05,0.00,0.00,0.10,0.00,0.10,0.05,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4469,elements T3,Marmontel,96,96__Marmontel_elements T3,4.15,1.20,3.35,3.00,1.15,0.00,...,0.05,0.10,0.00,0.25,0.00,0.05,0.05,0.05,0.00,0.00
4470,Histoire Generale T19,Deleyre,97,97__Deleyre_Histoire Generale T19,5.25,1.85,0.75,3.75,2.65,0.00,...,0.00,0.05,0.05,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4471,elements T3,Marmontel,97,97__Marmontel_elements T3,4.60,0.95,3.10,3.40,1.55,0.00,...,0.00,0.10,0.00,0.00,0.00,0.00,0.05,0.00,0.00,0.00
4472,Histoire Generale T19,Deleyre,98,98__Deleyre_Histoire Generale T19,4.90,2.95,0.75,3.25,3.20,0.05,...,0.05,0.00,0.20,0.00,0.00,0.05,0.00,0.00,0.05,0.05


In [120]:
# The first four columns (work, author, chunk_num, tag) describe each chunk;
# every column after them holds the frequency of one word.
N_META_COLS = 4

raw_df = corpus.iloc[:, :N_META_COLS]  # metadata only
X = corpus.iloc[:, N_META_COLS:]  # word columns only

raw_df

Unnamed: 0,work,author,chunk_num,tag
0,Avis,Baudeau,0,0__Baudeau_Avis
1,Eclaircissemens,Baudeau,0,0__Baudeau_Eclaircissemens
2,Explication,Baudeau,0,0__Baudeau_Explication
3,Idees sur l-administration,Baudeau,0,0__Baudeau_Idees sur l-administration
4,Idees sur la puissance,Baudeau,0,0__Baudeau_Idees sur la puissance
...,...,...,...,...
4469,elements T3,Marmontel,96,96__Marmontel_elements T3
4470,Histoire Generale T19,Deleyre,97,97__Deleyre_Histoire Generale T19
4471,elements T3,Marmontel,97,97__Marmontel_elements T3
4472,Histoire Generale T19,Deleyre,98,98__Deleyre_Histoire Generale T19


In [121]:
# number of rows and columns in X (the word-frequency subset)
X.shape

(4474, 200)

In [122]:
# create numeric author labels
# factorize() returns one integer code per row plus the index of unique
# author names; a code is the position of its author in label_uniques.
labels, label_uniques = raw_df["author"].factorize()

# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when re-run. Dropping any previous "author_label" first (a
# no-op on the first run) returns a fresh frame and keeps the cell idempotent.
raw_df = raw_df.drop(columns="author_label", errors="ignore")
raw_df.insert(1, "author_label", labels)
raw_df

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Avis,0,Baudeau,0,0__Baudeau_Avis
1,Eclaircissemens,0,Baudeau,0,0__Baudeau_Eclaircissemens
2,Explication,0,Baudeau,0,0__Baudeau_Explication
3,Idees sur l-administration,0,Baudeau,0,0__Baudeau_Idees sur l-administration
4,Idees sur la puissance,0,Baudeau,0,0__Baudeau_Idees sur la puissance
...,...,...,...,...,...
4469,elements T3,11,Marmontel,96,96__Marmontel_elements T3
4470,Histoire Generale T19,3,Deleyre,97,97__Deleyre_Histoire Generale T19
4471,elements T3,11,Marmontel,97,97__Marmontel_elements T3
4472,Histoire Generale T19,3,Deleyre,98,98__Deleyre_Histoire Generale T19


### NB: change the work label (here `ed1780`) in the next two cells before each new test

In [123]:
# select an "unknown" work
# other work labels noted by the author (presumably also present in the corpus):
# ed1770-CH1774-CH1780
# ed1774-CH1780
PROBLEM_WORK = "ed1780"

# One boolean mask, reused for the metadata and the feature matrix so that the
# two selections always stay row-aligned.
problem_mask = raw_df["work"] == PROBLEM_WORK

# Fail fast on a mistyped label: otherwise both frames are silently empty and
# the error only surfaces later, in scaling / prediction.
if not problem_mask.any():
    raise ValueError(
        f"No chunks found for work {PROBLEM_WORK!r}; check the label against raw_df.work"
    )

problems = raw_df.loc[problem_mask].reset_index(drop=True)
problems_X = X.loc[problem_mask].reset_index(drop=True)
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,ed1780,7,HDI,0,0__HDI_ed1780
1,ed1780,7,HDI,1,1__HDI_ed1780
2,ed1780,7,HDI,10,10__HDI_ed1780
3,ed1780,7,HDI,11,11__HDI_ed1780
4,ed1780,7,HDI,12,12__HDI_ed1780
5,ed1780,7,HDI,13,13__HDI_ed1780
6,ed1780,7,HDI,14,14__HDI_ed1780
7,ed1780,7,HDI,15,15__HDI_ed1780
8,ed1780,7,HDI,16,16__HDI_ed1780
9,ed1780,7,HDI,17,17__HDI_ed1780


In [124]:
# Everything that is not the "unknown" work becomes the reference corpus.
# The mask is built once and applied to both the metadata and the word
# frequencies, then each result is re-indexed from 0.
reference_mask = raw_df["work"] != "ed1780"

rest = raw_df.loc[reference_mask].reset_index(drop=True).copy()
rest_X = X.loc[reference_mask].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Avis,0,Baudeau,0,0__Baudeau_Avis
1,Eclaircissemens,0,Baudeau,0,0__Baudeau_Eclaircissemens
2,Explication,0,Baudeau,0,0__Baudeau_Explication
3,Idees sur l-administration,0,Baudeau,0,0__Baudeau_Idees sur l-administration
4,Idees sur la puissance,0,Baudeau,0,0__Baudeau_Idees sur la puissance
...,...,...,...,...,...
4426,elements T3,11,Marmontel,96,96__Marmontel_elements T3
4427,Histoire Generale T19,3,Deleyre,97,97__Deleyre_Histoire Generale T19
4428,elements T3,11,Marmontel,97,97__Marmontel_elements T3
4429,Histoire Generale T19,3,Deleyre,98,98__Deleyre_Histoire Generale T19


**Scaling**

In [125]:
# Fit the scaler on the reference corpus only, then apply the same scaling to
# both sets so the unknown chunks do not influence the scaling statistics.
# with_mean=False: values are divided by the standard deviation but not centred.
sts = StandardScaler(with_mean=False)
sts.fit(rest_X)

rest_scaled_X, problems_scaled_X = (
    sts.transform(features) for features in (rest_X, problems_X)
)

**Verification**

In [126]:
# check unique authors (an author's position in this array is its numeric author_label code)
label_uniques.values

array(['Baudeau', 'Chastellux', 'Condorcet', 'Deleyre', 'dHolbach',
       'Diderot', 'Guibert', 'HDI', 'Jaucourt', 'Jussieu', 'La Grange',
       'Marmontel', 'Meister', 'Morellet', 'Naigeon', 'Pechmeja',
       'Raynal', 'Rivière', 'Saint-Lambert'], dtype=object)

In [127]:
# Set up the verifier.
# A seeded generator is handed to the verifier as its random state.
rng = np.random.default_rng(42)

# NOTE(review): parameter meanings are not visible in this notebook —
# presumably nb_bootstrap_iter is the number of bootstrap rounds and rnd_prop
# the proportion sampled in each round; confirm against BDIVerifier's docs.
verifier_options = dict(
    metric='minmax',
    nb_bootstrap_iter=1000,
    rnd_prop=0.35,
    random_state=rng,
)
bdi_mm = BDIVerifier(**verifier_options)

In [128]:
# Fit the verifier on the scaled reference corpus, using the numeric author
# codes as targets.
reference_author_codes = rest["author_label"]
bdi_mm.fit(rest_scaled_X, reference_author_codes)

01/22/2025 04:09:33 [ruzicka:INFO] Fitting on 4431 documents...


In [129]:
# Test every chunk of the unknown work against each candidate author in turn.
# The verifier takes one candidate code per chunk, so the same author code is
# repeated once for every chunk.
n_problem_chunks = problems_scaled_X.shape[0]

# label_uniques comes from factorize(), so an author's position is its code.
for author_code, author_name in enumerate(label_uniques.values):
    print(f"Testing against {author_name}")
    match_strengths = bdi_mm.predict_proba(
        problems_scaled_X, [author_code] * n_problem_chunks
    )
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {match_strengths}")

01/22/2025 04:09:39 [ruzicka:INFO] Predicting on 43 documents


Testing against Baudeau


01/22/2025 04:09:48 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.736 0.896 0.686 0.712 0.538 0.878 0.606 0.787 0.902 0.67  0.827 0.864
 0.633 0.613 0.642 0.816 0.894 0.564 0.694 0.792 0.88  0.865 0.844 0.622
 0.805 0.373 0.484 0.664 0.933 0.92  0.668 0.857 0.851 0.847 0.685 0.713
 0.744 0.534 0.858 0.611 0.838 0.67  0.863]
Testing against Chastellux


01/22/2025 04:09:58 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.964 0.879 0.928 0.943 0.95  0.96  0.924 0.972 0.886 0.786 0.943 0.801
 0.922 0.937 0.831 0.884 0.847 0.781 0.78  0.923 0.983 0.968 0.989 0.979
 0.93  0.944 0.921 0.924 0.841 0.751 0.896 0.955 0.95  0.925 0.859 0.876
 0.872 0.827 0.963 0.938 0.906 0.935 0.963]
Testing against Condorcet


01/22/2025 04:10:30 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.959 0.991 0.989 0.98  0.955 0.955 0.972 0.995 0.981 0.976 0.972 0.987
 0.888 0.947 0.986 0.993 0.964 0.981 0.996 0.977 0.997 0.993 0.984 0.958
 0.994 0.979 0.991 0.979 0.946 0.97  0.977 0.98  0.984 0.989 0.988 0.842
 0.992 0.987 0.943 0.979 0.924 0.96  0.958]
Testing against Deleyre


01/22/2025 04:10:44 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.967 0.824 0.913 0.921 0.94  0.96  0.895 0.935 0.915 0.882 0.907 0.825
 0.886 0.951 0.948 0.949 0.96  0.848 0.773 0.967 0.968 0.884 0.914 0.94
 0.958 0.992 0.96  0.931 0.987 0.979 0.945 0.99  0.999 0.919 0.795 0.871
 0.874 0.901 0.946 0.931 0.981 0.966 0.996]
Testing against dHolbach


01/22/2025 04:11:12 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.975 0.919 0.959 0.986 0.99  0.945 0.906 0.951 0.951 0.984 0.964 0.927
 0.768 0.863 0.998 0.965 0.997 0.985 0.952 0.959 0.942 0.965 0.968 0.597
 0.945 0.971 0.947 0.96  0.924 0.915 0.971 0.925 0.965 0.958 0.978 0.47
 0.81  0.942 0.921 0.919 0.902 0.979 0.919]
Testing against Diderot


01/22/2025 04:11:34 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.958 0.992 0.987 0.977 0.975 0.99  0.991 0.988 0.992 0.992 0.993 1.
 0.998 1.    0.912 0.991 0.971 0.998 0.982 0.983 0.882 0.913 0.809 0.997
 0.961 0.961 0.984 0.994 0.929 0.984 0.979 0.888 0.947 0.932 0.998 0.999
 0.996 0.998 0.999 0.974 0.993 0.987 0.992]
Testing against Guibert


01/22/2025 04:11:45 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.791 0.843 0.728 0.597 0.737 0.762 0.818 0.658 0.86  0.953 0.815 0.727
 0.838 0.732 0.623 0.818 0.675 0.704 0.473 0.682 0.871 0.816 0.875 0.893
 0.857 0.757 0.738 0.886 0.788 0.671 0.901 0.856 0.802 0.823 0.785 0.933
 0.95  0.853 0.569 0.778 0.762 0.869 0.827]
Testing against HDI


01/22/2025 04:11:48 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.901 0.948 0.882 0.992 0.835 0.908 0.988 0.948 0.806 0.804 0.95  0.73
 0.672 0.968 0.819 0.782 0.838 0.903 0.801 0.906 0.894 0.84  0.939 0.867
 0.88  0.664 0.94  0.921 0.856 0.922 0.798 0.827 0.973 0.983 0.733 0.257
 0.547 0.715 0.86  0.801 0.809 0.871 0.718]
Testing against Jaucourt


01/22/2025 04:11:58 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.966 0.56  0.81  0.849 0.722 0.875 0.77  0.625 0.915 0.67  0.731 0.651
 0.699 0.798 0.742 0.826 0.802 0.796 0.58  0.761 0.745 0.605 0.597 0.713
 0.884 0.94  0.884 0.708 0.983 0.931 0.699 0.858 0.644 0.656 0.72  0.763
 0.889 0.75  0.818 0.968 0.901 0.796 0.845]
Testing against Jussieu


01/22/2025 04:12:06 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.788 0.213 0.304 0.293 0.346 0.22  0.192 0.476 0.266 0.262 0.372 0.268
 0.082 0.289 0.217 0.495 0.158 0.106 0.117 0.136 0.353 0.193 0.101 0.263
 0.37  0.21  0.401 0.164 0.834 0.271 0.395 0.497 0.405 0.317 0.104 0.427
 0.367 0.341 0.491 0.455 0.313 0.603 0.881]
Testing against La Grange


01/22/2025 04:12:12 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.152 0.097 0.069 0.118 0.212 0.095 0.041 0.116 0.058 0.036 0.156 0.086
 0.046 0.107 0.043 0.241 0.044 0.052 0.086 0.079 0.256 0.063 0.045 0.186
 0.135 0.105 0.036 0.077 0.271 0.252 0.096 0.113 0.102 0.043 0.069 0.143
 0.054 0.073 0.082 0.062 0.127 0.18  0.57 ]
Testing against Marmontel


01/22/2025 04:12:31 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.834 0.94  0.917 0.85  0.91  0.939 0.873 0.862 0.985 0.887 0.809 0.972
 0.991 0.976 0.821 0.879 0.763 0.939 0.887 0.914 0.716 0.867 0.799 0.972
 0.888 0.904 0.916 0.95  0.776 0.949 0.851 0.698 0.776 0.931 0.955 0.938
 0.894 0.972 0.961 0.966 0.935 0.93  0.807]
Testing against Meister


01/22/2025 04:12:35 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.145 0.328 0.246 0.09  0.261 0.119 0.278 0.476 0.365 0.238 0.36  0.407
 0.349 0.065 0.093 0.347 0.108 0.465 0.241 0.227 0.088 0.549 0.202 0.366
 0.068 0.207 0.049 0.125 0.004 0.107 0.036 0.061 0.102 0.154 0.312 0.091
 0.392 0.411 0.358 0.12  0.484 0.227 0.146]
Testing against Morellet


01/22/2025 04:12:42 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.84  0.929 0.87  0.672 0.76  0.759 0.727 0.718 0.727 0.848 0.888 0.827
 0.84  0.844 0.789 0.723 0.697 0.681 0.896 0.896 0.791 0.867 0.747 0.932
 0.729 0.701 0.901 0.839 0.596 0.79  0.915 0.693 0.768 0.865 0.855 0.931
 0.942 0.752 0.737 0.834 0.655 0.592 0.639]
Testing against Naigeon


01/22/2025 04:12:48 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.473 0.893 0.825 0.654 0.661 0.864 0.613 0.533 0.544 0.741 0.731 0.696
 0.709 0.559 0.756 0.716 0.731 0.906 0.864 0.862 0.375 0.6   0.273 0.754
 0.385 0.612 0.523 0.557 0.312 0.701 0.404 0.244 0.534 0.668 0.761 0.799
 0.751 0.725 0.675 0.485 0.773 0.589 0.349]
Testing against Pechmeja


01/22/2025 04:12:53 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.368 0.633 0.671 0.648 0.505 0.759 0.327 0.333 0.473 0.508 0.263 0.163
 0.576 0.276 0.289 0.288 0.293 0.326 0.181 0.53  0.551 0.412 0.465 0.747
 0.419 0.401 0.28  0.224 0.415 0.223 0.503 0.38  0.236 0.406 0.584 0.189
 0.607 0.477 0.215 0.305 0.279 0.18  0.428]
Testing against Raynal


01/22/2025 04:13:07 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.946 0.414 0.628 0.953 0.728 0.843 0.962 0.749 0.585 0.813 0.694 0.717
 0.829 0.791 0.838 0.851 0.848 0.507 0.535 0.764 0.646 0.627 0.966 0.821
 0.708 0.941 0.95  0.928 0.966 0.739 0.949 0.983 0.834 0.877 0.8   0.485
 0.822 0.826 0.834 0.961 0.861 0.885 0.809]
Testing against Rivière


01/22/2025 04:13:15 [ruzicka:INFO] Predicting on 43 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.349 0.838 0.619 0.344 0.559 0.618 0.511 0.159 0.619 0.706 0.778 0.669
 0.31  0.405 0.588 0.542 0.518 0.866 0.91  0.823 0.806 0.932 0.669 0.283
 0.739 0.28  0.474 0.478 0.216 0.511 0.366 0.404 0.697 0.734 0.772 0.552
 0.676 0.609 0.431 0.461 0.295 0.625 0.593]
Testing against Saint-Lambert
Bootstrap Match Strength (one per chunk, 0-1.0): [0.835 0.896 0.967 0.933 0.98  0.885 0.934 0.895 0.897 0.83  0.863 0.818
 0.912 0.751 0.867 0.783 0.748 0.855 0.876 0.973 0.93  0.924 0.944 0.916
 0.655 0.851 0.868 0.939 0.777 0.753 0.952 0.917 0.845 0.905 0.861 0.767
 0.803 0.945 0.693 0.848 0.926 0.879 0.884]


### NB: change the output path in the next cell before running new tests

In [130]:
# Export, for each candidate author, the verifier's per-chunk arrays to
# "<OUTPUT_DIR>/<FILE_PREFIX><author>.csv" (one column per problem chunk).
# NB: for new tests, change the two constants below.
OUTPUT_DIR = Path('04_tests/tests')
FILE_PREFIX = 'ed1780_vs_'

# to_csv does not create missing folders; create the target up front so a
# missing directory cannot abort the run after the slow predictions.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

authors = label_uniques.values
n_problem_chunks = problems_scaled_X.shape[0]  # loop-invariant

for a in authors:

    print(a)

    # The returned probabilities are not used here; the call is made so that
    # bdi_mm._dist_arrays (read below) reflects the current author.
    # NOTE(review): this repeats the predictions of the previous cell; if the
    # verifier draws from the shared rng at predict time, the exported arrays
    # come from different bootstrap draws than the probabilities printed
    # there — confirm.
    bdi_mm.predict_proba(
        problems_scaled_X, [label_uniques.get_loc(a)] * n_problem_chunks
    )

    # One column per problem chunk, keyed by its tag.
    # NOTE(review): _dist_arrays is a private attribute of BDIVerifier —
    # presumably one array of bootstrap distances per chunk; confirm.
    dist_df = pd.DataFrame(dict(zip(problems.tag, bdi_mm._dist_arrays)))

    dist_df.to_csv(OUTPUT_DIR / f"{FILE_PREFIX}{a}.csv")

01/22/2025 04:13:36 [ruzicka:INFO] Predicting on 43 documents


Baudeau


01/22/2025 04:13:45 [ruzicka:INFO] Predicting on 43 documents


Chastellux


01/22/2025 04:13:55 [ruzicka:INFO] Predicting on 43 documents


Condorcet


01/22/2025 04:14:29 [ruzicka:INFO] Predicting on 43 documents


Deleyre


01/22/2025 04:14:43 [ruzicka:INFO] Predicting on 43 documents


dHolbach


01/22/2025 04:15:11 [ruzicka:INFO] Predicting on 43 documents


Diderot


01/22/2025 04:15:33 [ruzicka:INFO] Predicting on 43 documents


Guibert


01/22/2025 04:15:44 [ruzicka:INFO] Predicting on 43 documents


HDI


01/22/2025 04:15:47 [ruzicka:INFO] Predicting on 43 documents


Jaucourt


01/22/2025 04:15:57 [ruzicka:INFO] Predicting on 43 documents


Jussieu


01/22/2025 04:16:05 [ruzicka:INFO] Predicting on 43 documents


La Grange


01/22/2025 04:16:11 [ruzicka:INFO] Predicting on 43 documents


Marmontel


01/22/2025 04:16:31 [ruzicka:INFO] Predicting on 43 documents


Meister


01/22/2025 04:16:34 [ruzicka:INFO] Predicting on 43 documents


Morellet


01/22/2025 04:16:41 [ruzicka:INFO] Predicting on 43 documents


Naigeon


01/22/2025 04:16:48 [ruzicka:INFO] Predicting on 43 documents


Pechmeja


01/22/2025 04:16:52 [ruzicka:INFO] Predicting on 43 documents


Raynal


01/22/2025 04:17:06 [ruzicka:INFO] Predicting on 43 documents


Rivière


01/22/2025 04:17:14 [ruzicka:INFO] Predicting on 43 documents


Saint-Lambert
