### Test ground truth attribution
Diderot vs Diderot from the *Correspondance littéraire*
  
Settings:
- smaller corpus (each author has only 2 works, only 5 authors are candidates);
- test l1 and l2 normalisation.

In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

import re

from sklearn.preprocessing import StandardScaler
# from sklearn.feature_extraction.text import TfidfVectorizer

import logging

# Enable INFO-level logging and grab the "ruzicka" logger, so the verifier's
# progress messages ("Fitting on ...", "Predicting on ...") appear in cell output.
logging.basicConfig(level="INFO")
logger = logging.getLogger("ruzicka")

from ruzicka.BDIVerifier import BDIVerifier

In [2]:
# Load the pre-processed feature table (relative word frequencies) and drop
# its first column, a leftover that should not have been written to the CSV.
# The L2-normalised variant lives in '02_tests/test_diderotII_l2norm.csv'.
corpus = pd.read_csv('02_tests/test_diderotII_rfreq.csv').iloc[:, 1:]

corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,la,a,des,que,...,avant,ici,liberté,chez,fois,commerce,compagnie,voir,intérêt,prix
0,Analyse de la philosophie,Deleyre,0,0__Deleyre_Analyse de la philosophie,4.1,2.6,4.1,1.2,1.6,1.5,...,0.2,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.1,0.0
1,eloge de M Roux,Deleyre,0,0__Deleyre_eloge de M Roux,4.7,2.3,2.5,1.0,1.5,1.3,...,0.0,0.1,0.0,0.1,0.0,0.2,0.0,0.1,0.0,0.0
2,Le bon sens,dHolbach,0,0__dHolbach_Le bon sens,3.6,3.6,1.6,0.2,3.7,2.5,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.1,0.0
3,Le Christianisme,dHolbach,0,0__dHolbach_Le Christianisme,4.0,3.4,2.5,0.5,3.0,2.3,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,4.4,2.8,3.2,0.8,0.9,1.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,Recherches philosophiques,Diderot,9,9__Diderot_Recherches philosophiques,3.7,4.0,1.8,0.9,1.1,2.8,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0
352,Methode naturelle,Jussieu,9,9__Jussieu_Methode naturelle,3.4,3.0,3.7,0.4,2.6,1.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
353,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,4.9,2.0,2.0,0.2,2.4,1.1,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
354,ecole 1,Raynal,9,9__Raynal_ecole 1,4.9,3.2,2.9,1.4,0.7,1.2,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Splitting data

In [3]:
# Column-wise split: the first four columns (work, author, chunk_num, tag) are
# metadata; every remaining column is a per-word frequency feature.
raw_df, X = corpus.iloc[:, :4], corpus.iloc[:, 4:]

raw_df

Unnamed: 0,work,author,chunk_num,tag
0,Analyse de la philosophie,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...
351,Recherches philosophiques,Diderot,9,9__Diderot_Recherches philosophiques
352,Methode naturelle,Jussieu,9,9__Jussieu_Methode naturelle
353,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6
354,ecole 1,Raynal,9,9__Raynal_ecole 1


In [4]:
# Encode every author name as an integer code. `label_uniques` is the Index of
# distinct author names; a name's position in it is its integer code.
labels, label_uniques = raw_df.author.factorize()

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Guard it so the cell is idempotent.
if "author_label" not in raw_df.columns:
    raw_df.insert(1, "author_label", labels)

raw_df

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
351,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
352,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
353,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
354,ecole 1,5,Raynal,9,9__Raynal_ecole 1


In [5]:
# Hold out every chunk of the "unknown" work; these are the verification problems.
is_problem = raw_df.work == "Pensees philosophiques"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()

problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
1,Pensees philosophiques,2,Diderot II,1,1__Diderot II_Pensees philosophiques
2,Pensees philosophiques,2,Diderot II,2,2__Diderot II_Pensees philosophiques
3,Pensees philosophiques,2,Diderot II,3,3__Diderot II_Pensees philosophiques
4,Pensees philosophiques,2,Diderot II,4,4__Diderot II_Pensees philosophiques
5,Pensees philosophiques,2,Diderot II,5,5__Diderot II_Pensees philosophiques
6,Pensees philosophiques,2,Diderot II,6,6__Diderot II_Pensees philosophiques
7,Pensees philosophiques,2,Diderot II,7,7__Diderot II_Pensees philosophiques
8,Pensees philosophiques,2,Diderot II,8,8__Diderot II_Pensees philosophiques
9,Pensees philosophiques,2,Diderot II,9,9__Diderot II_Pensees philosophiques


In [6]:
# Everything except the held-out work forms the reference corpus.
is_reference = raw_df.work.ne("Pensees philosophiques")

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()

rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
...,...,...,...,...,...
341,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
342,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
343,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
344,ecole 1,5,Raynal,9,9__Raynal_ecole 1


#### Scaling

In [7]:
# Fit the scaler on the reference corpus only, so the held-out chunks do not
# influence the scaling statistics. with_mean=False: scale without centring.
ss = (
    StandardScaler(with_mean=False)
    .fit(rest_X)
)

In [8]:
# Apply the scaling learned from the reference corpus to both partitions.
rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

#### Verification

In [9]:
# Seeded generator handed to the verifier below, so the bootstrap sampling is
# reproducible when the notebook is run top to bottom on a fresh kernel.
rng = np.random.default_rng(seed=42)

In [10]:
# Bootstrap verifier shared by every experiment in this notebook.
bdi_mm = BDIVerifier(
    metric="minmax",
    nb_bootstrap_iter=1000,  # number of bootstrap iterations per prediction
    rnd_prop=0.35,  # presumably the share of features sampled per iteration - confirm in ruzicka
    random_state=rng,  # the seeded generator created above
)

In [15]:
# Print the signature and docstring of BDIVerifier.predict_proba for reference.
# NOTE(review): this cell was executed out of order (In [15]); it is a lookup
# aid only and nothing else depends on it.
help(BDIVerifier.predict_proba)

Help on function predict_proba in module ruzicka.BDIVerifier:

predict_proba(self, test_X: Collection[Collection[float]], test_y: Collection[int], nb_imposters: int = 30) -> numpy.ndarray[typing.Any, numpy.dtype[numpy.float64]]
    Given a `test_vector` and an integer representing a target authors
    (`target_int`), we retrieve the distance to the nearest document in the
    training data, which is NOT authored by the target author. In the
    distance calculation, we only take into account the feature values
    specified in `rnd_feature_idxs` (if the latter parameter is specified);
    else, we use the entire feature space. Note that we each time sample a
    random number of imposters from the available training documents, the
    number of which is specified by `nb_imposters`.
    
    We apply the normal verification method, using self.nb_bootstrap_iter
    iterations. In this case, the returned probabilities represent the
    proportions of bootstraps in which the target_author 

In [11]:
# Fit the verifier on the scaled reference corpus, labelled by integer author code.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

01/20/2025 12:10:34 [ruzicka:INFO] Fitting on 346 documents...


In [12]:
# Display the author names in code order: the position of a name in this array
# is the integer stored in `author_label` (both come from the same factorize()).
label_uniques.values

array(['Deleyre', 'dHolbach', 'Diderot II', 'Diderot', 'Jussieu',
       'Raynal'], dtype=object)

In [13]:
# For each candidate author, score every held-out chunk against that author
# and print one bootstrap match strength per chunk.
n_problems = problems_scaled_X.shape[0]

for label in label_uniques.values:
    print(f"Testing against {label}")
    code = label_uniques.get_loc(label)
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_problems)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

01/20/2025 12:10:55 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


01/20/2025 12:10:56 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.684 0.347 0.526 0.531 0.62  0.636 0.708 0.664 0.317 0.754]
Testing against dHolbach


01/20/2025 12:10:58 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.923 0.833 0.578 0.828 0.776 0.841 0.566 0.39  0.933 0.546]
Testing against Diderot II


01/20/2025 12:10:59 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.111 0.413 0.334 0.043 0.093 0.014 0.214 0.344 0.106 0.051]
Testing against Diderot


01/20/2025 12:11:00 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.902 0.938 0.993 0.961 0.922 0.808 0.951 0.857 0.891 0.905]
Testing against Jussieu


01/20/2025 12:11:01 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.005 0.002 0.028 0.018 0.002 0.045 0.017 0.017 0.029 0.014]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.177 0.306 0.368 0.296 0.7   0.706 0.472 0.779 0.261 0.836]


Re-run against Diderot, Diderot-II, & Raynal

In [14]:
# Score the held-out chunks against "Diderot" once more. The shared random
# generator has advanced, so the values differ slightly from the loop above.
target_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

01/20/2025 12:11:11 [ruzicka:INFO] Predicting on 10 documents


array([0.918, 0.928, 0.993, 0.972, 0.915, 0.795, 0.945, 0.871, 0.864,
       0.88 ])

In [15]:
# Tabulate the verifier's `_dist_arrays` (a private attribute, presumably left
# by the most recent predict_proba call): one column per held-out chunk, keyed
# by the chunk's tag. Then save the table.
pens_vs_ddrt = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('02_tests/bdi/1_rf_pens_vs_ddrt.csv')

pens_vs_ddrt

Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,0.054510,0.085423,0.158263,0.125978,0.028657,0.078801,0.150081,0.036402,0.147157,0.100104
1,0.040253,0.086889,0.104641,0.171063,0.114285,0.063475,0.053423,0.044772,0.068682,0.059699
2,0.050754,0.199405,0.108893,0.221370,0.041610,0.092526,0.060818,0.048395,0.047042,0.110849
3,0.061768,0.086331,0.097578,0.105181,0.060179,0.054855,0.184041,0.056066,0.034706,-0.014236
4,0.123653,0.106335,0.174525,0.045006,0.003444,0.102206,0.232397,0.131826,0.022920,0.121374
...,...,...,...,...,...,...,...,...,...,...
995,0.149250,0.067858,0.159068,0.119465,0.000608,0.025206,0.112651,-0.005066,-0.035967,0.044840
996,0.083927,0.055946,0.090376,0.128799,-0.010479,0.023201,0.127194,-0.037483,-0.020471,-0.039000
997,0.038035,0.213524,0.041836,0.126801,0.060022,0.010191,0.147992,0.097100,0.013685,0.003123
998,0.039642,0.120619,0.184225,0.039920,0.041349,0.088107,0.121786,0.037820,0.005416,-0.001027


In [16]:
# Diderot II: the returned probabilities are discarded; the call is made for
# the values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# One column per held-out chunk, keyed by its tag.
pens_vs_ddrtii = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('02_tests/bdi/1_rf_pens_vs_ddrt-ii.csv')

pens_vs_ddrtii

01/20/2025 12:12:16 [ruzicka:INFO] Predicting on 10 documents


Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,-0.029730,0.137999,0.107483,-0.043598,-0.055467,-0.120559,-0.009669,-0.036548,-0.142268,-0.118123
1,-0.096670,-0.066679,0.131982,-0.135062,-0.042431,-0.184263,-0.160050,-0.010536,-0.066328,-0.044506
2,-0.040938,-0.048600,0.015201,-0.050146,-0.018570,-0.000107,-0.058353,-0.092682,0.033954,-0.067498
3,-0.067636,0.086377,0.134424,-0.147463,-0.132600,-0.193250,-0.281911,-0.038668,-0.093614,-0.100859
4,-0.042687,-0.033928,-0.066663,-0.077181,-0.086530,-0.232003,-0.128481,-0.114887,-0.027711,-0.058467
...,...,...,...,...,...,...,...,...,...,...
995,-0.091177,0.216342,-0.108722,-0.065601,0.003539,-0.087102,-0.194399,0.054646,-0.091849,-0.058840
996,-0.112859,-0.097112,-0.059789,-0.153458,0.091415,-0.208889,0.005412,-0.100206,-0.130755,-0.067264
997,-0.000845,-0.148377,-0.207301,-0.171104,-0.125447,-0.062025,0.082987,0.099230,-0.120421,-0.074489
998,-0.043211,-0.036021,-0.088631,-0.110882,-0.089048,-0.041536,-0.025050,0.106654,-0.004197,-0.081304


In [17]:
# Raynal: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Raynal")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# One column per held-out chunk, keyed by its tag.
pens_vs_ray = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ray.to_csv('02_tests/bdi/1_rf_pens_vs_ray.csv')

pens_vs_ray

01/20/2025 12:12:25 [ruzicka:INFO] Predicting on 10 documents


Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,-0.010482,-0.039172,-0.033788,-0.015305,-0.080154,0.090778,0.056191,0.068277,-0.112810,-0.033639
1,0.041606,0.015822,-0.001305,-0.115803,0.028665,0.022910,0.038299,0.041512,0.093720,0.052881
2,-0.077173,-0.045933,0.099739,-0.071740,0.147625,-0.016048,0.035325,0.036358,0.010864,0.026636
3,-0.097067,-0.103113,-0.083747,0.019069,-0.015252,0.029356,-0.088447,0.089388,0.051438,0.069044
4,-0.111944,-0.012657,-0.099410,-0.000190,0.076769,0.029912,-0.002601,-0.041376,-0.032564,0.173594
...,...,...,...,...,...,...,...,...,...,...
995,0.083482,0.008372,0.118408,-0.090845,0.130168,0.051758,0.044197,-0.043223,-0.039120,-0.014104
996,-0.038586,-0.034093,0.024843,0.027448,0.163789,0.024190,0.016639,0.091065,0.003571,-0.032039
997,-0.017926,-0.039719,0.039829,-0.014942,0.041399,0.003686,0.045205,0.067610,-0.001687,0.017463
998,-0.059070,-0.090002,0.106069,-0.049022,-0.033245,0.077571,0.030573,0.031158,-0.045043,0.070101


## L2 norm

Same tests, but with L2-normalised frequencies instead of relative frequencies.

In [19]:
# Load the pre-processed feature table again, this time the L2-normalised
# variant, and drop its leftover first column.
corpus = pd.read_csv('02_tests/test_diderotII_l2norm.csv').iloc[:, 1:]

corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,la,a,des,que,...,avant,ici,liberté,chez,fois,commerce,compagnie,voir,intérêt,prix
0,Analyse de la philosophie,Deleyre,0,0__Deleyre_Analyse de la philosophie,0.044484,0.048446,0.071805,0.067723,0.043527,0.045487,...,0.152499,0.000000,0.000000,0.089443,0.000000,0.000000,0.0,0.074536,0.109109,0.000000
1,eloge de M Roux,Deleyre,0,0__Deleyre_eloge de M Roux,0.050993,0.042856,0.043783,0.056436,0.040807,0.039422,...,0.000000,0.083333,0.000000,0.089443,0.000000,0.294884,0.0,0.074536,0.000000,0.000000
2,Le bon sens,dHolbach,0,0__dHolbach_Le bon sens,0.039059,0.067079,0.028021,0.011287,0.100656,0.075812,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.109109,0.000000
3,Le Christianisme,dHolbach,0,0__dHolbach_Le Christianisme,0.043399,0.063352,0.043783,0.028218,0.081613,0.069747,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.000000,0.000000
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,0.047738,0.052172,0.056043,0.045149,0.024484,0.057617,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.074536,0.109109,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,Recherches philosophiques,Diderot,9,9__Diderot_Recherches philosophiques,0.040144,0.074532,0.031524,0.050792,0.029925,0.084910,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.000000,0.000000
352,Methode naturelle,Jussieu,9,9__Jussieu_Methode naturelle,0.036889,0.055899,0.064799,0.022574,0.070731,0.042455,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.074536,0.000000,0.000000
353,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,0.053163,0.037266,0.035027,0.011287,0.065290,0.033357,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.000000,0.000000,0.000000
354,ecole 1,Raynal,9,9__Raynal_ecole 1,0.053163,0.059626,0.050789,0.079010,0.019043,0.036390,...,0.000000,0.083333,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [20]:
# Column-wise split of the L2-normalised table: the first four columns
# (work, author, chunk_num, tag) are metadata; the rest are word features.
raw_df, X = corpus.iloc[:, :4], corpus.iloc[:, 4:]

raw_df

Unnamed: 0,work,author,chunk_num,tag
0,Analyse de la philosophie,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...
351,Recherches philosophiques,Diderot,9,9__Diderot_Recherches philosophiques
352,Methode naturelle,Jussieu,9,9__Jussieu_Methode naturelle
353,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6
354,ecole 1,Raynal,9,9__Raynal_ecole 1


In [21]:
# Encode every author name as an integer code. `label_uniques` is the Index of
# distinct author names; a name's position in it is its integer code.
labels, label_uniques = raw_df.author.factorize()

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Guard it so the cell is idempotent.
if "author_label" not in raw_df.columns:
    raw_df.insert(1, "author_label", labels)

raw_df

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
351,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
352,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
353,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
354,ecole 1,5,Raynal,9,9__Raynal_ecole 1


In [22]:
# Hold out every chunk of the "unknown" work; these are the verification problems.
is_problem = raw_df.work == "Pensees philosophiques"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()

problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
1,Pensees philosophiques,2,Diderot II,1,1__Diderot II_Pensees philosophiques
2,Pensees philosophiques,2,Diderot II,2,2__Diderot II_Pensees philosophiques
3,Pensees philosophiques,2,Diderot II,3,3__Diderot II_Pensees philosophiques
4,Pensees philosophiques,2,Diderot II,4,4__Diderot II_Pensees philosophiques
5,Pensees philosophiques,2,Diderot II,5,5__Diderot II_Pensees philosophiques
6,Pensees philosophiques,2,Diderot II,6,6__Diderot II_Pensees philosophiques
7,Pensees philosophiques,2,Diderot II,7,7__Diderot II_Pensees philosophiques
8,Pensees philosophiques,2,Diderot II,8,8__Diderot II_Pensees philosophiques
9,Pensees philosophiques,2,Diderot II,9,9__Diderot II_Pensees philosophiques


In [23]:
# Everything except the held-out work forms the reference corpus.
is_reference = raw_df.work.ne("Pensees philosophiques")

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()

rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
...,...,...,...,...,...
341,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
342,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
343,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
344,ecole 1,5,Raynal,9,9__Raynal_ecole 1


In [24]:
# Scaling: fit on the reference corpus only (no centring), then apply the same
# transformation to both the reference corpus and the held-out chunks.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = ss.transform(rest_X), ss.transform(problems_X)

In [25]:
# Refit the same verifier instance on the L2-normalised reference corpus.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# For each candidate author, score every held-out chunk against that author
# and print one bootstrap match strength per chunk.
n_problems = problems_scaled_X.shape[0]

for label in label_uniques.values:
    print(f"Testing against {label}")
    code = label_uniques.get_loc(label)
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_problems)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

01/20/2025 12:20:36 [ruzicka:INFO] Fitting on 346 documents...
01/20/2025 12:20:36 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


01/20/2025 12:20:37 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.685 0.322 0.51  0.507 0.62  0.68  0.741 0.696 0.305 0.745]
Testing against dHolbach


01/20/2025 12:20:39 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.902 0.82  0.524 0.826 0.774 0.817 0.573 0.391 0.938 0.552]
Testing against Diderot II


01/20/2025 12:20:39 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.129 0.417 0.347 0.037 0.094 0.01  0.238 0.358 0.121 0.057]
Testing against Diderot


01/20/2025 12:20:40 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.91  0.934 0.994 0.972 0.928 0.821 0.932 0.879 0.881 0.885]
Testing against Jussieu


01/20/2025 12:20:41 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.003 0.005 0.011 0.012 0.001 0.071 0.009 0.012 0.034 0.017]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.182 0.328 0.351 0.3   0.711 0.723 0.503 0.8   0.261 0.822]


Checking results for the same 3 authors

In [26]:
# Diderot: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# One column per held-out chunk, keyed by its tag.
pens_vs_ddrt = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('02_tests/bdi/2_l2_pens_vs_ddrt.csv')

01/20/2025 12:20:50 [ruzicka:INFO] Predicting on 10 documents


In [27]:
# Diderot II: the returned probabilities are discarded; the call is made for
# the values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# One column per held-out chunk, keyed by its tag.
pens_vs_ddrtii = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('02_tests/bdi/2_l2_pens_vs_ddrt-ii.csv')

01/20/2025 12:20:55 [ruzicka:INFO] Predicting on 10 documents


In [28]:
# Raynal: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
raynal_code = label_uniques.get_loc("Raynal")
bdi_mm.predict_proba(problems_scaled_X, [raynal_code] * problems_scaled_X.shape[0])

pens_vs_ray = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ray.to_csv('02_tests/bdi/2_l2_pens_vs_ray.csv')

# ... and the same for Jussieu.
jussieu_code = label_uniques.get_loc("Jussieu")
bdi_mm.predict_proba(problems_scaled_X, [jussieu_code] * problems_scaled_X.shape[0])

pens_vs_j = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_j.to_csv('02_tests/bdi/2_l2_pens_vs_j.csv')

01/20/2025 12:21:06 [ruzicka:INFO] Predicting on 10 documents
01/20/2025 12:21:07 [ruzicka:INFO] Predicting on 10 documents


### Test *Salon 1781*

In [29]:
# Hold out every chunk of the "unknown" work; these are the verification problems.
is_problem = raw_df.work == "Salon 1781"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()

problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
1,Salon 1781,2,Diderot II,1,1__Diderot II_Salon 1781
2,Salon 1781,2,Diderot II,2,2__Diderot II_Salon 1781
3,Salon 1781,2,Diderot II,3,3__Diderot II_Salon 1781
4,Salon 1781,2,Diderot II,4,4__Diderot II_Salon 1781
5,Salon 1781,2,Diderot II,5,5__Diderot II_Salon 1781
6,Salon 1781,2,Diderot II,6,6__Diderot II_Salon 1781
7,Salon 1781,2,Diderot II,7,7__Diderot II_Salon 1781
8,Salon 1781,2,Diderot II,8,8__Diderot II_Salon 1781
9,Salon 1781,2,Diderot II,9,9__Diderot II_Salon 1781


In [30]:
# Everything except the held-out work forms the reference corpus.
is_reference = raw_df.work.ne("Salon 1781")

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()

rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
341,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
342,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
343,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
344,ecole 1,5,Raynal,9,9__Raynal_ecole 1


In [31]:
# Scaling: fit on the reference corpus only (no centring), then apply the same
# transformation to both the reference corpus and the held-out chunks.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = ss.transform(rest_X), ss.transform(problems_X)

In [32]:
# Refit the same verifier instance on the reference corpus without "Salon 1781".
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# For each candidate author, score every held-out chunk against that author
# and print one bootstrap match strength per chunk.
n_problems = problems_scaled_X.shape[0]

for label in label_uniques.values:
    print(f"Testing against {label}")
    code = label_uniques.get_loc(label)
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_problems)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

01/20/2025 12:21:24 [ruzicka:INFO] Fitting on 346 documents...
01/20/2025 12:21:24 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


01/20/2025 12:21:25 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.755 0.836 0.82  0.816 0.691 0.884 0.723 0.851 0.86  0.759]
Testing against dHolbach


01/20/2025 12:21:27 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.405 0.473 0.535 0.481 0.482 0.288 0.391 0.453 0.408 0.422]
Testing against Diderot II


01/20/2025 12:21:27 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.623 0.26  0.624 0.218 0.221 0.307 0.442 0.184 0.492 0.187]
Testing against Diderot


01/20/2025 12:21:28 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.688 0.862 0.698 0.571 0.737 0.638 0.801 0.646 0.73  0.537]
Testing against Jussieu


01/20/2025 12:21:29 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.329 0.066 0.217 0.416 0.466 0.273 0.38  0.598 0.303 0.745]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.785 0.752 0.64  0.798 0.829 0.822 0.723 0.735 0.718 0.838]


In [33]:
# Diderot: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# NOTE: the `pens_` name is inherited from the Pensees section; at this point
# `problems` holds the "Salon 1781" chunks, so this frame is Salon 1781 data.
pens_vs_ddrt = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('02_tests/bdi/3_l2_salon1781_vs_ddrt.csv')

01/20/2025 12:21:38 [ruzicka:INFO] Predicting on 10 documents


In [34]:
# Diderot II: the returned probabilities are discarded; the call is made for
# the values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# NOTE: the `pens_` name is inherited from the Pensees section; at this point
# `problems` holds the "Salon 1781" chunks, so this frame is Salon 1781 data.
pens_vs_ddrtii = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('02_tests/bdi/3_l2_salon1781_vs_ddrt-ii.csv')

01/20/2025 12:21:47 [ruzicka:INFO] Predicting on 10 documents


In [35]:
# Raynal: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Raynal")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# NOTE: the `pens_` name is inherited from the Pensees section; at this point
# `problems` holds the "Salon 1781" chunks, so this frame is Salon 1781 data.
pens_vs_r = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_r.to_csv('02_tests/bdi/3_l2_salon1781_vs_ray.csv')

01/20/2025 12:21:48 [ruzicka:INFO] Predicting on 10 documents


In [36]:
# Jussieu: the returned probabilities are discarded; the call is made for the
# values it (presumably) leaves in the private `_dist_arrays` attribute.
target_code = label_uniques.get_loc("Jussieu")
bdi_mm.predict_proba(problems_scaled_X, [target_code] * problems_scaled_X.shape[0])

# NOTE: the `pens_` name is inherited from the Pensees section; at this point
# `problems` holds the "Salon 1781" chunks, so this frame is Salon 1781 data.
pens_vs_j = pd.DataFrame(
    {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_j.to_csv('02_tests/bdi/3_l2_salon1781_vs_j.csv')

01/20/2025 12:21:53 [ruzicka:INFO] Predicting on 10 documents


### Test Jussieu

Just taking a random non-Diderot work to see the results

In [39]:
# Re-display the table currently held in `corpus` (still the L2-normalised one
# loaded earlier) before picking a work to hold out.
corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,la,a,des,que,...,avant,ici,liberté,chez,fois,commerce,compagnie,voir,intérêt,prix
0,Analyse de la philosophie,Deleyre,0,0__Deleyre_Analyse de la philosophie,0.044484,0.048446,0.071805,0.067723,0.043527,0.045487,...,0.152499,0.000000,0.000000,0.089443,0.000000,0.000000,0.0,0.074536,0.109109,0.000000
1,eloge de M Roux,Deleyre,0,0__Deleyre_eloge de M Roux,0.050993,0.042856,0.043783,0.056436,0.040807,0.039422,...,0.000000,0.083333,0.000000,0.089443,0.000000,0.294884,0.0,0.074536,0.000000,0.000000
2,Le bon sens,dHolbach,0,0__dHolbach_Le bon sens,0.039059,0.067079,0.028021,0.011287,0.100656,0.075812,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.109109,0.000000
3,Le Christianisme,dHolbach,0,0__dHolbach_Le Christianisme,0.043399,0.063352,0.043783,0.028218,0.081613,0.069747,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.000000,0.000000
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,0.047738,0.052172,0.056043,0.045149,0.024484,0.057617,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.074536,0.109109,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,Recherches philosophiques,Diderot,9,9__Diderot_Recherches philosophiques,0.040144,0.074532,0.031524,0.050792,0.029925,0.084910,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.074536,0.000000,0.000000
352,Methode naturelle,Jussieu,9,9__Jussieu_Methode naturelle,0.036889,0.055899,0.064799,0.022574,0.070731,0.042455,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.074536,0.000000,0.000000
353,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,0.053163,0.037266,0.035027,0.011287,0.065290,0.033357,...,0.000000,0.000000,0.000000,0.000000,0.068519,0.000000,0.0,0.000000,0.000000,0.000000
354,ecole 1,Raynal,9,9__Raynal_ecole 1,0.053163,0.059626,0.050789,0.079010,0.019043,0.036390,...,0.000000,0.083333,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [40]:
# Hold out every chunk of the "unknown" work; these are the verification problems.
is_problem = raw_df.work == "Notice Historique 1-6"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()

problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Notice Historique 1-6,4,Jussieu,0,0__Jussieu_Notice Historique 1-6
1,Notice Historique 1-6,4,Jussieu,1,1__Jussieu_Notice Historique 1-6
2,Notice Historique 1-6,4,Jussieu,10,10__Jussieu_Notice Historique 1-6
3,Notice Historique 1-6,4,Jussieu,11,11__Jussieu_Notice Historique 1-6
4,Notice Historique 1-6,4,Jussieu,12,12__Jussieu_Notice Historique 1-6
5,Notice Historique 1-6,4,Jussieu,13,13__Jussieu_Notice Historique 1-6
6,Notice Historique 1-6,4,Jussieu,14,14__Jussieu_Notice Historique 1-6
7,Notice Historique 1-6,4,Jussieu,15,15__Jussieu_Notice Historique 1-6
8,Notice Historique 1-6,4,Jussieu,16,16__Jussieu_Notice Historique 1-6
9,Notice Historique 1-6,4,Jussieu,17,17__Jussieu_Notice Historique 1-6


In [41]:
# Everything except the held-out work forms the reference corpus.
is_reference = raw_df.work.ne("Notice Historique 1-6")

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()

rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Analyse de la philosophie,0,Deleyre,0,0__Deleyre_Analyse de la philosophie
1,eloge de M Roux,0,Deleyre,0,0__Deleyre_eloge de M Roux
2,Le bon sens,1,dHolbach,0,0__dHolbach_Le bon sens
3,Le Christianisme,1,dHolbach,0,0__dHolbach_Le Christianisme
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
325,Pensees sur l-interpretation,3,Diderot,9,9__Diderot_Pensees sur l-interpretation
326,Recherches philosophiques,3,Diderot,9,9__Diderot_Recherches philosophiques
327,Methode naturelle,4,Jussieu,9,9__Jussieu_Methode naturelle
328,ecole 1,5,Raynal,9,9__Raynal_ecole 1


In [42]:
# Scaling: fit on the reference corpus only (no centring), then apply the same
# transformation to both the reference corpus and the held-out chunks.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = ss.transform(rest_X), ss.transform(problems_X)

In [43]:
# Refit the same verifier instance on the reference corpus without
# "Notice Historique 1-6".
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# For each candidate author, score every held-out chunk against that author
# and print one bootstrap match strength per chunk.
n_problems = problems_scaled_X.shape[0]

for label in label_uniques.values:
    print(f"Testing against {label}")
    code = label_uniques.get_loc(label)
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_problems)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

01/20/2025 12:22:53 [ruzicka:INFO] Fitting on 330 documents...
01/20/2025 12:22:53 [ruzicka:INFO] Predicting on 26 documents


Testing against Deleyre


01/20/2025 12:22:56 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.844 0.874 0.767 0.655 0.726 0.655 0.664 0.84  0.729 0.837 0.766 0.653
 0.814 0.701 0.803 0.88  0.738 0.834 0.86  0.863 0.838 0.709 0.574 0.691
 0.61  0.67 ]
Testing against dHolbach


01/20/2025 12:23:00 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.387 0.334 0.038 0.081 0.058 0.09  0.058 0.126 0.2   0.323 0.215 0.068
 0.338 0.187 0.08  0.357 0.091 0.214 0.354 0.064 0.083 0.029 0.067 0.056
 0.065 0.19 ]
Testing against Diderot II


01/20/2025 12:23:02 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.083 0.061 0.121 0.024 0.037 0.057 0.065 0.02  0.041 0.143 0.101 0.189
 0.105 0.063 0.032 0.382 0.082 0.189 0.027 0.114 0.058 0.007 0.049 0.016
 0.024 0.012]
Testing against Diderot


01/20/2025 12:23:05 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.46  0.254 0.188 0.113 0.149 0.105 0.083 0.493 0.241 0.124 0.297 0.214
 0.446 0.197 0.164 0.212 0.175 0.356 0.222 0.158 0.259 0.098 0.08  0.044
 0.152 0.076]
Testing against Jussieu


01/20/2025 12:23:07 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.702 0.463 0.382 0.552 0.097 0.371 0.283 0.563 0.884 0.663 0.837 0.329
 0.196 0.774 0.602 0.714 0.487 0.375 0.626 0.61  0.136 0.499 0.211 0.252
 0.536 0.229]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.854 0.902 0.968 0.976 0.995 0.985 0.993 0.937 0.853 0.928 0.877 0.984
 0.936 0.939 0.967 0.668 0.937 0.925 0.881 0.926 0.975 0.978 0.995 0.996
 0.986 0.992]


In [44]:
# For every candidate author: score the held-out Jussieu chunks against that
# author, then save the verifier's `_dist_arrays` (private attribute, presumably
# left by the predict_proba call) as one CSV per author.
authors = label_uniques.values

fh = ''  # path of the CSV written most recently; stays '' if there are no authors

n_problems = problems_scaled_X.shape[0]

for a in authors:
    print(a)

    # The returned probabilities are not needed here, only the stored arrays.
    bdi_mm.predict_proba(problems_scaled_X, [label_uniques.get_loc(a)] * n_problems)

    # One column per held-out chunk, keyed by its tag.
    x = pd.DataFrame(
        {tag: dists for tag, dists in zip(problems.tag, bdi_mm._dist_arrays)}
    )

    fh = f'02_tests/bdi/5_l2_jussieu_vs_{a}.csv'
    x.to_csv(fh)

01/20/2025 12:23:18 [ruzicka:INFO] Predicting on 26 documents


Deleyre


01/20/2025 12:23:21 [ruzicka:INFO] Predicting on 26 documents


dHolbach


01/20/2025 12:23:25 [ruzicka:INFO] Predicting on 26 documents


Diderot II


01/20/2025 12:23:27 [ruzicka:INFO] Predicting on 26 documents


Diderot


01/20/2025 12:23:30 [ruzicka:INFO] Predicting on 26 documents


Jussieu


01/20/2025 12:23:32 [ruzicka:INFO] Predicting on 26 documents


Raynal
