### Test ground truth attribution
Diderot vs Diderot from *Correspondences littéraires*
  
Settings:
- smaller corpus (each author has only 2 works, only 5 authors are candidates);
- test l1 and l2 normalisation.

In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

import re

from sklearn.preprocessing import StandardScaler
# from sklearn.feature_extraction.text import TfidfVectorizer

import logging

logging.basicConfig(level="INFO")
logger = logging.getLogger("ruzicka")

from ruzicka.BDIVerifier import BDIVerifier

In [41]:
# Load the pre-processed relative-frequency table.
# (The l2-normalised variant is '0_tests_diderot-ii/test_diderotII_l2norm.csv'.)
corpus = (
    pd.read_csv('0_tests_diderot-ii/test_diderotII_rfreq.csv')  # rel freq
    .iloc[:, 1:]  # drop the first column (it should not have been saved with the file)
    .drop('author_label', axis=1)  # the author codes are computed again later on
)

corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,l,la,des,et,...,ci,avant,cependant,seul,moyens,force,yeux,choses,ici,art
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie,4.8,2.4,5.4,1.6,0.8,1.8,...,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education,3.8,3.6,2.4,2.4,2.3,2.9,...,0.2,0.3,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.0
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie,5.8,1.8,1.5,3.3,3.5,3.1,...,0.1,0.0,0.0,0.0,0.0,0.2,0.0,0.1,0.0,0.0
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3,3.2,5.0,3.0,4.0,3.9,3.0,...,0.1,0.0,0.1,0.0,0.0,0.1,0.2,0.0,0.0,0.0
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,4.4,2.8,2.6,3.2,0.9,1.6,...,0.1,0.0,0.2,0.0,0.0,0.0,0.2,0.1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie,6.4,2.1,3.1,2.7,1.6,2.2,...,0.0,0.1,0.1,0.3,0.0,0.0,0.0,0.0,0.1,0.0
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4,4.4,2.7,2.0,3.4,1.9,2.1,...,0.1,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,4.9,2.0,1.4,2.0,2.4,2.4,...,0.0,0.0,0.2,0.0,0.1,0.1,0.0,0.0,0.0,0.0
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756,4.0,1.3,1.1,1.5,0.7,2.3,...,0.1,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0


Splitting data

In [71]:
# Positional split: the first four columns are metadata,
# every column after them is a word-frequency feature.
raw_df, X = corpus.iloc[:, :4], corpus.iloc[:, 4:]

raw_df

Unnamed: 0,work,author,chunk_num,tag
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [72]:
# Encode each author name as an integer code; `label_uniques` maps a code
# (its position) back to the author name.
labels, label_uniques = raw_df.author.factorize()

# Start from a frame without any earlier `author_label` column, so that
# re-running this cell does not fail with
# "cannot insert author_label, already exists".
raw_df = raw_df.drop(columns="author_label", errors="ignore")
raw_df.insert(1, "author_label", labels)
raw_df

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
401,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
402,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
403,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
404,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [73]:
# Hold out one work as the "unknown" text to be verified.
is_problem = raw_df.work == "Pensees philosophiques"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
1,Pensees philosophiques,2,Diderot II,1,1__Diderot II_Pensees philosophiques
2,Pensees philosophiques,2,Diderot II,2,2__Diderot II_Pensees philosophiques
3,Pensees philosophiques,2,Diderot II,3,3__Diderot II_Pensees philosophiques
4,Pensees philosophiques,2,Diderot II,4,4__Diderot II_Pensees philosophiques
5,Pensees philosophiques,2,Diderot II,5,5__Diderot II_Pensees philosophiques
6,Pensees philosophiques,2,Diderot II,6,6__Diderot II_Pensees philosophiques
7,Pensees philosophiques,2,Diderot II,7,7__Diderot II_Pensees philosophiques
8,Pensees philosophiques,2,Diderot II,8,8__Diderot II_Pensees philosophiques
9,Pensees philosophiques,2,Diderot II,9,9__Diderot II_Pensees philosophiques


In [74]:
# Every chunk that does not belong to the held-out work forms the reference corpus.
is_reference = raw_df.work != "Pensees philosophiques"

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
...,...,...,...,...,...
391,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
392,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
393,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
394,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


#### Scaling

In [75]:
# Fit the scaler on the reference corpus only; with_mean=False disables centring.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X);  # trailing ';' keeps the cell output empty, as before

In [76]:
# Apply the scaling learned on the reference corpus to both sets of chunks.
rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

#### Verification

In [77]:
# Seeded generator handed to the verifier as its random_state.
rng = np.random.default_rng(seed=42)

In [78]:
# Verifier settings, spelled out one per line. The same `bdi_mm` instance is
# re-fitted for each experiment further down.
bdi_settings = dict(
    metric="minmax",
    nb_bootstrap_iter=1000,  # number of bootstrap iterations
    rnd_prop=0.35,  # presumably the share of features sampled per iteration -- confirm
    random_state=rng,
)
bdi_mm = BDIVerifier(**bdi_settings)

In [15]:
# Interactive look-up of predict_proba's signature and docstring (printed below).
help(BDIVerifier.predict_proba)

Help on function predict_proba in module ruzicka.BDIVerifier:

predict_proba(self, test_X: Collection[Collection[float]], test_y: Collection[int], nb_imposters: int = 30) -> numpy.ndarray[typing.Any, numpy.dtype[numpy.float64]]
    Given a `test_vector` and an integer representing a target authors
    (`target_int`), we retrieve the distance to the nearest document in the
    training data, which is NOT authored by the target author. In the
    distance calculation, we only take into account the feature values
    specified in `rnd_feature_idxs` (if the latter parameter is specified);
    else, we use the entire feature space. Note that we each time sample a
    random number of imposters from the available training documents, the
    number of which is specified by `nb_imposters`.
    
    We apply the normal verification method, using self.nb_bootstrap_iter
    iterations. In this case, the returned probabilities represent the
    proportions of bootstraps in which the target_author 

In [79]:
# Train on the scaled reference chunks together with their integer author codes.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

12/16/2024 06:09:31 [ruzicka:INFO] Fitting on 396 documents...


In [80]:
# Author names in code order (position == integer code from factorize()).
label_uniques.to_numpy()

array(['Deleyre', 'dHolbach', 'Diderot II', 'Diderot', 'Jussieu',
       'Raynal'], dtype=object)

In [81]:
# Verify the held-out chunks against every candidate author in turn.
# An author's position in `label_uniques` is the integer code assigned by factorize().
n_chunks = problems_scaled_X.shape[0]
for code, label in enumerate(label_uniques):
    print(f"Testing against {label}")
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_chunks)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

12/16/2024 06:09:36 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


12/16/2024 06:09:37 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.361 0.284 0.327 0.208 0.51  0.475 0.493 0.565 0.307 0.307]
Testing against dHolbach


12/16/2024 06:09:39 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.586 0.595 0.229 0.348 0.34  0.771 0.466 0.334 0.742 0.16 ]
Testing against Diderot II


12/16/2024 06:09:40 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.153 0.384 0.302 0.071 0.093 0.035 0.149 0.181 0.13  0.036]
Testing against Diderot


12/16/2024 06:09:41 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.989 0.98  0.99  0.997 0.942 0.936 0.963 0.9   0.983 0.995]
Testing against Jussieu


12/16/2024 06:09:42 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.048 0.014 0.032 0.074 0.017 0.224 0.16  0.066 0.098 0.361]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.276 0.532 0.689 0.536 0.862 0.588 0.692 0.889 0.433 0.712]


Re-run against Diderot, Diderot-II, & Raynal

In [82]:
# Repeat the Diderot comparison on its own. The values differ slightly from the
# loop above, presumably because the shared `rng` has advanced in the meantime.
diderot_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [diderot_code] * problems_scaled_X.shape[0])

12/16/2024 06:10:01 [ruzicka:INFO] Predicting on 10 documents


array([0.989, 0.981, 0.99 , 0.994, 0.943, 0.927, 0.974, 0.904, 0.974,
       0.993])

In [83]:
# One column per held-out chunk, filled from the verifier's `_dist_arrays`
# (presumably populated by the most recent predict_proba call -- here the Diderot run).
pens_vs_ddrt = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('0_tests_diderot-ii/1_rf_pens_vs_ddrt.csv')

pens_vs_ddrt

Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,0.176031,0.079833,0.086781,0.142825,0.200783,0.082163,0.161605,0.014793,0.012429,0.179710
1,0.144676,0.142688,0.037926,0.162325,0.138075,0.066415,0.026383,0.064616,0.110330,0.115859
2,0.172491,0.114388,0.060181,0.168798,0.148040,0.103734,0.145098,0.094631,0.127997,0.130568
3,0.083917,0.136234,0.084063,0.171127,0.055082,0.090331,0.084955,0.089232,0.123835,0.115887
4,0.055789,0.116885,0.156248,0.137825,0.106549,0.040542,0.074515,-0.028164,0.081764,0.093374
...,...,...,...,...,...,...,...,...,...,...
995,0.220170,0.148171,0.175416,0.152768,-0.030398,-0.007831,0.101416,-0.027291,0.087006,0.106876
996,0.151474,0.106403,0.157370,0.141459,0.103429,0.087425,0.044055,0.054125,0.071923,0.038623
997,0.100969,0.087014,0.151327,0.061916,0.148697,0.061370,0.113449,0.095701,0.104517,0.194715
998,0.036419,0.190087,0.088829,0.205393,0.088178,0.078657,0.041391,0.008547,0.080873,0.194041


In [84]:
# Diderot II: run the verifier, then save the per-chunk values it stored.
diderot_ii_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [diderot_ii_code] * problems_scaled_X.shape[0])

pens_vs_ddrtii = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('0_tests_diderot-ii/1_rf_pens_vs_ddrt-ii.csv')

pens_vs_ddrtii

12/16/2024 06:10:10 [ruzicka:INFO] Predicting on 10 documents


Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,-0.041778,0.044318,-0.019865,-0.127726,-0.130775,-0.171792,-0.029117,0.096569,-0.021271,-0.060670
1,-0.094351,-0.070336,0.077645,-0.104109,-0.071910,-0.018171,-0.095978,-0.023928,-0.113856,0.004995
2,-0.061279,-0.034494,-0.078740,-0.147127,-0.116398,-0.136184,0.014925,0.028433,-0.017653,-0.177398
3,-0.012140,-0.018637,-0.139486,-0.057188,0.025382,-0.159447,-0.027808,0.054001,-0.088843,-0.111701
4,-0.002145,0.102275,0.028700,-0.023034,-0.083488,-0.121033,-0.084264,-0.119560,-0.164960,-0.106882
...,...,...,...,...,...,...,...,...,...,...
995,-0.102357,-0.050527,0.010517,-0.147588,-0.072266,-0.189369,0.009624,-0.076600,-0.040118,-0.060256
996,-0.055284,-0.023909,-0.103488,-0.139069,-0.104428,-0.049809,-0.175323,-0.034718,-0.029439,-0.143130
997,-0.138326,-0.052331,-0.114594,-0.079291,-0.048654,-0.068808,-0.171575,0.047515,-0.100862,-0.085131
998,-0.119213,0.004042,-0.071628,-0.108847,-0.098582,-0.054363,-0.084060,-0.152792,-0.028111,-0.072903


In [85]:
# Raynal: run the verifier, then save the per-chunk values it stored.
raynal_code = label_uniques.get_loc("Raynal")
bdi_mm.predict_proba(problems_scaled_X, [raynal_code] * problems_scaled_X.shape[0])

pens_vs_ray = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ray.to_csv('0_tests_diderot-ii/1_rf_pens_vs_ray.csv')

pens_vs_ray

12/16/2024 06:10:14 [ruzicka:INFO] Predicting on 10 documents


Unnamed: 0,0__Diderot II_Pensees philosophiques,1__Diderot II_Pensees philosophiques,2__Diderot II_Pensees philosophiques,3__Diderot II_Pensees philosophiques,4__Diderot II_Pensees philosophiques,5__Diderot II_Pensees philosophiques,6__Diderot II_Pensees philosophiques,7__Diderot II_Pensees philosophiques,8__Diderot II_Pensees philosophiques,9__Diderot II_Pensees philosophiques
0,-0.060965,-0.011993,0.026379,0.034009,-0.007243,0.038565,0.003625,0.135019,-0.009113,-0.100364
1,-0.222394,0.016143,0.000671,0.040941,0.067643,-0.001233,-0.019654,-0.025519,-0.012617,0.122966
2,0.015662,-0.009421,-0.022081,0.026732,-0.043238,0.033862,-0.006342,0.075511,0.102785,0.065971
3,-0.079522,0.008922,0.112741,0.073354,0.102524,-0.072085,0.116141,-0.041567,0.072767,0.006515
4,-0.018775,0.001531,-0.071741,-0.107470,0.021468,0.030279,0.051794,0.126718,0.056371,0.008925
...,...,...,...,...,...,...,...,...,...,...
995,-0.090760,-0.049140,0.106019,0.048090,0.047755,0.033843,0.083621,0.079109,-0.005029,-0.014032
996,-0.068066,0.039911,0.100223,0.012408,0.103569,-0.044462,0.047800,0.064336,0.002585,0.027972
997,-0.051842,-0.083497,0.018901,-0.044573,-0.009085,0.134867,0.154423,-0.061107,-0.037383,0.012058
998,-0.032391,0.064401,0.009539,-0.052776,-0.013927,-0.048836,0.049136,0.051434,0.025962,0.107082


## L2 norm

Same tests, but with a different frequency normalisation (l2 norm instead of relative frequencies).

In [110]:
# Load the pre-processed l2-normalised table.
# (The relative-frequency variant is '0_tests_diderot-ii/test_diderotII_rfreq.csv'.)
corpus = (
    pd.read_csv('0_tests_diderot-ii/test_diderotII_l2norm.csv')  # l2 norm
    .iloc[:, 1:]  # drop the first column (it should not have been saved with the file)
    .drop('author_label', axis=1)  # the author codes are computed again later on
)

corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,l,la,des,et,...,ci,avant,cependant,seul,moyens,force,yeux,choses,ici,art
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie,0.048316,0.045141,0.120999,0.027249,0.018934,0.033321,...,0.065653,0.072169,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education,0.038250,0.067712,0.053777,0.040874,0.054434,0.053684,...,0.131306,0.216506,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.095346,0.0
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie,0.058382,0.033856,0.033611,0.056201,0.082835,0.057386,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.130189,0.000000,0.076923,0.000000,0.0
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3,0.032211,0.094044,0.067222,0.068123,0.092302,0.055535,...,0.065653,0.000000,0.074329,0.000000,0.000000,0.065094,0.118888,0.000000,0.000000,0.0
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,0.044290,0.052665,0.058259,0.054498,0.021300,0.029619,...,0.065653,0.000000,0.148659,0.000000,0.000000,0.000000,0.118888,0.076923,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie,0.064421,0.039498,0.069463,0.045983,0.037867,0.040726,...,0.000000,0.072169,0.074329,0.208514,0.000000,0.000000,0.000000,0.000000,0.095346,0.0
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4,0.044290,0.050784,0.044815,0.057904,0.044967,0.038874,...,0.065653,0.000000,0.000000,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,0.049323,0.037618,0.031370,0.034061,0.056801,0.044428,...,0.000000,0.000000,0.148659,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756,0.040263,0.024451,0.024648,0.025546,0.016567,0.042577,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.000000,0.118888,0.000000,0.000000,0.0


In [111]:
# Positional split: the first four columns are metadata,
# every column after them is a word-frequency feature.
raw_df, X = corpus.iloc[:, :4], corpus.iloc[:, 4:]

raw_df

Unnamed: 0,work,author,chunk_num,tag
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [112]:
# Encode each author name as an integer code; `label_uniques` maps a code
# (its position) back to the author name.
labels, label_uniques = raw_df.author.factorize()

# Start from a frame without any earlier `author_label` column, so that
# re-running this cell does not fail with
# "cannot insert author_label, already exists".
raw_df = raw_df.drop(columns="author_label", errors="ignore")
raw_df.insert(1, "author_label", labels)
raw_df

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
401,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
402,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
403,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
404,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [113]:
# Hold out one work as the "unknown" text to be verified.
is_problem = raw_df.work == "Pensees philosophiques"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
1,Pensees philosophiques,2,Diderot II,1,1__Diderot II_Pensees philosophiques
2,Pensees philosophiques,2,Diderot II,2,2__Diderot II_Pensees philosophiques
3,Pensees philosophiques,2,Diderot II,3,3__Diderot II_Pensees philosophiques
4,Pensees philosophiques,2,Diderot II,4,4__Diderot II_Pensees philosophiques
5,Pensees philosophiques,2,Diderot II,5,5__Diderot II_Pensees philosophiques
6,Pensees philosophiques,2,Diderot II,6,6__Diderot II_Pensees philosophiques
7,Pensees philosophiques,2,Diderot II,7,7__Diderot II_Pensees philosophiques
8,Pensees philosophiques,2,Diderot II,8,8__Diderot II_Pensees philosophiques
9,Pensees philosophiques,2,Diderot II,9,9__Diderot II_Pensees philosophiques


In [114]:
# Every chunk that does not belong to the held-out work forms the reference corpus.
is_reference = raw_df.work != "Pensees philosophiques"

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
...,...,...,...,...,...
391,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
392,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
393,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
394,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [115]:
# Scaling: fit on the reference corpus only (no centring), then apply to both sets.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

In [116]:
# Re-fit the verifier on the l2-normalised reference chunks.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# Verify the held-out chunks against every candidate author in turn.
# An author's position in `label_uniques` is the integer code assigned by factorize().
n_chunks = problems_scaled_X.shape[0]
for code, label in enumerate(label_uniques):
    print(f"Testing against {label}")
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_chunks)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

12/16/2024 06:47:41 [ruzicka:INFO] Fitting on 396 documents...
12/16/2024 06:47:41 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


12/16/2024 06:47:42 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.39  0.277 0.307 0.227 0.51  0.488 0.542 0.571 0.293 0.288]
Testing against dHolbach


12/16/2024 06:47:44 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.603 0.606 0.23  0.373 0.37  0.773 0.42  0.327 0.752 0.153]
Testing against Diderot II


12/16/2024 06:47:45 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.174 0.377 0.264 0.072 0.075 0.032 0.153 0.185 0.136 0.035]
Testing against Diderot


12/16/2024 06:47:46 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.989 0.969 0.99  0.996 0.94  0.93  0.978 0.89  0.983 0.995]
Testing against Jussieu


12/16/2024 06:47:47 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.05  0.016 0.054 0.075 0.018 0.222 0.141 0.055 0.106 0.326]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.26  0.523 0.706 0.532 0.86  0.566 0.671 0.891 0.442 0.717]


Checking results for the same 3 authors

In [117]:
# Diderot: run the verifier, then save the per-chunk values it stored.
diderot_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [diderot_code] * problems_scaled_X.shape[0])

pens_vs_ddrt = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('0_tests_diderot-ii/2_l2_pens_vs_ddrt.csv')

12/16/2024 06:48:35 [ruzicka:INFO] Predicting on 10 documents


In [118]:
# Diderot II: run the verifier, then save the per-chunk values it stored.
diderot_ii_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [diderot_ii_code] * problems_scaled_X.shape[0])

pens_vs_ddrtii = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('0_tests_diderot-ii/2_l2_pens_vs_ddrt-ii.csv')

12/16/2024 06:48:38 [ruzicka:INFO] Predicting on 10 documents


In [119]:
def _export_distances(author, csv_path):
    """Verify the held-out chunks against `author` and save the stored values.

    Runs predict_proba with `author`'s code for every chunk, builds a frame with
    one column per chunk tag from the verifier's `_dist_arrays`, writes it to
    `csv_path`, and returns the frame.
    """
    n_chunks = problems_scaled_X.shape[0]
    bdi_mm.predict_proba(problems_scaled_X, [label_uniques.get_loc(author)] * n_chunks)
    frame = pd.DataFrame(dict(zip(problems.tag, bdi_mm._dist_arrays)))
    frame.to_csv(csv_path)
    return frame


# Raynal
pens_vs_ray = _export_distances("Raynal", '0_tests_diderot-ii/2_l2_pens_vs_ray.csv')

# & Jussieu
pens_vs_j = _export_distances("Jussieu", '0_tests_diderot-ii/2_l2_pens_vs_j.csv')

12/16/2024 06:48:40 [ruzicka:INFO] Predicting on 10 documents
12/16/2024 06:48:41 [ruzicka:INFO] Predicting on 10 documents


### Test *Salon 1781*

In [97]:
# Hold out Salon 1781 as the "unknown" work.
# NOTE(review): `raw_df` and `X` come from whichever corpus was loaded last. The CSVs
# written later in this section are named `3_l2_*`, so the l2-normalised corpus is
# presumably intended; the saved execution counts (97-109 here vs. 110 for the l2
# load cell) suggest a different run order -- confirm with Restart & Run All.
is_problem = raw_df.work == "Salon 1781"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
1,Salon 1781,2,Diderot II,1,1__Diderot II_Salon 1781
2,Salon 1781,2,Diderot II,2,2__Diderot II_Salon 1781
3,Salon 1781,2,Diderot II,3,3__Diderot II_Salon 1781
4,Salon 1781,2,Diderot II,4,4__Diderot II_Salon 1781
5,Salon 1781,2,Diderot II,5,5__Diderot II_Salon 1781
6,Salon 1781,2,Diderot II,6,6__Diderot II_Salon 1781
7,Salon 1781,2,Diderot II,7,7__Diderot II_Salon 1781
8,Salon 1781,2,Diderot II,8,8__Diderot II_Salon 1781
9,Salon 1781,2,Diderot II,9,9__Diderot II_Salon 1781


In [98]:
# Every chunk that does not belong to the held-out work forms the reference corpus.
is_reference = raw_df.work != "Salon 1781"

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
391,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
392,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
393,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
394,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [99]:
# Scaling: fit on the reference corpus only (no centring), then apply to both sets.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

In [100]:
# Re-fit the verifier on the reference chunks for the Salon 1781 experiment.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# Verify the held-out chunks against every candidate author in turn.
# An author's position in `label_uniques` is the integer code assigned by factorize().
n_chunks = problems_scaled_X.shape[0]
for code, label in enumerate(label_uniques):
    print(f"Testing against {label}")
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_chunks)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

12/16/2024 06:29:05 [ruzicka:INFO] Fitting on 396 documents...
12/16/2024 06:29:05 [ruzicka:INFO] Predicting on 10 documents


Testing against Deleyre


12/16/2024 06:29:07 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.48  0.612 0.533 0.589 0.539 0.57  0.356 0.505 0.568 0.515]
Testing against dHolbach


12/16/2024 06:29:08 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.27  0.357 0.419 0.615 0.585 0.441 0.685 0.486 0.279 0.577]
Testing against Diderot II


12/16/2024 06:29:09 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.518 0.266 0.529 0.216 0.189 0.298 0.179 0.064 0.46  0.069]
Testing against Diderot


12/16/2024 06:29:10 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.909 0.835 0.884 0.648 0.835 0.888 0.845 0.8   0.906 0.635]
Testing against Jussieu


12/16/2024 06:29:11 [ruzicka:INFO] Predicting on 10 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.642 0.569 0.476 0.759 0.705 0.636 0.755 0.771 0.547 0.813]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.784 0.833 0.73  0.766 0.725 0.764 0.643 0.842 0.805 0.828]


In [104]:
# Diderot: run the verifier, then save the per-chunk values it stored.
# NOTE(review): the frame keeps the `pens_` prefix from the Pensees cells although it
# now holds Salon 1781 values, and it overwrites the earlier `pens_vs_ddrt`.
diderot_code = label_uniques.get_loc("Diderot")
bdi_mm.predict_proba(problems_scaled_X, [diderot_code] * problems_scaled_X.shape[0])

pens_vs_ddrt = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrt.to_csv('0_tests_diderot-ii/3_l2_salon1781_vs_ddrt.csv')

12/16/2024 06:33:13 [ruzicka:INFO] Predicting on 10 documents


In [105]:
# Diderot II: run the verifier, then save the per-chunk values it stored.
diderot_ii_code = label_uniques.get_loc("Diderot II")
bdi_mm.predict_proba(problems_scaled_X, [diderot_ii_code] * problems_scaled_X.shape[0])

pens_vs_ddrtii = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_ddrtii.to_csv('0_tests_diderot-ii/3_l2_salon1781_vs_ddrt-ii.csv')

12/16/2024 06:33:15 [ruzicka:INFO] Predicting on 10 documents


In [108]:
# Raynal: run the verifier, then save the per-chunk values it stored.
raynal_code = label_uniques.get_loc("Raynal")
bdi_mm.predict_proba(problems_scaled_X, [raynal_code] * problems_scaled_X.shape[0])

pens_vs_r = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_r.to_csv('0_tests_diderot-ii/3_l2_salon1781_vs_ray.csv')

12/16/2024 06:36:14 [ruzicka:INFO] Predicting on 10 documents


In [109]:
# Jussieu: run the verifier, then save the per-chunk values it stored.
jussieu_code = label_uniques.get_loc("Jussieu")
bdi_mm.predict_proba(problems_scaled_X, [jussieu_code] * problems_scaled_X.shape[0])

pens_vs_j = pd.DataFrame(
    {tag: values for tag, values in zip(problems.tag, bdi_mm._dist_arrays)}
)
pens_vs_j.to_csv('0_tests_diderot-ii/3_l2_salon1781_vs_j.csv')

12/16/2024 06:36:27 [ruzicka:INFO] Predicting on 10 documents


### Test Deleyre *Idees sur l-education*
(for no particular reason; it is simply another small work, 16 chunks)

In [157]:
# Display the corpus currently in memory before holding out another work.
corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,l,la,des,et,...,ci,avant,cependant,seul,moyens,force,yeux,choses,ici,art
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie,0.048316,0.045141,0.120999,0.027249,0.018934,0.033321,...,0.065653,0.072169,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education,0.038250,0.067712,0.053777,0.040874,0.054434,0.053684,...,0.131306,0.216506,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.095346,0.0
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie,0.058382,0.033856,0.033611,0.056201,0.082835,0.057386,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.130189,0.000000,0.076923,0.000000,0.0
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3,0.032211,0.094044,0.067222,0.068123,0.092302,0.055535,...,0.065653,0.000000,0.074329,0.000000,0.000000,0.065094,0.118888,0.000000,0.000000,0.0
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,0.044290,0.052665,0.058259,0.054498,0.021300,0.029619,...,0.065653,0.000000,0.148659,0.000000,0.000000,0.000000,0.118888,0.076923,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie,0.064421,0.039498,0.069463,0.045983,0.037867,0.040726,...,0.000000,0.072169,0.074329,0.208514,0.000000,0.000000,0.000000,0.000000,0.095346,0.0
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4,0.044290,0.050784,0.044815,0.057904,0.044967,0.038874,...,0.065653,0.000000,0.000000,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,0.049323,0.037618,0.031370,0.034061,0.056801,0.044428,...,0.000000,0.000000,0.148659,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756,0.040263,0.024451,0.024648,0.025546,0.016567,0.042577,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.000000,0.118888,0.000000,0.000000,0.0


In [158]:
# Hold out one work as the "unknown" text to be verified.
is_problem = raw_df.work == "Idees sur l-education"

problems = raw_df.loc[is_problem].reset_index(drop=True).copy()
problems_X = X.loc[is_problem].reset_index(drop=True).copy()
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
1,Idees sur l-education,0,Deleyre,1,1__Deleyre_Idees sur l-education
2,Idees sur l-education,0,Deleyre,10,10__Deleyre_Idees sur l-education
3,Idees sur l-education,0,Deleyre,11,11__Deleyre_Idees sur l-education
4,Idees sur l-education,0,Deleyre,12,12__Deleyre_Idees sur l-education
5,Idees sur l-education,0,Deleyre,13,13__Deleyre_Idees sur l-education
6,Idees sur l-education,0,Deleyre,14,14__Deleyre_Idees sur l-education
7,Idees sur l-education,0,Deleyre,15,15__Deleyre_Idees sur l-education
8,Idees sur l-education,0,Deleyre,2,2__Deleyre_Idees sur l-education
9,Idees sur l-education,0,Deleyre,3,3__Deleyre_Idees sur l-education


In [159]:
# Every chunk that does not belong to the held-out work forms the reference corpus.
is_reference = raw_df.work != "Idees sur l-education"

rest = raw_df.loc[is_reference].reset_index(drop=True).copy()
rest_X = X.loc[is_reference].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
2,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
3,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
4,Salon 1781,2,Diderot II,0,0__Diderot II_Salon 1781
...,...,...,...,...,...
385,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
386,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
387,Notice Historique 1-6,4,Jussieu,9,9__Jussieu_Notice Historique 1-6
388,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [160]:
# Scaling: fit on the reference corpus only (no centring), then apply to both sets.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

In [161]:
# Re-fit the verifier on the reference chunks for the Deleyre experiment.
bdi_mm.fit(rest_scaled_X, rest["author_label"])

# Verify the held-out chunks against every candidate author in turn.
# An author's position in `label_uniques` is the integer code assigned by factorize().
n_chunks = problems_scaled_X.shape[0]
for code, label in enumerate(label_uniques):
    print(f"Testing against {label}")
    strengths = bdi_mm.predict_proba(problems_scaled_X, [code] * n_chunks)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {strengths}")

12/16/2024 07:22:32 [ruzicka:INFO] Fitting on 390 documents...
12/16/2024 07:22:32 [ruzicka:INFO] Predicting on 16 documents


Testing against Deleyre


12/16/2024 07:22:34 [ruzicka:INFO] Predicting on 16 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.697 0.615 0.652 0.785 0.593 0.746 0.76  0.556 0.502 0.508 0.605 0.316
 0.582 0.856 0.438 0.935]
Testing against dHolbach


12/16/2024 07:22:36 [ruzicka:INFO] Predicting on 16 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.825 0.816 0.925 0.476 0.675 0.79  0.748 0.892 0.952 0.631 0.772 0.927
 0.96  0.674 0.699 0.63 ]
Testing against Diderot II


12/16/2024 07:22:37 [ruzicka:INFO] Predicting on 16 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.113 0.161 0.092 0.033 0.038 0.146 0.098 0.044 0.116 0.055 0.053 0.076
 0.05  0.145 0.062 0.121]
Testing against Diderot


12/16/2024 07:22:39 [ruzicka:INFO] Predicting on 16 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.904 0.543 0.325 0.35  0.236 0.389 0.315 0.634 0.574 0.221 0.415 0.699
 0.61  0.624 0.358 0.425]
Testing against Jussieu


12/16/2024 07:22:41 [ruzicka:INFO] Predicting on 16 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.61  0.275 0.579 0.912 0.865 0.55  0.709 0.571 0.141 0.933 0.825 0.682
 0.302 0.747 0.94  0.575]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.444 0.832 0.589 0.664 0.831 0.833 0.779 0.647 0.571 0.782 0.748 0.518
 0.372 0.571 0.67  0.531]


Results against best authors

In [162]:
# Candidate authors for which the verifier's per-chunk arrays are exported.
authors = ["Deleyre", "dHolbach", "Diderot", "Raynal"]

for a in authors:

    print(a)

    # The returned probabilities are intentionally discarded: the call is made
    # for its side effect on the verifier, whose private `_dist_arrays`
    # attribute is read right after.
    # NOTE(review): presumably `predict_proba` rebuilds `_dist_arrays` on every
    # call -- confirm against the ruzicka implementation.
    bdi_mm.predict_proba(
        problems_scaled_X, [label_uniques.get_loc(a)] * problems_scaled_X.shape[0]
    )

    # `zip` stops silently at the shorter input, which would drop chunks from
    # the export without warning; fail loudly instead.
    # Assumes one array per held-out chunk, in input order -- TODO confirm.
    dist_arrays = bdi_mm._dist_arrays
    if len(dist_arrays) != len(problems):
        raise ValueError(
            f"expected {len(problems)} distance arrays (one per chunk), "
            f"got {len(dist_arrays)}"
        )

    # One column per chunk, named after the chunk's tag.
    x = pd.DataFrame(dict(zip(problems.tag, dist_arrays)))

    fh = '0_tests_diderot-ii/4_l2_deleyre_vs_' + a + '.csv'

    # The DataFrame index is written as the first CSV column (pandas default).
    x.to_csv(fh)

12/16/2024 07:22:43 [ruzicka:INFO] Predicting on 16 documents


Deleyre


12/16/2024 07:22:45 [ruzicka:INFO] Predicting on 16 documents


dHolbach


12/16/2024 07:22:48 [ruzicka:INFO] Predicting on 16 documents


Diderot


12/16/2024 07:22:49 [ruzicka:INFO] Predicting on 16 documents


Raynal


### Test Jussieu

In [163]:
# Display the corpus table (chunk metadata columns followed by one column per word).
# NOTE(review): the values rendered here differ from the relative frequencies shown
# when the CSV was first loaded, so `corpus` appears to have been reloaded or
# renormalised in between -- presumably from the l2-normalised CSV; confirm.
corpus

Unnamed: 0,work,author,chunk_num,tag,de,les,l,la,des,et,...,ci,avant,cependant,seul,moyens,force,yeux,choses,ici,art
0,Essai sur la vie,Deleyre,0,0__Deleyre_Essai sur la vie,0.048316,0.045141,0.120999,0.027249,0.018934,0.033321,...,0.065653,0.072169,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,Idees sur l-education,Deleyre,0,0__Deleyre_Idees sur l-education,0.038250,0.067712,0.053777,0.040874,0.054434,0.053684,...,0.131306,0.216506,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.095346,0.0
2,ethocratie,dHolbach,0,0__dHolbach_ethocratie,0.058382,0.033856,0.033611,0.056201,0.082835,0.057386,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.130189,0.000000,0.076923,0.000000,0.0
3,Système Social 3,dHolbach,0,0__dHolbach_Système Social 3,0.032211,0.094044,0.067222,0.068123,0.092302,0.055535,...,0.065653,0.000000,0.074329,0.000000,0.000000,0.065094,0.118888,0.000000,0.000000,0.0
4,Pensees philosophiques,Diderot II,0,0__Diderot II_Pensees philosophiques,0.044290,0.052665,0.058259,0.054498,0.021300,0.029619,...,0.065653,0.000000,0.148659,0.000000,0.000000,0.000000,0.118888,0.076923,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Suite de l-apologie,Diderot,9,9__Diderot_Suite de l-apologie,0.064421,0.039498,0.069463,0.045983,0.037867,0.040726,...,0.000000,0.072169,0.074329,0.208514,0.000000,0.000000,0.000000,0.000000,0.095346,0.0
402,HDI T1-4,Jussieu,9,9__Jussieu_HDI T1-4,0.044290,0.050784,0.044815,0.057904,0.044967,0.038874,...,0.065653,0.000000,0.000000,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
403,Notice Historique 1-6,Jussieu,9,9__Jussieu_Notice Historique 1-6,0.049323,0.037618,0.031370,0.034061,0.056801,0.044428,...,0.000000,0.000000,0.148659,0.000000,0.085126,0.065094,0.000000,0.000000,0.000000,0.0
404,Anecdotes litteraires3 1756,Raynal,9,9__Raynal_Anecdotes litteraires3 1756,0.040263,0.024451,0.024648,0.025546,0.016567,0.042577,...,0.065653,0.000000,0.000000,0.000000,0.000000,0.000000,0.118888,0.000000,0.000000,0.0


In [164]:
# Select an "unknown" work: build the row mask for the held-out title once and
# apply it to both the metadata frame and the feature frame.
# (X is assumed to share raw_df's row index, as the original indexing implies.)
held_out_mask = raw_df.work == "Notice Historique 1-6"

# Metadata rows of the held-out work, renumbered from 0.
problems = raw_df[held_out_mask].reset_index(drop=True).copy()
# Matching feature rows, renumbered the same way.
problems_X = X[held_out_mask].reset_index(drop=True).copy()
problems

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Notice Historique 1-6,4,Jussieu,0,0__Jussieu_Notice Historique 1-6
1,Notice Historique 1-6,4,Jussieu,1,1__Jussieu_Notice Historique 1-6
2,Notice Historique 1-6,4,Jussieu,10,10__Jussieu_Notice Historique 1-6
3,Notice Historique 1-6,4,Jussieu,11,11__Jussieu_Notice Historique 1-6
4,Notice Historique 1-6,4,Jussieu,12,12__Jussieu_Notice Historique 1-6
5,Notice Historique 1-6,4,Jussieu,13,13__Jussieu_Notice Historique 1-6
6,Notice Historique 1-6,4,Jussieu,14,14__Jussieu_Notice Historique 1-6
7,Notice Historique 1-6,4,Jussieu,15,15__Jussieu_Notice Historique 1-6
8,Notice Historique 1-6,4,Jussieu,16,16__Jussieu_Notice Historique 1-6
9,Notice Historique 1-6,4,Jussieu,17,17__Jussieu_Notice Historique 1-6


In [165]:
# Reference corpus: every chunk whose work is not the held-out title.
# A single mask drives both selections
# (X is assumed to share raw_df's row index, as the original indexing implies).
reference_mask = raw_df.work != "Notice Historique 1-6"

rest = raw_df[reference_mask].reset_index(drop=True).copy()
rest_X = X[reference_mask].reset_index(drop=True).copy()
rest

Unnamed: 0,work,author_label,author,chunk_num,tag
0,Essai sur la vie,0,Deleyre,0,0__Deleyre_Essai sur la vie
1,Idees sur l-education,0,Deleyre,0,0__Deleyre_Idees sur l-education
2,ethocratie,1,dHolbach,0,0__dHolbach_ethocratie
3,Système Social 3,1,dHolbach,0,0__dHolbach_Système Social 3
4,Pensees philosophiques,2,Diderot II,0,0__Diderot II_Pensees philosophiques
...,...,...,...,...,...
375,Principes de la Philosophie morale,3,Diderot,9,9__Diderot_Principes de la Philosophie morale
376,Suite de l-apologie,3,Diderot,9,9__Diderot_Suite de l-apologie
377,HDI T1-4,4,Jussieu,9,9__Jussieu_HDI T1-4
378,Anecdotes litteraires3 1756,5,Raynal,9,9__Raynal_Anecdotes litteraires3 1756


In [166]:
# Scaling: fit on the reference corpus only, then reuse the fitted scaler for
# the reference rows and the held-out rows alike.
# with_mean=False disables centring, so features are only rescaled.
ss = StandardScaler(with_mean=False)
ss.fit(rest_X)

rest_scaled_X, problems_scaled_X = (
    ss.transform(features) for features in (rest_X, problems_X)
)

In [167]:
# Fit the verifier on the scaled reference corpus and its author codes.
bdi_mm.fit(rest_scaled_X, rest.author_label)

# Score the held-out chunks against each candidate author in turn.
for label in label_uniques.values:
    print(f"Testing against {label}")
    code = label_uniques.get_loc(label)
    # One target code per held-out chunk: every chunk is tested against `label`.
    targets = [code] * problems_scaled_X.shape[0]
    match_strengths = bdi_mm.predict_proba(problems_scaled_X, targets)
    print(f"Bootstrap Match Strength (one per chunk, 0-1.0): {match_strengths}")

12/16/2024 07:22:59 [ruzicka:INFO] Fitting on 380 documents...
12/16/2024 07:22:59 [ruzicka:INFO] Predicting on 26 documents


Testing against Deleyre


12/16/2024 07:23:03 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.571 0.739 0.543 0.468 0.556 0.445 0.64  0.601 0.635 0.723 0.674 0.483
 0.514 0.622 0.504 0.763 0.762 0.7   0.801 0.494 0.451 0.545 0.423 0.386
 0.483 0.607]
Testing against dHolbach


12/16/2024 07:23:07 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.51  0.474 0.197 0.233 0.09  0.266 0.173 0.348 0.379 0.476 0.526 0.07
 0.581 0.374 0.257 0.437 0.209 0.517 0.583 0.267 0.25  0.214 0.282 0.14
 0.256 0.462]
Testing against Diderot II


12/16/2024 07:23:09 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.141 0.111 0.132 0.01  0.087 0.124 0.055 0.026 0.032 0.194 0.138 0.142
 0.217 0.039 0.038 0.598 0.058 0.238 0.04  0.072 0.045 0.026 0.062 0.017
 0.025 0.026]
Testing against Diderot


12/16/2024 07:23:12 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.35  0.655 0.107 0.16  0.059 0.063 0.134 0.18  0.201 0.275 0.345 0.159
 0.772 0.103 0.124 0.296 0.065 0.2   0.322 0.218 0.347 0.15  0.08  0.122
 0.214 0.137]
Testing against Jussieu


12/16/2024 07:23:14 [ruzicka:INFO] Predicting on 26 documents


Bootstrap Match Strength (one per chunk, 0-1.0): [0.955 0.772 0.632 0.678 0.623 0.751 0.491 0.897 0.912 0.867 0.85  0.879
 0.793 0.975 0.86  0.857 0.778 0.825 0.801 0.709 0.513 0.81  0.787 0.606
 0.844 0.555]
Testing against Raynal
Bootstrap Match Strength (one per chunk, 0-1.0): [0.778 0.827 0.972 0.988 0.979 0.98  0.982 0.891 0.851 0.824 0.846 0.936
 0.662 0.817 0.943 0.608 0.865 0.833 0.709 0.986 0.987 0.948 0.956 0.99
 0.936 0.983]


In [169]:
# Export the verifier's per-chunk arrays for every known author label.
authors = label_uniques.values

for a in authors:

    print(a)

    # The returned probabilities are intentionally discarded: the call is made
    # for its side effect on the verifier, whose private `_dist_arrays`
    # attribute is read right after.
    # NOTE(review): presumably `predict_proba` rebuilds `_dist_arrays` on every
    # call -- confirm against the ruzicka implementation.
    bdi_mm.predict_proba(
        problems_scaled_X, [label_uniques.get_loc(a)] * problems_scaled_X.shape[0]
    )

    # `zip` stops silently at the shorter input, which would drop chunks from
    # the export without warning; fail loudly instead.
    # Assumes one array per held-out chunk, in input order -- TODO confirm.
    dist_arrays = bdi_mm._dist_arrays
    if len(dist_arrays) != len(problems):
        raise ValueError(
            f"expected {len(problems)} distance arrays (one per chunk), "
            f"got {len(dist_arrays)}"
        )

    # One column per chunk, named after the chunk's tag.
    x = pd.DataFrame(dict(zip(problems.tag, dist_arrays)))

    fh = '0_tests_diderot-ii/5_l2_jussieu_vs_' + a + '.csv'

    # The DataFrame index is written as the first CSV column (pandas default).
    x.to_csv(fh)

12/16/2024 07:24:14 [ruzicka:INFO] Predicting on 26 documents


Deleyre


12/16/2024 07:24:17 [ruzicka:INFO] Predicting on 26 documents


dHolbach


12/16/2024 07:24:21 [ruzicka:INFO] Predicting on 26 documents


Diderot II


12/16/2024 07:24:24 [ruzicka:INFO] Predicting on 26 documents


Diderot


12/16/2024 07:24:27 [ruzicka:INFO] Predicting on 26 documents


Jussieu


12/16/2024 07:24:29 [ruzicka:INFO] Predicting on 26 documents


Raynal
