In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
corpus = "clef_v2" 
metric = "var"

full_corpus_name = {
    "2020": "misinfo-2020",
    "2021": "C4-2021",
    "2022": "C4-2022",
    "CLEF": "CLEF",
    "clef": "CLEF",
    "CLEF_v2": "CLEF_v2",
    "clef_v2": "CLEF_v2"
}

fields = {
    '2020': 'title',
    '2021': 'query',
    '2022': 'query',
    'CLEF': 'title',
    'clef': 'title',
    "CLEF_v2": "title",
    "clef_v2": "title"
}


In [63]:
scores = pd.read_csv(f"pre_qpp/{full_corpus_name[corpus]}_{metric}_{fields[corpus]}.csv", index_col=0)
scores

Unnamed: 0,sigma_1,sigma_2,sigma_3
151,116.101817,19.350303,35.501125
152,49.981538,24.990769,43.825776
153,47.67941,15.893137,23.780222
154,68.992798,22.997599,29.555744
155,62.268474,31.134237,34.403428
156,49.265697,16.421899,39.118725
157,23.072107,11.536054,11.787952
158,24.312253,12.156127,23.75345
159,61.565769,20.521923,40.403816
160,0.0,0.0,0.0


In [64]:
scores.describe()

Unnamed: 0,sigma_1,sigma_2,sigma_3
count,50.0,50.0,50.0
mean,72.678526,21.732731,36.064151
std,46.043639,10.81918,17.416743
min,0.0,0.0,0.0
25%,40.540496,13.813055,23.655463
50%,66.393116,21.095726,39.324614
75%,94.809058,31.153345,48.164995
max,180.678189,46.08402,71.213249


In [65]:
if metric != "var":
    scores = scores.sort_values(by=metric, ascending=False)

In [66]:
df = scores.reset_index(names="topic")
df

Unnamed: 0,topic,sigma_1,sigma_2,sigma_3
0,151,116.101817,19.350303,35.501125
1,152,49.981538,24.990769,43.825776
2,153,47.67941,15.893137,23.780222
3,154,68.992798,22.997599,29.555744
4,155,62.268474,31.134237,34.403428
5,156,49.265697,16.421899,39.118725
6,157,23.072107,11.536054,11.787952
7,158,24.312253,12.156127,23.75345
8,159,61.565769,20.521923,40.403816
9,160,0.0,0.0,0.0


In [67]:
if metric != "var":
    plt.figure(figsize=(8, 5))
    sns.histplot(df[metric], kde=True)
    plt.xlabel(f"{metric} - {corpus}", fontsize=14)
    plt.ylabel("Frecuencia")
    plt.title(f"Suma de scores de controversia - temps - {corpus}", fontsize=18, pad=15)
    plt.grid(True, alpha=0.5)

    plt.show()

In [68]:
ndcg_harm_2020 = pd.read_csv(f'ndcg_harmful_only_results/ndcg_harmful_only_output_bm25_2020.csv')
ndcg_harm_2021 = pd.read_csv(f'ndcg_harmful_only_results/ndcg_harmful_only_output_bm25_2021.csv')
ndcg_harm_2022 = pd.read_csv(f'ndcg_harmful_only_results/ndcg_harmful_only_output_bm25_2022.csv')
ndcg_harm_clef_v2 = pd.read_csv(f'ndcg_harmful_only_results/ndcg_harmful_only_output_bm25_CLEF.csv')

ndcg_harms = {"2020": ndcg_harm_2020, "2021": ndcg_harm_2021, "2022": ndcg_harm_2022, "clef_v2": ndcg_harm_clef_v2}

In [69]:
ndcg_harm_2020

Unnamed: 0,run,topic,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000
0,bm25_2020,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0173,0.0458,0.0724
1,bm25_2020,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bm25_2020,13,0.786,0.5101,0.3954,0.3292,0.253,0.2074,0.213,0.3151,0.4083
3,bm25_2020,14,0.0,0.0,0.0,0.0252,0.0568,0.2152,0.263,0.3256,0.3708
4,bm25_2020,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046,0.046
5,bm25_2020,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,bm25_2020,17,0.1461,0.0948,0.1161,0.1649,0.1735,0.1613,0.218,0.3154,0.3584
7,bm25_2020,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,bm25_2020,19,0.1461,0.1732,0.2279,0.2221,0.2626,0.2855,0.3691,0.3977,0.4039
9,bm25_2020,2,0.0,0.0,0.0,0.0,0.0,0.1968,0.2621,0.3072,0.3393


In [70]:
for k, ndcg_df in ndcg_harms.items():
    ndcg_harms[k] = ndcg_df[ndcg_df["topic"] != "all"]
    

ndcg_harms[corpus]

Unnamed: 0,run,topic,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000
0,bm25_2022,151,0.2246,0.2201,0.2185,0.1919,0.2206,0.3069,0.3069,0.3069,0.414
1,bm25_2022,152,0.0,0.0,0.0269,0.024,0.0593,0.0593,0.0942,0.1356,0.1626
2,bm25_2022,153,1.0,0.7885,0.6589,0.6137,0.5413,0.4066,0.5072,0.6315,0.6831
3,bm25_2022,154,0.0,0.2064,0.3387,0.3502,0.3397,0.1676,0.207,0.3113,0.4619
4,bm25_2022,155,0.214,0.2725,0.2901,0.3468,0.3058,0.4042,0.4897,0.5737,0.5882
5,bm25_2022,156,0.0,0.0,0.0,0.0,0.0,0.0727,0.0727,0.0727,0.0727
6,bm25_2022,157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,bm25_2022,158,0.2382,0.1546,0.1198,0.1795,0.1681,0.1575,0.202,0.2581,0.3348
8,bm25_2022,159,0.3156,0.5192,0.5186,0.5663,0.5766,0.4323,0.4596,0.6348,0.7323
9,bm25_2022,160,0.0,0.0,0.0,0.0,0.0,0.0,0.0374,0.0702,0.1288


In [71]:
if metric != "var":
    ndcg_harms[corpus][f"{metric}"] = df[f"{metric}"]
else:
    for i in range(1, 4):
        ndcg_harms[corpus][f"sigma_{i}"] = df[f"sigma_{i}"]


ndcg_harms[corpus]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndcg_harms[corpus][f"sigma_{i}"] = df[f"sigma_{i}"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndcg_harms[corpus][f"sigma_{i}"] = df[f"sigma_{i}"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndcg_harms[corpus][f"sigma_{i}"] = df[f"sigma_{i}"]


Unnamed: 0,run,topic,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000,sigma_1,sigma_2,sigma_3
0,bm25_2022,151,0.2246,0.2201,0.2185,0.1919,0.2206,0.3069,0.3069,0.3069,0.414,116.101817,19.350303,35.501125
1,bm25_2022,152,0.0,0.0,0.0269,0.024,0.0593,0.0593,0.0942,0.1356,0.1626,49.981538,24.990769,43.825776
2,bm25_2022,153,1.0,0.7885,0.6589,0.6137,0.5413,0.4066,0.5072,0.6315,0.6831,47.67941,15.893137,23.780222
3,bm25_2022,154,0.0,0.2064,0.3387,0.3502,0.3397,0.1676,0.207,0.3113,0.4619,68.992798,22.997599,29.555744
4,bm25_2022,155,0.214,0.2725,0.2901,0.3468,0.3058,0.4042,0.4897,0.5737,0.5882,62.268474,31.134237,34.403428
5,bm25_2022,156,0.0,0.0,0.0,0.0,0.0,0.0727,0.0727,0.0727,0.0727,49.265697,16.421899,39.118725
6,bm25_2022,157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.072107,11.536054,11.787952
7,bm25_2022,158,0.2382,0.1546,0.1198,0.1795,0.1681,0.1575,0.202,0.2581,0.3348,24.312253,12.156127,23.75345
8,bm25_2022,159,0.3156,0.5192,0.5186,0.5663,0.5766,0.4323,0.4596,0.6348,0.7323,61.565769,20.521923,40.403816
9,bm25_2022,160,0.0,0.0,0.0,0.0,0.0,0.0,0.0374,0.0702,0.1288,0.0,0.0,0.0


In [72]:
cols= ndcg_harms[corpus].columns
cols = cols.drop(["run", "topic"])
cols

Index(['ndcg_cut_5', 'ndcg_cut_10', 'ndcg_cut_15', 'ndcg_cut_20',
       'ndcg_cut_30', 'ndcg_cut_100', 'ndcg_cut_200', 'ndcg_cut_500',
       'ndcg_cut_1000', 'sigma_1', 'sigma_2', 'sigma_3'],
      dtype='object')

In [73]:
ndcg_harms[corpus][cols].corr(method="pearson").round(3)

Unnamed: 0,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000,sigma_1,sigma_2,sigma_3
ndcg_cut_5,1.0,0.904,0.823,0.748,0.682,0.639,0.656,0.64,0.635,-0.03,-0.04,-0.103
ndcg_cut_10,0.904,1.0,0.971,0.929,0.891,0.84,0.837,0.818,0.805,0.016,0.004,-0.053
ndcg_cut_15,0.823,0.971,1.0,0.971,0.948,0.86,0.845,0.845,0.845,0.1,0.073,0.017
ndcg_cut_20,0.748,0.929,0.971,1.0,0.98,0.88,0.87,0.871,0.862,0.105,0.067,0.026
ndcg_cut_30,0.682,0.891,0.948,0.98,1.0,0.905,0.878,0.875,0.861,0.171,0.108,0.092
ndcg_cut_100,0.639,0.84,0.86,0.88,0.905,1.0,0.974,0.924,0.882,0.114,0.063,0.06
ndcg_cut_200,0.656,0.837,0.845,0.87,0.878,0.974,1.0,0.968,0.919,0.033,0.031,0.01
ndcg_cut_500,0.64,0.818,0.845,0.871,0.875,0.924,0.968,1.0,0.975,0.073,0.096,0.089
ndcg_cut_1000,0.635,0.805,0.845,0.862,0.861,0.882,0.919,0.975,1.0,0.114,0.137,0.137
sigma_1,-0.03,0.016,0.1,0.105,0.171,0.114,0.033,0.073,0.114,1.0,0.809,0.853


In [74]:
ndcg_harms[corpus][cols].corr(method="kendall").round(3)

Unnamed: 0,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000,sigma_1,sigma_2,sigma_3
ndcg_cut_5,1.0,0.746,0.653,0.598,0.566,0.57,0.553,0.529,0.529,0.019,-0.026,-0.071
ndcg_cut_10,0.746,1.0,0.862,0.777,0.767,0.779,0.735,0.693,0.671,0.044,0.02,-0.04
ndcg_cut_15,0.653,0.862,1.0,0.869,0.838,0.739,0.701,0.674,0.675,0.118,0.086,0.027
ndcg_cut_20,0.598,0.777,0.869,1.0,0.924,0.791,0.766,0.73,0.705,0.133,0.089,0.036
ndcg_cut_30,0.566,0.767,0.838,0.924,1.0,0.811,0.772,0.749,0.712,0.163,0.129,0.098
ndcg_cut_100,0.57,0.779,0.739,0.791,0.811,1.0,0.862,0.792,0.718,0.117,0.077,0.062
ndcg_cut_200,0.553,0.735,0.701,0.766,0.772,0.862,1.0,0.882,0.796,0.105,0.08,0.05
ndcg_cut_500,0.529,0.693,0.674,0.73,0.749,0.792,0.882,1.0,0.89,0.109,0.103,0.079
ndcg_cut_1000,0.529,0.671,0.675,0.705,0.712,0.718,0.796,0.89,1.0,0.131,0.137,0.095
sigma_1,0.019,0.044,0.118,0.133,0.163,0.117,0.105,0.109,0.131,1.0,0.748,0.715


In [75]:
ndcg_harms[corpus][cols].corr(method="spearman").round(3)

Unnamed: 0,ndcg_cut_5,ndcg_cut_10,ndcg_cut_15,ndcg_cut_20,ndcg_cut_30,ndcg_cut_100,ndcg_cut_200,ndcg_cut_500,ndcg_cut_1000,sigma_1,sigma_2,sigma_3
ndcg_cut_5,1.0,0.863,0.788,0.733,0.731,0.73,0.716,0.697,0.695,0.034,-0.021,-0.09
ndcg_cut_10,0.863,1.0,0.951,0.91,0.904,0.915,0.88,0.844,0.817,0.072,0.034,-0.054
ndcg_cut_15,0.788,0.951,1.0,0.952,0.944,0.886,0.853,0.844,0.829,0.187,0.144,0.049
ndcg_cut_20,0.733,0.91,0.952,1.0,0.986,0.92,0.904,0.889,0.859,0.204,0.149,0.057
ndcg_cut_30,0.731,0.904,0.944,0.986,1.0,0.936,0.915,0.908,0.879,0.238,0.202,0.128
ndcg_cut_100,0.73,0.915,0.886,0.92,0.936,1.0,0.97,0.932,0.885,0.168,0.117,0.089
ndcg_cut_200,0.716,0.88,0.853,0.904,0.915,0.97,1.0,0.972,0.937,0.133,0.107,0.064
ndcg_cut_500,0.697,0.844,0.844,0.889,0.908,0.932,0.972,1.0,0.978,0.146,0.133,0.094
ndcg_cut_1000,0.695,0.817,0.829,0.859,0.879,0.885,0.937,0.978,1.0,0.186,0.181,0.132
sigma_1,0.034,0.072,0.187,0.204,0.238,0.168,0.133,0.146,0.186,1.0,0.873,0.872
