In [4]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

## Examples

In [107]:
path_meta = "./tsv_files/gazeta/mean_last_layer_doc21_metadata.tsv"
path_vectors = "./tsv_files/gazeta/mean_last_layer_doc22_vectors.tsv"

In [None]:
vectors = pd.read_csv(path_vectors, sep="\t", header=None).dropna(axis=1)

In [16]:
meta = pd.read_csv(path_meta, sep="\t")

In [24]:
vectors.join(meta).groupby(["Profession", "Name"]).mean().reset_index()

Unnamed: 0,Profession,Name,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,b'politycy',b'Leszek Miller',-0.053149,-0.298815,0.296415,0.104216,0.299093,0.221196,-0.213263,-0.091908,...,0.137453,0.292715,-0.411608,-0.815928,-0.013818,-0.453147,-0.02136,-0.074892,-0.05515,-0.050677


In [93]:
vectors = [
    "./tsv_files/gazeta/mean_last_layer_doc21_vectors.tsv",
    "./tsv_files/gazeta/mean_last_layer_doc22_vectors.tsv"
]

In [94]:
metadatas = [
    "./tsv_files/gazeta/mean_last_layer_doc21_metadata.tsv",
    "./tsv_files/gazeta/mean_last_layer_doc22_metadata.tsv"
]

## Code

In [7]:
def generate_concat_vectors_meta_df(corpus, doc_position=3):
    path_tsv = os.path.join("tsv_files", corpus)
    metadatas = [file for file in os.listdir(path_tsv) if file.endswith("metadata.tsv")]
    vectors = [file for file in os.listdir(path_tsv) if file.endswith("vectors.tsv")]
    
    vectors_meta_df = pd.DataFrame(columns=range(771)).rename(columns=
    {
        768: "Name",
        769: "Profession",
        770: "Document"
    })

    for vector_file, metadata_file in tqdm(zip(vectors, metadatas)):
        vector_doc = vector_file.split("_")[doc_position]
        metadata_doc = metadata_file.split("_")[doc_position]
        vector_path = os.path.join(path_tsv, vector_file)
        metadata_path = os.path.join(path_tsv, metadata_file)
        try:
            if metadata_doc != vector_doc:
                raise ValueError("Metada doc ({}) doesn't find counterpart with vector doc ({}).".format(metadata_doc, vector_doc))
            vector = pd.read_csv(vector_path, sep="\t", header=None).dropna(axis=1)
            metadata = pd.read_csv(metadata_path, sep="\t")
        except pd.errors.EmptyDataError:
            continue
        else:
            joined = vector.join(metadata)
            joined["Document"] = metadata_doc
            vectors_meta_df = vectors_meta_df.append(joined)
            
    return vectors_meta_df

In [9]:
df = generate_concat_vectors_meta_df("gazeta", 3)

5013it [02:05, 39.94it/s]


### Doc level

In [10]:
df.groupby(["Document", "Profession", "Name"]).mean().reset_index()

Unnamed: 0,Document,Profession,Name,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,doc1,b'dziennikarze',b'Tomasz Sekielski',0.058479,-0.308485,0.421929,0.165078,0.362700,0.320804,-0.222915,...,0.091627,0.242734,-0.360961,-0.805343,0.097153,-0.424075,0.082262,0.029958,0.088789,-0.014945
1,doc1,b'muzycy',b'Marian Filar',-0.119539,-0.420205,0.545161,-0.098551,0.253041,0.353114,-0.266239,...,0.154645,0.158944,-0.241158,-0.912108,0.088434,-0.475515,0.013830,-0.135297,0.096832,-0.085748
2,doc1002,b'duchowni',b'Wojciech Polak',0.007306,-0.416092,0.468451,-0.018035,0.444621,0.204554,-0.175865,...,0.137490,0.178116,-0.320958,-1.096421,0.127237,-0.420452,-0.014561,0.120996,0.016708,-0.059699
3,doc1006,b'politycy',b'Jaros\xc5\x82aw Gowin',-0.047277,-0.327853,0.502921,-0.064244,0.226270,0.344187,-0.300658,...,-0.048855,0.066008,0.012932,-0.591326,0.098871,-0.661239,0.094611,0.169894,0.334156,0.073045
4,doc1008,b'duchowni',b'J\xc3\xb3zef Weso\xc5\x82owski',-0.051523,-0.415811,0.595523,0.068917,0.383640,0.185276,-0.278150,...,0.177425,0.269249,-0.416216,-0.959737,0.156044,-0.638221,-0.024258,-0.247338,0.028799,0.035575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3744,doc995,b'politycy',b'Bogdan Zdrojewski',0.052833,-0.320129,0.484427,-0.007471,0.145078,0.237572,-0.198483,...,0.100246,0.348333,-0.378476,-1.153100,0.079319,-0.633115,0.007265,0.002785,-0.000901,-0.044325
3745,doc997,b'politycy',b'Antoni Macierewicz',-0.057892,-0.328643,0.314875,0.114533,0.249958,0.179329,-0.352350,...,0.141055,0.214412,-0.556027,-0.986699,0.185024,-0.563177,-0.110665,-0.166555,0.017993,-0.003608
3746,doc997,b'politycy',b'Donald Tusk',-0.218511,-0.373466,0.486579,0.083028,0.515736,0.320591,-0.179105,...,0.003391,0.211865,-0.618961,-0.629303,0.117287,-0.432821,-0.226312,-0.034219,0.234341,-0.045998
3747,doc997,b'politycy',b'Janusz Palikot',-0.056357,-0.293092,0.285346,0.113731,0.362395,0.073614,-0.117403,...,0.097124,0.262120,-0.531003,-0.729915,0.054376,-0.464328,-0.060221,-0.144925,0.141181,-0.056444


### Corpus level

In [11]:
df.groupby(["Profession", "Name"]).mean().reset_index()

Unnamed: 0,Profession,Name,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,b'</Entity> (syn Jerzego) wspomina\xc5\x82 wyp...,b'jsza</Entity> (syn Jerzego) wspomina\xc5\x82...,-0.246644,-0.330295,0.447255,-0.030376,0.633781,0.011029,-0.239971,-0.061875,...,0.321495,0.143844,-0.247444,-1.008738,-0.003344,-0.654104,-0.241338,-0.062569,0.270372,-0.070904
1,b'<> nie jest obra\xc5\xbaliw',b'nie <> nie jest obra\xc5\xbaliw',0.136635,-0.312314,0.281029,0.040098,0.252375,0.176406,-0.152800,0.088950,...,0.269734,0.362993,-0.576451,-0.657152,-0.105935,-0.379808,0.166326,-0.078753,-0.123291,-0.054773
2,b'Entity> w poniedzia\xc5\x82kowym wywiadzie d...,b'er</Entity> w poniedzia\xc5\x82kowym wywiadz...,-0.025960,-0.405112,0.457265,0.183440,0.537095,-0.047013,-0.328605,0.062313,...,0.198080,0.232919,-0.560682,-1.144230,0.238334,-0.455821,-0.232863,-0.018041,0.068249,-0.126910
3,b'aktorzy',b'Adam Kwiatkowski',-0.048943,-0.333173,0.516826,0.072633,0.305881,0.133955,-0.280899,0.107676,...,0.148132,0.141833,-0.387489,-1.065905,0.135772,-0.496106,-0.064107,-0.171653,0.030039,0.010748
4,b'aktorzy',b'Agnieszka Grochowska',0.042069,-0.263897,0.381282,0.059005,0.435345,0.095232,-0.147581,0.181513,...,0.030447,0.274701,-0.337066,-0.565219,-0.002730,-0.344867,-0.006459,-0.005338,0.162914,-0.168167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,b'sportowcy',b'W\xc5\x82adys\xc5\x82aw Kozakiewicz',0.037530,-0.309757,0.446652,0.122279,0.380043,0.303005,-0.217939,-0.003605,...,0.132109,0.193326,-0.454307,-0.864561,0.019423,-0.511740,0.031524,-0.082569,0.022180,-0.089853
751,b'sportowcy',b'Wojciech Fortuna',-0.096425,-0.312910,0.232869,-0.036245,0.194447,0.407266,-0.267457,-0.095037,...,0.368190,0.371367,-0.109406,-0.640785,0.064444,-0.402506,0.255214,0.175263,-0.001681,-0.020489
752,b'sportowcy',b'Wojciech Kowalski',0.128646,-0.473450,0.154396,0.027069,0.350751,0.125189,-0.297642,0.202375,...,0.062077,0.090626,-0.610336,-0.932492,0.115857,-0.410924,0.058882,0.146254,-0.054190,0.029887
753,b'sportowcy',b'Zbigniew Boniek',-0.011007,-0.397946,0.476508,0.062485,0.358457,0.195063,-0.164216,0.063111,...,0.134404,0.228986,-0.428840,-0.966150,0.078099,-0.546357,-0.068032,-0.074869,0.054510,-0.022043
