In [1]:
import pandas as pd
import numpy as np

In [2]:
# Per-scene count and mean of the ratings, for one media type.
def _summarize_by_doc(rows, noteurs):
    """
    Collapse row-level ratings into one line per scene.

    Parameters:
        DataFrame rows    : annotation rows to summarise; must hold a 'doc' column and the `noteurs` columns
        list-like noteurs : names of the annotator columns
    Returns:
        DataFrame with columns 'doc', 'count', 'mean' (sorted by 'doc'), or None when `rows` is empty
    """
    if len(rows) == 0:
        return None
    # Row-wise mean over the annotators; missing ratings (NaN) are skipped.
    moyenne = rows[noteurs].mean(axis=1)
    return (
        rows.assign(moyenne=moyenne)   # assign() works on a copy: the caller's frame is untouched
        .groupby('doc')['moyenne']     # groupby sorts the scene keys
        .agg(['count', 'mean'])        # 'count' ignores rows whose mean is NaN
        .reset_index()
    )


def get_media_type(annot, noteurs, media='audio'):
    """
    Summarise the ratings of the rated rows that contain a given media.

    Parameters:
        DataFrame annot   : transformed annotations; must hold the columns 'doc', 'isRated',
                            'audio', 'video', 'texte' and one column per annotator
        list-like noteurs : names of the annotator columns
        str media         : media column to select ('audio'/'video'/'texte')
    Returns:
        DataFrame df_m  : for each scene with a rated row containing `media`, the number of
                          such rows with a rating ('count') and the mean of their row averages ('mean')
        DataFrame df_mo : same, restricted to rows where `media` is the only media present
        Either value is None when no row matches.
    Side effects:
        Prints the number of distinct scenes behind each non-empty result.
    """
    with_media = (annot['isRated'] == 1) & (annot[media] == 1)

    # "Only" means every other media flag is 0 on the same row.
    only_media = with_media
    for autre in ('audio', 'video', 'texte'):
        if autre != media:
            only_media = only_media & (annot[autre] == 0)

    df_m = _summarize_by_doc(annot.loc[with_media], noteurs)
    if df_m is not None:
        print(media, " : ", annot.loc[with_media, 'doc'].nunique(), sep="")

    df_mo = _summarize_by_doc(annot.loc[only_media], noteurs)
    if df_mo is not None:
        print(media, "_only : ", annot.loc[only_media, 'doc'].nunique(), sep="")

    return df_m, df_mo

In [3]:
# modification du csv des annotations
def transform_annotation(annot):
    """
    Split the 'code_doc' identifier of the raw annotations into explicit columns.

    Each 'code_doc' value is read as '<a>_<b>_<flags>_<rated>', where <flags> is three
    digits (audio, video, texte) and <rated> is an integer.

    Parameters:
        DataFrame annot : original annotations, with a 'code_doc' column and one column per annotator
    Returns:
        DataFrame annot : copy of the input where -1 is replaced by NaN and 'code_doc' is replaced
                          by the columns 'doc' ('<a>_<b>'), 'audio', 'video', 'texte' and 'isRated'
        Index noteurs   : names of the annotator columns (every input column except 'code_doc')
    Raises:
        KeyError : if the input has no 'code_doc' column
    """
    # replace() returns a new frame, so the caller's DataFrame is left untouched.
    annot = annot.replace(-1, np.nan)
    # Select annotators by name rather than by position, so the result does not
    # depend on 'code_doc' being the first column.
    noteurs = annot.columns.drop("code_doc")

    # Parse every identifier once instead of re-splitting it for each derived column.
    parts = [code.split("_") for code in annot["code_doc"]]
    annot['doc'] = ["_".join(p[0:2]) for p in parts]
    for position, media in enumerate(('audio', 'video', 'texte')):
        annot[media] = [int(p[2][position]) for p in parts]
    annot['isRated'] = [int(p[3]) for p in parts]

    annot = annot.drop("code_doc", axis=1)

    return annot, noteurs

#### Ouverture du csv des annotations, changement des colonnes

In [4]:
# presumably the audio sample rate in Hz; it is not used in the cells shown here
samplerate = 16000

# Corpus layout: one sub-directory per media under `directory`.
directory = './challenge-m2-sid/'
dir_audio = f"{directory}corpus/audio/"
dir_texte = f"{directory}corpus/text/"
dir_video = f"{directory}corpus/video/"

spleeter_output_dir = './audio_output/'

# File-name suffix of each media.
audio_extension, texte_extension, video_extension = "_mono.wav", ".xml", ".m4v"

# Tab-separated annotation file: first line is the header, first column the row index.
annot_origin = pd.read_csv(
    f"{directory}annotations_challenge_sid.csv",
    sep="\t",
    index_col=0,
    header=0,
)
annot, noteurs = transform_annotation(annot_origin)
annot.head(6)

Unnamed: 0,il08_09,vg04_05,fd03_04,la09_10,cg13_14,mb00_12,doc,audio,video,texte,isRated
1,,,,,,,119_8,1,0,0,0
2,,,,,,,119_8,1,1,0,0
3,,,,,,,119_8,1,1,1,0
4,,,,,,,119_8,1,0,1,0
5,,,,,,,119_8,0,0,1,0
6,,,,,,76.0,57_6,1,0,0,1


#### Sélection des médias notés (audio, vidéo, texte) et calcul de la moyenne

In [5]:
# Per-scene rating summaries: rated rows containing audio, and rated rows containing audio only.
audio, audio_only = get_media_type(annot, noteurs, media='audio')

audio : 55
audio_only : 33


In [6]:
# Same summaries for video. get_media_type returns None for an empty selection, and the
# output below has no "video_only" line, so `video_only` is None here.
video, video_only = get_media_type(annot, noteurs, media='video')

video : 44


In [7]:
# Per-scene rating summaries: rated rows containing text, and rated rows containing text only.
texte, texte_only = get_media_type(annot, noteurs, media='texte')

texte : 55
texte_only : 33


In [10]:
# Preview the per-scene audio summary (columns: doc, count, mean).
audio.head()

Unnamed: 0,doc,count,mean
0,100_1,2,57.583333
1,107_7,2,60.5
2,108_3,2,49.833333
3,116_15,2,34.25
4,128_5,2,82.25


# Step-by-step, row-level version of get_media_type(annot, noteurs, media='texte').
# The rated flag is named 'isRated' (created by transform_annotation); there is no 'isGraded' column.
# NOTE: this rebinds `texte` and `texte_only`, which hold the per-scene summaries returned by
# get_media_type earlier in the notebook, to row-level frames with an extra 'moyenne' column.
texte_only = annot[(annot['isRated'] == 1) &
                   (annot['audio']   == 0) &
                   (annot['video']   == 0) &
                   (annot['texte']   == 1)
                  ].copy()

texte = annot[(annot['isRated'] == 1) &
              (annot['texte']   == 1)
             ].copy()

# Row-wise mean over the annotators, ignoring missing ratings (NaN).
texte['moyenne']      = texte[noteurs].apply(np.nanmean, axis=1)
texte_only['moyenne'] = texte_only[noteurs].apply(np.nanmean, axis=1)

print("texte :",      len(set(texte['doc'])))
print("texte_only :", len(set(texte_only['doc'])))

# Per-scene count and mean of the row averages, for the text-only rows.
texte_only[['doc','moyenne']].groupby(['doc']).agg(['count','mean']).head(7)