# SatiSPeech 2025 Submission example with the baselines
This notebook is an example of how to build a submission with the baselines, based on BoW and MFCC feature models, for the SatiSPeech 2025 shared task.

Here we show how to load the training dataset, how to extract the MFCC features from the audios, how to train two baselines based on Support Vector Machines (one with text features only and one with text plus acoustic features), and how to generate the submission file.

More information regarding the shared task can be found at: https://codalab.lisn.upsaclay.fr/competitions/21501

**IMPORTANT NOTE:** Please note that, to obtain a valid KEY for downloading the audio segments, you should contact the organisers through the CodaLab platform.


In [None]:
# The first step is to install the required libraries.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install librosa
%pip install pandas
%pip install -U scikit-learn

In [1]:

 # Load the required libraries
import pandas as pd
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# Download training and test audios and CSV files
# Please, replace %PASTE_YOUR_ID_HERE% with the API key provided by the organisers.
# Do not share or commit a notebook that contains your real API key.
# !wget "https://pln.inf.um.es/corpora/satispeech/2025/dataset/download/train?api_key=%PASTE_YOUR_ID_HERE%" -O segments_train.zip
# !wget "https://pln.inf.um.es/corpora/satispeech/2025/dataset/download/test?api_key=%PASTE_YOUR_ID_HERE%" -O segments_test.zip
# !wget https://pln.inf.um.es/corpora/satispeech/2025/SatiSPeech_phase_2_train_public.csv -O SatiSPeech_phase_2_train_public.csv
# !wget https://pln.inf.um.es/corpora/satispeech/2025/SatiSPeech_phase_2_test_public.csv -O SatiSPeech_phase_2_test_public.csv
# Unzip the audios file
# NOTE: the cells below read the CSV files from "data/public_data/" and the audios
# from its "segments_train/" and "segments_test/" sub-folders; move the downloaded
# files there or adjust `path` so that both locations agree.
# !unzip segments_train.zip -d train_segments
# !unzip segments_test.zip -d test_segments

# Load Data

In [10]:
# Folder holding the public CSV files (the audio folders are looked up under it too)
path = "data/public_data/"

# Load the train and test splits, in that order, from their CSV files
train_df, test_df = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "SatiSPeech_phase_2_train_public.csv",
        "SatiSPeech_phase_2_test_public.csv",
    )
)

# Show the test split
test_df

Unnamed: 0,uid,transcription
0,fa8196b4-166b7bb9.mp3,La Comunidad de Madrid hace frente así a la sa...
1,b407b2e7-63887829.mp3,Los féretros han sido trasladados a Chiclana y...
2,91cfff91-f54d3c31.mp3,"Esto que parece una obviedad, ser contundentes..."
3,27e25c21-1cf4072b.mp3,Usted mismo reconoce que cada vez se parece má...
4,bb4ddfe5-fe837a69.mp3,Vamos a escuchar a todos los miembros del Comi...
...,...,...
1995,0dc2352f-e334aca3.mp3,"Tratan de superarlo en estos centros, pero a d..."
1996,830f9896-e3b18f45.mp3,"Afuera, seguidores del partido de extrema dere..."
1997,31c41168-febf1975.mp3,"Niña, una muestra que reúne 48 obras de arte c..."
1998,975631e6-fb7e4017.mp3,Pero no hemos conseguido esa respuesta unida q...


In [11]:
# Build the location of every audio file from its identifier column:
# the train split is keyed by "id" and the test split by "uid".
train_df["path"] = train_df["id"].map(
    lambda audio_name: f"{path}segments_train/{audio_name}"
)
test_df["path"] = test_df["uid"].map(
    lambda audio_name: f"{path}segments_test/{audio_name}"
)

# Show the test split with the new column
test_df

Unnamed: 0,uid,transcription,path
0,fa8196b4-166b7bb9.mp3,La Comunidad de Madrid hace frente así a la sa...,data/public_data/segments_test/fa8196b4-166b7b...
1,b407b2e7-63887829.mp3,Los féretros han sido trasladados a Chiclana y...,data/public_data/segments_test/b407b2e7-638878...
2,91cfff91-f54d3c31.mp3,"Esto que parece una obviedad, ser contundentes...",data/public_data/segments_test/91cfff91-f54d3c...
3,27e25c21-1cf4072b.mp3,Usted mismo reconoce que cada vez se parece má...,data/public_data/segments_test/27e25c21-1cf407...
4,bb4ddfe5-fe837a69.mp3,Vamos a escuchar a todos los miembros del Comi...,data/public_data/segments_test/bb4ddfe5-fe837a...
...,...,...,...
1995,0dc2352f-e334aca3.mp3,"Tratan de superarlo en estos centros, pero a d...",data/public_data/segments_test/0dc2352f-e334ac...
1996,830f9896-e3b18f45.mp3,"Afuera, seguidores del partido de extrema dere...",data/public_data/segments_test/830f9896-e3b18f...
1997,31c41168-febf1975.mp3,"Niña, una muestra que reúne 48 obras de arte c...",data/public_data/segments_test/31c41168-febf19...
1998,975631e6-fb7e4017.mp3,Pero no hemos conseguido esa respuesta unida q...,data/public_data/segments_test/975631e6-fb7e40...


## Task 1: Satire identification with only text features

In [12]:
# Stop words (empty list: no word is filtered out for this baseline)
stop_words = []


# Create a TF-IDF vectorizer using scikit-learn. With this, every text is
# represented by the TF-IDF weights of its words over the learned vocabulary
# (limited to max_features terms).
vectorizer = TfidfVectorizer(
    analyzer="word", max_features=50_000, lowercase=False, stop_words=stop_words
)


# Learn the vocabulary / IDF weights on the training set and encode it
text_x_train = vectorizer.fit_transform(train_df["transcription"])

# Encode the test set
# Note that we apply the TF-IDF learned from the training split
text_x_test = vectorizer.transform(test_df["transcription"])


# We are going to store a baseline per dimension
baselines = {}


# Get a baseline classifier
baselines["label"] = LinearSVC(dual="auto")


# Scale the features. The scaler is fitted on the training split ONLY and then
# applied unchanged to the test split: re-fitting it on the test data would
# scale both splits differently and leak test statistics into the features.
scaler = MinMaxScaler()
text_x_train = scaler.fit_transform(text_x_train.toarray())
text_x_test = scaler.transform(text_x_test.toarray())

# Train the baseline for this label
baselines["label"].fit(text_x_train, train_df["label"])

In [13]:
# Task 1: predict the test labels with the text-only baseline
text_classifier = baselines["label"]
text_predictions = text_classifier.predict(text_x_test)

## Task 2: Satire identification with MFCC and text features of audios with the SVM model.

In [14]:
# Extract MFCC features from audios
def extract_features(data, sample_rate):
    """Summarise an audio signal as the mean of its MFCCs.

    Args:
        data: audio samples, passed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: sampling rate of ``data``, passed as ``sr``.

    Returns:
        A 1-D NumPy array with the MFCC matrix averaged along axis 0 of its
        transpose (presumably one value per coefficient, averaged over frames).
    """
    coefficients = librosa.feature.mfcc(y=data, sr=sample_rate)
    averaged = np.mean(coefficients.T, axis=0)

    # Stack onto an empty array, exactly as the feature vector was built before,
    # so the resulting array (and its dtype promotion) stays the same.
    return np.hstack((np.array([]), averaged))

def get_features(path):
    """Load one audio file and return its feature vector.

    Args:
        path: location of the audio file, read with ``librosa.load`` defaults.

    Returns:
        A NumPy array copy of the output of ``extract_features``.
    """
    waveform, rate = librosa.load(path)
    return np.array(extract_features(waveform, rate))

# Get list of train with MFCC features and label list
def get_features_labels(audio_df, label2id):
    """Extract the features and the numeric label of every labelled audio.

    Args:
        audio_df: frame exposing ``path`` and ``label`` columns.
        label2id: mapping from a label value to its integer identifier.

    Returns:
        A tuple ``(X, Y)`` of two lists in row order: the feature vectors
        returned by ``get_features`` and the matching label identifiers.
    """
    # Process row by row so features and label of a row are resolved together
    pairs = [
        (get_features(audio_path), label2id[audio_label])
        for audio_path, audio_label in zip(audio_df.path, audio_df.label)
    ]
    X = [vector for vector, _ in pairs]
    Y = [target for _, target in pairs]
    return X, Y

# Get list of test with MFCC features
def get_features_test(audio_df):
    """Extract the features of every audio of an unlabelled split.

    Args:
        audio_df: frame exposing a ``path`` column.

    Returns:
        A list, in row order, with the feature vector returned by
        ``get_features`` for each path.
    """
    return [get_features(audio_path) for audio_path in audio_df.path]

In [15]:
# Sorted list of the distinct labels found in the training split
labels = sorted(train_df.label.unique().tolist())

# Two-way lookup between a label and its position in the sorted list
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
print(id2label)

{0: 'no-satire', 1: 'satire'}


In [16]:
# Extract the MFCC feature vectors and numeric labels of the training split
mfcc_x_train, y_train = get_features_labels(audio_df=train_df, label2id=label2id)

# Show the first training example: its feature vector and its label id
print("Examples: ", mfcc_x_train[0], y_train[0])

Examples:  [-2.27943665e+02  1.12409912e+02  2.10512295e-01  2.80925350e+01
  1.56655502e+01 -5.94680929e+00 -2.95352592e+01 -1.02050018e+00
 -6.60770035e+00 -8.49475479e+00 -4.37865067e+00  9.26320851e-01
 -1.07230082e+01  4.57672596e+00 -3.64640665e+00  1.84385443e+00
 -8.69384956e+00 -1.03147376e+00 -4.64386511e+00 -7.17984819e+00] 1


In [None]:
# Extract the MFCC feature vectors of the test split
mfcc_x_test = get_features_test(test_df)

# Show only the first example (as done for the training split) instead of
# dumping the feature vectors of the whole test set into the cell output.
print("Examples: ", mfcc_x_test[0])

In [None]:
# Multimodal input: place the text features and the MFCC features side by side
# (column-wise), so that every row describes one audio segment.
multi_x_train = np.hstack((text_x_train, mfcc_x_train))
multi_x_test = np.hstack((text_x_test, mfcc_x_test))

In [None]:
# Scale the combined features with scikit-learn's MinMaxScaler.
# The scaler is fitted on the training matrix and then applied to both splits.
scaler = MinMaxScaler().fit(multi_x_train)
x_train = scaler.transform(multi_x_train)
x_test = scaler.transform(multi_x_test)

In [None]:
# Task 2 baseline: an SVC with default hyper-parameters trained on the scaled
# multimodal features (the fitted estimator is shown as the cell output).
grid = SVC()
grid.fit(X=x_train, y=y_train)

In [None]:
# Predict the label ids of the test split and translate them back to label names
grid_predictions = grid.predict(x_test)
mfcc_predictions = list(map(id2label.__getitem__, grid_predictions))

# Generation of the submission file
Finally, an output file is generated with the predictions in the format required for submission to CodaLab.

In [None]:
# Build the submission table: one row per test audio with the prediction of
# each task. The test split is keyed by the "uid" column (it has no "id"
# column), whose ".mp3" extension is removed to obtain the submission id.
output_df = pd.DataFrame(
    {
        "id": test_df["uid"].str.replace(".mp3", "", regex=False),
        "task_1": text_predictions,
        "task_2": mfcc_predictions,
    }
)
print(output_df)

# Write the file to upload to CodaLab
output_df.to_csv("results.csv", index=False)