# SatiSPeech Submission example with the baselines
This codalab is an example of how to submit the baseline based on BoW and MFCC feature models for the shared-task SatiSPeech 2025.

Here we show how to load the training dataset, how to extract the MFCC features from the audios and train 2 baseline based on Suppor Vector Machines with text features and acoustic features.

More information regarding the shared task can be found at: https://codalab.lisn.upsaclay.fr/competitions/21501

**IMPORTANT NOTE:** Please, note that for obtaining a valid KEY to download the audio segments, you should contact with the organisers using the Codalab plataform.


In [1]:
# The first step is download the required libraries
%pip install librosa
%pip install pandas
%pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
 # Load the required libraries
import pandas as pd
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
# Download training and test audios and CSV files
# Please, replace %PASTE_YOUR_ID_HERE% with the API key provided.
# !wget https://pln.inf.um.es/corpora/satispeech/2025/dataset/download/dev?api_key=[80d959b40c740d703854c35610218664] -O segments.zip
# !wget https://pln.inf.um.es/corpora/satispeech/2025/dataset/download/dev?api_key=80d959b40c740d703854c35610218664 -O segments.zip
# !wget https://pln.inf.um.es/corpora/satispeech/2025/SatiSPeech_phase_1_train_codalab.csv -O SatiSPeech_phase_1_train_codalab.csv
# !wget https://pln.inf.um.es/corpora/satispeech/2025/SatiSPeech_phase_1_test_codalab.csv -O SatiSPeech_phase_1_test_codalab.csv
# !mkdir dev_segments/
# !unzip segments.zip -d dev_segments

# Load Data

In [3]:
# Read the CSV files
path = "data/reference_data/"
train_df = pd.read_csv(path + "SatiSPeech_phase_1_train_codalab.csv")
test_df = pd.read_csv(path + "SatiSPeech_phase_1_test_codalab.csv")
train_df

Unnamed: 0,id,label,transcription
0,3889a67c-6b948c2a.mp3,satire,"A veces no nos fijamos, parece que con darles ..."
1,aeb4e7f5-0b1d0ac6.mp3,satire,"Buenas noches, hoy en Documentos TV les ofrece..."
2,49e38957-8400221a.mp3,no-satire,El grupo de países sudamericanos UNASUR discut...
3,cbadf2d7-b8874b6d.mp3,no-satire,15 días le ha dado el juez Santiago Pedraz al ...
4,5dfe1dd7-9c90b9e2.mp3,no-satire,Hay un puñado de experimentos asignados a dife...
...,...,...,...
379,fa5e948b-3c26867e.mp3,no-satire,El gobierno de Ucrania dijo que continuará su ...
380,056d52aa-fae45a22.mp3,no-satire,Las reacciones a esta nueva tragedia no se han...
381,eeea6af3-83e94953.mp3,no-satire,Un equipo de científicos decodificó el ADN de ...
382,512b1051-5ef2d5f1.mp3,no-satire,"En el puesto número 2, una fiesta navideña en ..."


In [4]:
# Adding the path column to search for audios
train_df["path"] = train_df["id"].apply(lambda x: f"{path}dev_segments/{x}")
test_df["path"] = test_df["id"].apply(lambda x: f"{path}dev_segments/{x}")
train_df

Unnamed: 0,id,label,transcription,path
0,3889a67c-6b948c2a.mp3,satire,"A veces no nos fijamos, parece que con darles ...",data/reference_data/dev_segments/3889a67c-6b94...
1,aeb4e7f5-0b1d0ac6.mp3,satire,"Buenas noches, hoy en Documentos TV les ofrece...",data/reference_data/dev_segments/aeb4e7f5-0b1d...
2,49e38957-8400221a.mp3,no-satire,El grupo de países sudamericanos UNASUR discut...,data/reference_data/dev_segments/49e38957-8400...
3,cbadf2d7-b8874b6d.mp3,no-satire,15 días le ha dado el juez Santiago Pedraz al ...,data/reference_data/dev_segments/cbadf2d7-b887...
4,5dfe1dd7-9c90b9e2.mp3,no-satire,Hay un puñado de experimentos asignados a dife...,data/reference_data/dev_segments/5dfe1dd7-9c90...
...,...,...,...,...
379,fa5e948b-3c26867e.mp3,no-satire,El gobierno de Ucrania dijo que continuará su ...,data/reference_data/dev_segments/fa5e948b-3c26...
380,056d52aa-fae45a22.mp3,no-satire,Las reacciones a esta nueva tragedia no se han...,data/reference_data/dev_segments/056d52aa-fae4...
381,eeea6af3-83e94953.mp3,no-satire,Un equipo de científicos decodificó el ADN de ...,data/reference_data/dev_segments/eeea6af3-83e9...
382,512b1051-5ef2d5f1.mp3,no-satire,"En el puesto número 2, una fiesta navideña en ...",data/reference_data/dev_segments/512b1051-5ef2...


## Task 1: Satire identification with only text features

In [8]:
# Stop words
stop_words = []


# Create a TFIDF Vectorizer using sci-kit. With this, we are going to represent all texts
# as counts of the vocabulary.
vectorizer = TfidfVectorizer(
    analyzer="word", max_features=50_000, lowercase=False, stop_words=stop_words
)


# Get the TF-IDF values from the training set
text_x_train = vectorizer.fit_transform(train_df["transcription"])

# Get the TF-IDF values from the test set
# Note that we apply the TF-IDF learned from the training split
text_x_test = vectorizer.transform(test_df["transcription"])


# We are going to store a baseline per dimension
baselines = {}


# Get a baseline classifier
baselines["label"] = LinearSVC(dual="auto")

scaler = MinMaxScaler()
text_x_train = scaler.fit_transform(text_x_train.toarray())
text_x_test = scaler.fit_transform(text_x_test.toarray())

# Train the baseline for this label
baselines["label"].fit(text_x_train, train_df["label"])

In [9]:
text_predictions = baselines["label"].predict(text_x_test)
print(
    classification_report(test_df["label"], text_predictions, digits=6, zero_division=0)
)

              precision    recall  f1-score   support

   no-satire   0.918367  0.937500  0.927835        48
      satire   0.936170  0.916667  0.926316        48

    accuracy                       0.927083        96
   macro avg   0.927269  0.927083  0.927075        96
weighted avg   0.927269  0.927083  0.927075        96



## Task 2: Satire identification with MFCC and text features of audios with the SVM model.

In [10]:
# Extract MFCC features from audios
def extract_features(data, sample_rate):
    result = np.array([])
    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))

    return result

def get_features(path):
    # load all audio
    data, sample_rate = librosa.load(path)
    res1 = extract_features(data, sample_rate)
    result = np.array(res1)

    return result

# Get list of train with MFCC features and label list
def get_features_labels(audio_df, label2id):
    X, Y = [], []
    for path, satire in zip(audio_df.path, audio_df.label):
        feature = get_features(path)
        X.append(feature)
        Y.append(label2id[satire])
    return X, Y

In [11]:
# Get list of labels, id2label and label2id
labels = sorted(train_df.label.unique().tolist())
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label:id for id, label in enumerate(labels)}
print(id2label)

{0: 'no-satire', 1: 'satire'}


In [12]:
# Get input for the training
mfcc_x_train, y_train = get_features_labels(train_df, label2id)
print("Examples: ", mfcc_x_train[0], y_train[0])

Examples:  [-2.65515594e+02  1.34025452e+02 -3.87320614e+00  4.18447304e+01
  4.41900396e+00 -1.72719326e+01 -3.18521729e+01 -1.42884254e+00
 -1.95240421e+01 -1.31922884e+01  1.55874163e-01 -5.64502621e+00
  5.19635201e+00 -7.69905376e+00  1.83405602e+00 -3.32394123e+00
 -9.71520042e+00 -2.04885483e+00 -2.16752648e+00 -7.08189821e+00] 1


In [13]:
# Get input for the test
mfcc_x_test, y_test = get_features_labels(test_df, label2id)
print("Examples: ", mfcc_x_test[0], y_test[0])

Examples:  [-246.69847107  150.15864563    7.9820857    29.79780197    4.72329998
   -2.85525489    8.73539639   -7.05086517    1.86267591    9.49735737
   -3.09962344    2.64821887   -7.53379488    5.79352856   -5.47804785
   -1.27932727   -1.44734979   -2.68480325   -6.14129019    1.79987383] 0


In [14]:
multi_x_train = np.concatenate((text_x_train, mfcc_x_train),axis=1)
multi_x_test = np.concatenate((text_x_test, mfcc_x_test),axis=1)

In [16]:
# Scaling our data with sklearn's Standard scaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(multi_x_train)
x_test = scaler.transform(multi_x_test)

In [17]:
# Defining parameter range for SVC
param_grid = {'C': [1,10,100,1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'poly']}

In [18]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 0)
grid.fit(x_train, y_train)

In [19]:
# Print best parameter after tuning
print(grid.best_params_)

# Print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001)


In [20]:
grid_predictions = grid.predict(x_test)
mfcc_predictions = [id2label[label] for label in grid_predictions]
print(classification_report(test_df["label"], mfcc_predictions, digits = 6, zero_division = 0))

              precision    recall  f1-score   support

   no-satire   0.960000  1.000000  0.979592        48
      satire   1.000000  0.958333  0.978723        48

    accuracy                       0.979167        96
   macro avg   0.980000  0.979167  0.979158        96
weighted avg   0.980000  0.979167  0.979158        96



# Generation of the submission file
Finally, an output file is generated with the predictions in the format required for submission to CodaLab.

In [22]:
output_df = pd.DataFrame(columns=["id", "task_1", "task_2"])
output_df["id"] = test_df["id"].str.replace ('.mp3', '', regex = False)
output_df["task_1"] = text_predictions
output_df["task_2"] = mfcc_predictions
print (output_df)
output_df.to_csv ('results_baseline.csv', index = False)

                   id     task_1     task_2
0   2abcfbec-b59cdc92     satire  no-satire
1   9c3f7d29-727fa523     satire     satire
2   0e415924-8b600d71  no-satire  no-satire
3   1f09022d-81190f9b  no-satire  no-satire
4   aa352c05-532b33cc     satire     satire
..                ...        ...        ...
91  3826f75b-ba14c8e5     satire     satire
92  86ae0b91-e42d0ad0     satire     satire
93  4bfde41f-989b722b     satire     satire
94  53b63159-64f00c13  no-satire  no-satire
95  5b97747c-a4aca1b4     satire     satire

[96 rows x 3 columns]
