In [1]:
import pandas as pd
import glob
import os

In [2]:
# Glob pattern matching the weekly chart CSV exports
path = "../data/raw/weekly-top-songs-global-2025/*.csv"
# glob.glob returns matches in arbitrary, filesystem-dependent order;
# sorting makes the row order of the combined frame reproducible
all_files = sorted(glob.glob(path))

# Fail early with a clear message: pd.concat on an empty list would otherwise
# raise a generic "No objects to concatenate" error further down
if not all_files:
    raise FileNotFoundError(f"No CSV files found for pattern: {path}")

all_dfs_25 = []

for filename in all_files:
    # Read one weekly chart file
    df = pd.read_csv(filename, index_col=None, header=0)

    # Extract the date from the file name only (not the full path), so that
    # hyphens in directory names cannot leak into the date string.
    # Expected pattern: regional-global-weekly-yyyy-mm-dd.csv
    stem = os.path.splitext(os.path.basename(filename))[0]
    date_str = "-".join(stem.split('-')[-3:])  # e.g. 2025-01-04

    # Tag every row of this file with its chart week
    df['chart_week'] = pd.to_datetime(date_str)

    all_dfs_25.append(df)

# Combine all weekly frames into a single data set
df_all = pd.concat(all_dfs_25, axis=0, ignore_index=True)

# Quick sanity check
print(f"Datensatz geladen: {df_all.shape[0]} Zeilen aus {len(all_files)} Dateien.")
df_all.head()

Datensatz geladen: 10599 Zeilen aus 53 Dateien.


Unnamed: 0,rank,uri,artist_names,track_name,source,peak_rank,previous_rank,weeks_on_chart,streams,chart_week
0,1,spotify:track:2plbrEY59IikOBgBGLjaoe,"Lady Gaga, Bruno Mars",Die With A Smile,Interscope,1,5,20,67757392,2025-01-02
1,2,spotify:track:4wJ5Qq0jBN4ajy7ouZIV1c,"ROSÉ, Bruno Mars",APT.,Atlantic Records,1,7,11,61219314,2025-01-02
2,3,spotify:track:6dOtVTDdiauQNBQEDOtlAB,Billie Eilish,BIRDS OF A FEATHER,Darkroom/Interscope Records,1,16,33,45331665,2025-01-02
3,4,spotify:track:7ne4VBA60CxGM75vw0EYad,Gracie Abrams,That’s So True,"Gracie Abrams, under exclusive license to Inte...",4,15,11,44318028,2025-01-02
4,5,spotify:track:7tI8dRuH2Yc6RuoTjxo4dU,Jimin,Who,BIGHIT MUSIC,1,22,24,32417756,2025-01-02


In [3]:
# Columns carried over into the cleaned data set, in output order
keep_columns = [
    'chart_week',
    'rank',
    'uri',
    'artist_names',
    'track_name',
    'peak_rank',
    'previous_rank',
    'weeks_on_chart',
    'streams',
]

# Select those columns; the explicit copy keeps the result independent of df_all
df_clean_2025 = df_all.loc[:, keep_columns].copy()

In [8]:
# Preview the first rows of the cleaned frame
display(df_clean_2025.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only render an extra "None" below the summary
df_clean_2025.info()

Unnamed: 0,chart_week,rank,uri,artist_names,track_name,peak_rank,previous_rank,weeks_on_chart,streams
0,2025-01-02,1,spotify:track:2plbrEY59IikOBgBGLjaoe,"Lady Gaga, Bruno Mars",Die With A Smile,1,5,20,67757392
1,2025-01-02,2,spotify:track:4wJ5Qq0jBN4ajy7ouZIV1c,"ROSÉ, Bruno Mars",APT.,1,7,11,61219314
2,2025-01-02,3,spotify:track:6dOtVTDdiauQNBQEDOtlAB,Billie Eilish,BIRDS OF A FEATHER,1,16,33,45331665
3,2025-01-02,4,spotify:track:7ne4VBA60CxGM75vw0EYad,Gracie Abrams,That’s So True,4,15,11,44318028
4,2025-01-02,5,spotify:track:7tI8dRuH2Yc6RuoTjxo4dU,Jimin,Who,1,22,24,32417756


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10599 entries, 0 to 10598
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   chart_week      10599 non-null  datetime64[ns]
 1   rank            10599 non-null  int64         
 2   uri             10599 non-null  object        
 3   artist_names    10599 non-null  object        
 4   track_name      10599 non-null  object        
 5   peak_rank       10599 non-null  int64         
 6   previous_rank   10599 non-null  int64         
 7   weeks_on_chart  10599 non-null  int64         
 8   streams         10599 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 745.4+ KB


None

In [10]:
# Save the cleaned data set as CSV
# Target directory and file
output_dir = '../data/processed'
output_file = os.path.join(output_dir, 'df_cleaned_2025.csv')

# Create the directory if needed. exist_ok=True avoids the check-then-create
# race; the isdir check is kept only to decide whether to print the message.
# If output_dir exists but is a regular file, makedirs raises FileExistsError
# here instead of the write failing later.
dir_existed = os.path.isdir(output_dir)
os.makedirs(output_dir, exist_ok=True)
if not dir_existed:
    print(f"Ordner erstellt: {output_dir}")

# Write the DataFrame without the index column
df_clean_2025.to_csv(output_file, index=False, encoding='utf-8')
print(f"Datei erfolgreich gespeichert unter: {output_file}")

Datei erfolgreich gespeichert unter: ../data/processed/df_cleaned_2025.csv
