In [1]:
import pandas as pd
import glob
import os

In [2]:
# Glob pattern matching the weekly global chart exports for 2024.
path = "../data/raw/weekly-top-songs-global-2024/*.csv"

# glob.glob returns matches in arbitrary, OS-dependent order. Sorting makes the
# row order of the combined frame reproducible; because the file names end in
# an ISO date (yyyy-mm-dd), sorted order is also chronological as long as all
# files share the same name prefix.
all_files = sorted(glob.glob(path))

# Fail early with a clear message instead of pd.concat's opaque
# "No objects to concatenate" when the pattern matches nothing.
if not all_files:
    raise FileNotFoundError(f"No CSV files found for pattern: {path}")

all_dfs_24 = []

for filename in all_files:
    # Read one weekly chart file; its first row is the header.
    df = pd.read_csv(filename, index_col=None, header=0)

    # Extract the chart date from the file name only (not the full path), so
    # hyphens in directory names can never leak into the date.
    # regional-global-weekly-yyyy-mm-dd.csv -> 'yyyy-mm-dd'
    stem = os.path.splitext(os.path.basename(filename))[0]
    date_str = "-".join(stem.split('-')[-3:])

    # Tag every row with the week it belongs to; the explicit format makes a
    # malformed file name raise instead of being silently mis-parsed.
    df['chart_week'] = pd.to_datetime(date_str, format="%Y-%m-%d")

    all_dfs_24.append(df)

# Stack all weekly frames into one data set with a fresh 0..n-1 index.
df_all = pd.concat(all_dfs_24, axis=0, ignore_index=True)

# Quick sanity check: row count, file count and a preview of the first rows.
print(f"Datensatz geladen: {df_all.shape[0]} Zeilen aus {len(all_files)} Dateien.")
df_all.head()

Datensatz geladen: 10400 Zeilen aus 52 Dateien.


Unnamed: 0,rank,uri,artist_names,track_name,source,peak_rank,previous_rank,weeks_on_chart,streams,chart_week
0,1,spotify:track:3rUGC1vUpkDG9CZFHMur1t,Tate McRae,greedy,RCA Records Label,1,14,16,33855816,2024-01-04
1,2,spotify:track:0R6NfOiLzLj4O5VbYSJAjf,Xavi,La Diabla,Interscope Records,2,17,4,30894083,2024-01-04
2,3,spotify:track:4xhsWYTOGcal8zt0J161CU,Jack Harlow,Lovin On Me,Generation Now/Atlantic,3,19,8,30778444,2024-01-04
3,4,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,Taylor Swift,Cruel Summer,Taylor Swift,2,20,48,30224692,2024-01-04
4,5,spotify:track:3vkCueOmm7xQDoJ17W1Pm3,Mitski,My Love Mine All Mine,Dead Oceans,3,18,15,26430016,2024-01-04


In [3]:
# Columns carried over into the cleaned data set, in output order.
# Every other column of df_all (e.g. 'source') is left out.
keep_columns = [
    'chart_week',
    'rank',
    'uri',
    'artist_names',
    'track_name',
    'peak_rank',
    'previous_rank',
    'weeks_on_chart',
    'streams',
]

# Select the columns by label and take an independent copy, so later changes
# to the cleaned frame do not touch df_all.
df_clean_2024 = df_all.loc[:, keep_columns].copy()

In [4]:
# Preview the first rows of the cleaned frame.
display(df_clean_2024.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below the
# summary.
df_clean_2024.info()

Unnamed: 0,chart_week,rank,uri,artist_names,track_name,peak_rank,previous_rank,weeks_on_chart,streams
0,2024-01-04,1,spotify:track:3rUGC1vUpkDG9CZFHMur1t,Tate McRae,greedy,1,14,16,33855816
1,2024-01-04,2,spotify:track:0R6NfOiLzLj4O5VbYSJAjf,Xavi,La Diabla,2,17,4,30894083
2,2024-01-04,3,spotify:track:4xhsWYTOGcal8zt0J161CU,Jack Harlow,Lovin On Me,3,19,8,30778444
3,2024-01-04,4,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,Taylor Swift,Cruel Summer,2,20,48,30224692
4,2024-01-04,5,spotify:track:3vkCueOmm7xQDoJ17W1Pm3,Mitski,My Love Mine All Mine,3,18,15,26430016


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10400 entries, 0 to 10399
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   chart_week      10400 non-null  datetime64[ns]
 1   rank            10400 non-null  int64         
 2   uri             10400 non-null  object        
 3   artist_names    10400 non-null  object        
 4   track_name      10400 non-null  object        
 5   peak_rank       10400 non-null  int64         
 6   previous_rank   10400 non-null  int64         
 7   weeks_on_chart  10400 non-null  int64         
 8   streams         10400 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 731.4+ KB


None

In [5]:
# Save the cleaned frame as CSV.
# Target directory and file for the interim data set.
output_dir = '../data/interim'
output_file = os.path.join(output_dir, 'df_cleaned_2024.csv')

# Create the directory if needed. exist_ok=True avoids the check-then-create
# race of a separate os.path.exists() test; the prior state is remembered only
# so the "created" message is still printed when the folder is new.
output_dir_existed = os.path.isdir(output_dir)
os.makedirs(output_dir, exist_ok=True)
if not output_dir_existed:
    print(f"Ordner erstellt: {output_dir}")

# Write the frame without the index column, encoded as UTF-8.
df_clean_2024.to_csv(output_file, index=False, encoding='utf-8')
print(f"Datei erfolgreich gespeichert unter: {output_file}")

Ordner erstellt: ../data/interim
Datei erfolgreich gespeichert unter: ../data/interim/df_cleaned_2024.csv
