In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tqdm import tqdm

# Directory containing the anthem files
anthems_dir = "/kaggle/input/national-anthem-sound-dataset"
results = []

# Process each anthem file
for filename in tqdm(os.listdir(anthems_dir)):
    if filename.endswith(".mp3"):
        filepath = os.path.join(anthems_dir, filename)
        try:
            # Load the audio file
            y, sr = librosa.load(filepath, sr=None)
            
            # Extract country name from filename (assuming format like "USA_anthem.mp3")
            country = filename.split('_')[0]
            
            # Basic audio properties
            duration = librosa.get_duration(y=y, sr=sr)
            
            # Extract features
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            
            # Calculate statistics
            chroma_mean = np.mean(chroma)
            centroid_mean = np.mean(spectral_centroid)
            rolloff_mean = np.mean(spectral_rolloff)
            zcr_mean = np.mean(zero_crossing_rate)
            mfcc_means = np.mean(mfccs, axis=1)
            
            # Calculate rhythm strength
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            rhythm_strength = np.mean(onset_env)
            
            # Calculate harmonic-percussive ratio
            y_harmonic, y_percussive = librosa.effects.hpss(y)
            harmonic_percussive_ratio = np.sum(np.abs(y_harmonic)) / np.sum(np.abs(y_percussive))
            
            # Store results
            results.append({
                'country': country,
                'duration': duration,
                'tempo': tempo,
                'chroma_mean': chroma_mean,
                'centroid_mean': centroid_mean,
                'rolloff_mean': rolloff_mean,
                'zcr_mean': zcr_mean,
                'rhythm_strength': rhythm_strength,
                'harmonic_percussive_ratio': harmonic_percussive_ratio,
                'mfcc1': mfcc_means[0],
                'mfcc2': mfcc_means[1],
                'mfcc3': mfcc_means[2],
                'mfcc4': mfcc_means[3],
                'mfcc5': mfcc_means[4]
            })
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Convert to DataFrame
df = pd.DataFrame(results)
df.to_csv("anthem_analysis.csv", index=False)

100%|██████████| 100/100 [23:13<00:00, 13.93s/it]


In [8]:
# Continue from previous script
# Basic statistics
print("Summary statistics:")
print(df.describe())

# Correlations
print("\nCorrelation matrix:")
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)

# Clustering
features = ['tempo', 'centroid_mean', 'rolloff_mean', 'zcr_mean', 
            'rhythm_strength', 'harmonic_percussive_ratio',
            'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5']
X = df[features].values

# Normalize features
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)

# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Determine optimal number of clusters
from sklearn.metrics import silhouette_score
silhouette_scores = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, cluster_labels))

optimal_k = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"\nOptimal number of clusters: {optimal_k}")

# Apply K-means with optimal number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Plot the clusters
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['cluster'], cmap='viridis', alpha=0.6)
plt.title('National Anthem Clusters based on Audio Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')

# Add country labels
for i, country in enumerate(df['country']):
    plt.annotate(country, (X_pca[i, 0], X_pca[i, 1]), fontsize=8)

plt.colorbar(scatter, label='Cluster')
plt.savefig('anthem_clusters.png')
plt.close()

# Analyze characteristics of each cluster
print("\nCluster analysis:")
for cluster in range(optimal_k):
    cluster_anthems = df[df['cluster'] == cluster]
    print(f"\nCluster {cluster} - {len(cluster_anthems)} anthems")
    print("Countries:", ", ".join(cluster_anthems['country'].tolist()))
    print("Average characteristics:")
    for feature in features:
        # Check if the value is a scalar or array
        mean_value = cluster_anthems[feature].mean()
        if isinstance(mean_value, np.ndarray):
            # For array values, print first few elements
            print(f"  {feature}: {mean_value[:3]}...")
        else:
            # For scalar values, format with 2 decimal places
            print(f"  {feature}: {mean_value:.2f}")

# Regional analysis
# Assuming country names can be mapped to continents (you'd need to add this mapping)
continent_map = {
    # Add your mappings here, e.g., 'USA': 'North America', 'France': 'Europe', etc.
}

# Add continent column if you have the mapping
if continent_map:
    df['continent'] = df['country'].map(continent_map)
    
    # Plot features by continent
    for feature in ['tempo', 'duration', 'harmonic_percussive_ratio']:
        plt.figure(figsize=(10, 6))
        df.boxplot(column=feature, by='continent')
        plt.title(f'{feature} by Continent')
        plt.suptitle('')
        plt.savefig(f'{feature}_by_continent.png')
        plt.close()

Summary statistics:
         duration  chroma_mean  centroid_mean  rolloff_mean    zcr_mean  \
count  100.000000   100.000000     100.000000    100.000000  100.000000   
mean    81.779284     0.351368    1717.036340   3294.458904    0.046718   
std     34.135738     0.028317     434.120002    987.987502    0.010024   
min     29.648980     0.280847     929.119155   1616.349821    0.027781   
25%     57.593469     0.330477    1406.361731   2584.570035    0.039278   
50%     74.540408     0.352312    1647.174459   3117.416224    0.044554   
75%     99.617959     0.368098    2007.752263   3948.828069    0.052937   
max    213.342041     0.416764    2749.653069   5649.751232    0.078394   

       rhythm_strength  harmonic_percussive_ratio       mfcc1       mfcc2  \
count       100.000000                 100.000000  100.000000  100.000000   
mean          0.842743                   3.922095 -248.512924  183.486938   
std           0.095282                   1.232348   51.580925   20.702450




Optimal number of clusters: 2

Cluster analysis:

Cluster 0 - 36 anthems
Countries: gw.mp3, mc.mp3, kp.mp3, ke.mp3, bj.mp3, bz.mp3, gg.mp3, ki.mp3, bb.mp3, bd.mp3, es.mp3, fk.mp3, md.mp3, gu.mp3, et.mp3, bh.mp3, as.mp3, bs.mp3, eu.mp3, kw.mp3, be.mp3, ao.mp3, bl.mp3, fr.mp3, by.mp3, bn.mp3, jo.mp3, ie.mp3, bw.mp3, id.mp3, at.mp3, ir.mp3, bo.mp3, kg.mp3, er.mp3, gh.mp3
Average characteristics:
  tempo: [103.80074742]...
  centroid_mean: 2189.77
  rolloff_mean: 4387.92
  zcr_mean: 0.06
  rhythm_strength: 0.90
  harmonic_percussive_ratio: 3.36
  mfcc1: -211.63
  mfcc2: 166.03
  mfcc3: -42.82
  mfcc4: 33.65
  mfcc5: -15.99

Cluster 1 - 64 anthems
Countries: az.mp3, ge.mp3, gd.mp3, iq.mp3, gy.mp3, kr.mp3, gr.mp3, kz.mp3, bf.mp3, je.mp3, il.mp3, dz.mp3, ga.mp3, kn.mp3, do.mp3, dm.mp3, cd.mp3, bm.mp3, br.mp3, it.mp3, af.mp3, gn.mp3, ai.mp3, dj.mp3, cz.mp3, fo.mp3, gb.mp3, aw.mp3, gq.mp3, ad.mp3, ae.mp3, cf.mp3, ba.mp3, jp.mp3, eg.mp3, ma.mp3, km.mp3, im.mp3, ee.mp3, bi.mp3, jm.mp3, ax.mp3, g

In [3]:
# Function to visualize an anthem
def visualize_anthem(filepath, country_name):
    y, sr = librosa.load(filepath, sr=None)
    
    plt.figure(figsize=(15, 10))
    
    # Plot waveform
    plt.subplot(3, 1, 1)
    librosa.display.waveshow(y, sr=sr)
    plt.title(f"{country_name} National Anthem - Waveform")
    
    # Plot spectrogram
    plt.subplot(3, 1, 2)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    
    # Plot chromagram
    plt.subplot(3, 1, 3)
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
    plt.colorbar()
    plt.title('Chromagram')
    
    plt.tight_layout()
    plt.savefig(f"{country_name}_anthem_analysis.png")
    plt.close()

# Visualize a few random anthems
import random
sample_anthems = random.sample(list(df['country']), 5)
for country in sample_anthems:
    filename = [f for f in os.listdir(anthems_dir) if f.startswith(country)][0]
    filepath = os.path.join(anthems_dir, filename)
    visualize_anthem(filepath, country)

In [4]:
# Function to analyze rhythm and melody patterns
def analyze_patterns(df, anthems_dir):
    rhythm_patterns = {}
    scale_patterns = {}
    
    for index, row in df.iterrows():
        country = row['country']
        filename = [f for f in os.listdir(anthems_dir) if f.startswith(country)][0]
        filepath = os.path.join(anthems_dir, filename)
        
        # Load audio
        y, sr = librosa.load(filepath, sr=None)
        
        # Rhythm pattern analysis
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        
        if len(beats) >= 2:
            # Calculate beat intervals
            beat_intervals = np.diff(beats)
            # Categorize rhythm pattern
            if np.std(beat_intervals) < 0.1:
                rhythm_pattern = "Regular/March-like"
            elif np.mean(beat_intervals) > sr * 0.5:
                rhythm_pattern = "Slow/Stately"
            else:
                rhythm_pattern = "Variable"
        else:
            rhythm_pattern = "Undefined"
        
        rhythm_patterns[country] = rhythm_pattern
        
        # Key and scale analysis
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_sum = np.sum(chroma, axis=1)
        key = np.argmax(chroma_sum)
        key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        
        # Determine if major or minor based on relative presence of major and minor third
        major_third = (key + 4) % 12
        minor_third = (key + 3) % 12
        if chroma_sum[major_third] > chroma_sum[minor_third]:
            scale = "Major"
        else:
            scale = "Minor"
            
        scale_patterns[country] = f"{key_names[key]} {scale}"
    
    # Add to dataframe
    df['rhythm_pattern'] = df['country'].map(rhythm_patterns)
    df['key_and_scale'] = df['country'].map(scale_patterns)
    
    # Analysis of patterns
    print("\nRhythm Pattern Distribution:")
    print(df['rhythm_pattern'].value_counts())
    
    print("\nKey and Scale Distribution:")
    print(df['key_and_scale'].value_counts().head(10))  # Top 10 most common keys
    
    return df

# Enhance dataframe with pattern analysis
df = analyze_patterns(df, anthems_dir)


Rhythm Pattern Distribution:
rhythm_pattern
Variable    100
Name: count, dtype: int64

Key and Scale Distribution:
key_and_scale
F Major     27
D# Major    17
C Major     14
A# Major     9
D Minor      7
A Minor      4
G Major      3
F Minor      3
C Minor      3
E Minor      3
Name: count, dtype: int64


In [9]:
# You would need to add data about when each anthem was composed
# This is just a conceptual example

# Assuming you have added 'year_composed' to your dataframe
if 'year_composed' in df.columns:
    # Categorize by historical period
    def get_period(year):
        if year < 1800:
            return "Pre-1800"
        elif year < 1900:
            return "19th Century"
        elif year < 1950:
            return "Early 20th Century"
        elif year < 2000:
            return "Late 20th Century"
        else:
            return "21st Century"
            
    df['historical_period'] = df['year_composed'].apply(get_period)
    
    # Analyze features by historical period
    for feature in ['tempo', 'harmonic_percussive_ratio', 'duration']:
        plt.figure(figsize=(10, 6))
        df.boxplot(column=feature, by='historical_period')
        plt.title(f'{feature} by Historical Period')
        plt.suptitle('')
        plt.savefig(f'{feature}_by_period.png')
        plt.close()
        
    # Look for trends in key usage over time
    period_key_counts = df.groupby(['historical_period', 'key_and_scale']).size().unstack(fill_value=0)
    period_key_counts.plot(kind='bar', stacked=True, figsize=(12, 8))
    plt.title('Key Usage by Historical Period')
    plt.ylabel('Number of Anthems')
    plt.tight_layout()
    plt.savefig('key_usage_by_period.png')
    plt.close()