In [140]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import json
import numpy as np
from scipy.stats import normaltest,kruskal,chi2_contingency


## Load the JSON files for the train/val/test sets and initialise the audio folder paths

In [123]:
# Root folder of the downloaded NSynth dataset. Set the NSYNTH_ROOT environment
# variable to point somewhere else instead of editing the notebook; the default
# keeps the location that was previously hard-coded in every path.
nsynth_root = os.environ.get("NSYNTH_ROOT", r"D:\Nsynth dataset")

# Each split has its own folder holding examples.json and an audio/ folder.
train_split_dir = os.path.join(nsynth_root, "train", "nsynth-train")
val_split_dir = os.path.join(nsynth_root, "val", "nsynth-valid")
test_split_dir = os.path.join(nsynth_root, "test", "nsynth-test")

# Label files, one per split.
train_json_path = os.path.join(train_split_dir, "examples.json")
val_json_path = os.path.join(val_split_dir, "examples.json")
test_json_path = os.path.join(test_split_dir, "examples.json")

# Folders containing the audio files of each split.
train_audio_folder_path = os.path.join(train_split_dir, "audio")
val_audio_folder_path = os.path.join(val_split_dir, "audio")
test_audio_folder_path = os.path.join(test_split_dir, "audio")



## Read the JSON files and check that the audio files are consistent with the labels (data quality check)

In [124]:

 
 #Load the json file and convert to dictionary
def _read_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    # A with-block guarantees the handle is released; the encoding is explicit
    # so the result does not depend on the platform's default text encoding.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Parsed contents of each split's examples.json
# (presumably a mapping of sample name -> metadata; verify against the files).
training_dict = _read_json(train_json_path)
validation_dict = _read_json(val_json_path)
testing_dict = _read_json(test_json_path)


# Names of the entries found in each split's audio folder
training_audio_files = os.listdir(train_audio_folder_path)
validation_audio_files = os.listdir(val_audio_folder_path)
testing_audio_files = os.listdir(test_audio_folder_path)



In [None]:
def check_audio_files(audio_label_dict, audio_folder_path, max_reported=10):
    """Check that the audio files of a split agree with its labels.

    Parameters
    ----------
    audio_label_dict : dict
        Labels keyed by sample name, i.e. the audio file name without its
        extension.
    audio_folder_path : str, os.PathLike or iterable of str
        Either the folder that holds the audio files, or an already-listed
        collection of audio file names (e.g. the result of ``os.listdir``).
    max_reported : int, optional
        Maximum number of offending names printed per kind of problem.

    Returns
    -------
    bool
        True when every audio file has a label and every label has an audio
        file, False otherwise.
    """
    # The parameter is named like a path but the notebook passes a file list.
    # Accept both: a real path used to be iterated character by character.
    if isinstance(audio_folder_path, (str, os.PathLike)):
        audio_file_names = os.listdir(audio_folder_path)
    else:
        audio_file_names = list(audio_folder_path)

    # Report a count mismatch together with the actual numbers
    no_labels = len(audio_label_dict)
    no_audio_files = len(audio_file_names)
    if no_labels != no_audio_files:
        print(
            "Inconsistent number of labels and audio files "
            f"({no_labels} labels, {no_audio_files} audio files)"
        )

    # Count the extensions and collect the audio files that have no label
    ext_counts = {}
    audio_names = set()
    unlabelled_files = []
    for audiofile in audio_file_names:
        audio_file, ext = os.path.splitext(audiofile)
        ext_counts[ext] = ext_counts.get(ext, 0) + 1
        audio_names.add(audio_file)
        if audio_file not in audio_label_dict:
            unlabelled_files.append(audio_file)

    # The reverse direction: labels that point to no audio file at all
    labels_without_audio = [
        label for label in audio_label_dict if label not in audio_names
    ]

    for audio_file in unlabelled_files[:max_reported]:
        print(f"Audio file {audio_file} not in json file")
    if len(unlabelled_files) > max_reported:
        print(f"... and {len(unlabelled_files) - max_reported} more audio files not in json file")

    for label in labels_without_audio[:max_reported]:
        print(f"Label {label} has no audio file")
    if len(labels_without_audio) > max_reported:
        print(f"... and {len(labels_without_audio) - max_reported} more labels without an audio file")

    if unlabelled_files or labels_without_audio:
        print("Test failed")
        return False

    print("Test passed")
    print([(key, f"{values} files") for key, values in ext_counts.items()])
    return True




# Run the consistency check on every split. The split is announced first so
# the printed verdicts can be told apart, and each result is kept instead of
# being discarded.
audio_check_results = {}
for split_name, split_labels, split_audio_files in (
    ("training", training_dict, training_audio_files),
    ("validation", validation_dict, validation_audio_files),
    ("testing", testing_dict, testing_audio_files),
):
    print(f"--- {split_name} set ---")
    audio_check_results[split_name] = check_audio_files(split_labels, split_audio_files)

# Shown as the cell output: one True/False verdict per split
audio_check_results

    



## Convert the dictionaries to DataFrames and conduct a descriptive analysis

In [126]:
# Convert each {sample: metadata} dictionary to a DataFrame with one row per
# sample. from_dict(orient="index") lets pandas infer a dtype for every column;
# transposing DataFrame(dict) instead leaves all columns as dtype object when
# the metadata mixes strings and numbers.
# rename_axis sets the index name to "sample".
training_df = pd.DataFrame.from_dict(training_dict, orient="index").rename_axis("sample")
validation_df = pd.DataFrame.from_dict(validation_dict, orient="index").rename_axis("sample")
testing_df = pd.DataFrame.from_dict(testing_dict, orient="index").rename_axis("sample")


In [None]:
# Compare the number of columns across the three splits
column_totals = [frame.shape[1] for frame in (training_df, validation_df, testing_df)]
if column_totals[0] == column_totals[1] == column_totals[2]:
    print(f"Number of columns: {training_df.shape[1]}")
else:
    print("Number of columns not equal in training, validation, and testing sets")

# Show the column names of the training set
print(f"The columns are: {training_df.columns}")


# Total number of null cells in each split
training_nulls = training_df.isna().sum().sum()
print(f"The training set has {training_nulls} null values")

validation_nulls = validation_df.isna().sum().sum()
print(f"The validation set has {validation_nulls} null values")

testing_nulls = testing_df.isna().sum().sum()
print(f"The testing set has {testing_nulls} null values")







In [128]:
# Unique instrument_str values of each split (the index of the value counts,
# ordered from most to least frequent)
training_counts = training_df["instrument_str"].value_counts().index
validation_counts = validation_df["instrument_str"].value_counts().index
testing_counts = testing_df["instrument_str"].value_counts().index

# Print every training instrument_str that also occurs in the validation OR
# the testing split: being present in either one already counts as an overlap.
held_out_instrument_strs = set(validation_counts) | set(testing_counts)
for value in training_counts:
    if value in held_out_instrument_strs:
        print(value)


# If nothing is printed, the training set shares no instrument_str with the validation or testing sets, which is the expected outcome

In [None]:
# Unique instrument ids of each split, ordered by how often they occur
training_counts = training_df["instrument"].value_counts().index
validation_counts = validation_df["instrument"].value_counts().index
testing_counts = testing_df["instrument"].value_counts().index

# Print each training instrument id that is also present in validation or testing
held_out_instruments = set(validation_counts).union(testing_counts)
for value in training_counts:
    if value not in held_out_instruments:
        continue
    print(value)


# Number of distinct instruments per split
n_training_instruments = len(training_counts)
n_validation_instruments = len(validation_counts)
n_testing_instruments = len(testing_counts)
print(f"The training set has {n_training_instruments} unique instruments")
print(f"The validation set has {n_validation_instruments} unique instruments")
print(f"The testing set has {n_testing_instruments} unique instruments")


# Do validation and testing contain exactly the same instruments?
same_held_out_instruments = set(validation_df["instrument"]) == set(testing_df["instrument"])
print(
    "The validation and testing sets have the same instruments"
    if same_held_out_instruments
    else "The validation and testing sets have different instruments"
)






In [None]:


# Descriptive statistics of the pitch column in the training set
pitch = training_df['pitch']

pitch_min, pitch_max = pitch.min(), pitch.max()
print(f"The minimum pitch is {pitch_min}")
print(f"The maximum pitch is {pitch_max}")

pitch_mean, pitch_median = pitch.mean(), pitch.median()
print(f"The mean pitch is {pitch_mean}")
print(f"The median pitch is {pitch_median}")

pitch_std = pitch.std()
print(f"The standard deviation of the pitch is {pitch_std}")


# Histogram of the pitch values, one bin per integer pitch from 0 to 129
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(pitch, bins=range(130), edgecolor='black', alpha=0.7)
ax.set_title('Pitch Distribution in training seT')
ax.set_xlabel('Pitch')
ax.set_ylabel('Frequency')
plt.show()


# Note: the pitch is distributed symmetrically around 60 (ignoring the outliers)


# Inspect the outliers visible in the plot: the 30 most frequent pitch values
pitch_counts = pitch.value_counts()
print(pitch_counts.head(30))






In [None]:
# Correlation between instrument_family and pitch (not the best way to do it,
# since instrument_family is a category code, but a quick check).
# The float cast happens first and `pitch` is re-read from the DataFrame, so
# the correlation neither depends on a variable left behind by another cell
# nor on how pandas treats object-dtype columns.
training_df['pitch'] = training_df['pitch'].astype(float)
pitch = training_df['pitch']
instrument_family = training_df['instrument_family']
correlation_value = instrument_family.astype(float).corr(pitch)
print(f"Correlation between instrument_family and pitch: {correlation_value}")

# Statistical test to check whether the pitch differs between instrument families

# D'Agostino's K-squared test: is pitch normally distributed?
stat, p = normaltest(training_df['pitch'])

print(f"D’Agostino-K² Test Statistic: {stat:.4f}")
print(f"D’Agostino-K² p-value: {p:.4e}")

if p < 0.05:
    print("Data is NOT normally distributed ❌")
else:
    print("Data is normally distributed ✅")


# As pitch is not normally distributed, we have to use a non-parametric test

# Kruskal-Wallis test over all instrument families. groupby splits the pitch
# column into one sample per family in a single pass; the order of the groups
# does not matter for the test.
pitch_by_family = [
    family_pitch
    for _, family_pitch in training_df.groupby("instrument_family_str")["pitch"]
]
kruskal_stat, kruskal_p = kruskal(*pitch_by_family)

print(f"Kruskal-Wallis Test Statistic: {kruskal_stat:.4f}")
print(f"Kruskal-Wallis Test p-value: {kruskal_p:.4e}")

if kruskal_p < 0.05:
    print("Significant difference in pitch across instrument families ❌")
else:
    print("No significant difference in pitch across instrument families ✅")






# NOTE(review): this plot groups pitch by velocity, not by instrument family,
# and repeats the plot drawn in the velocity cell — confirm whether
# x="instrument_family_str" was intended here.
plt.figure(figsize=(12, 6))
sns.violinplot(x="velocity", y="pitch", data=training_df)
plt.xlabel("Velocity")
plt.ylabel("Pitch")
plt.title("Pitch Density Across Velocity Levels for training set")
plt.show()


In [None]:
# Correlation between pitch and velocity
training_df['velocity'] = training_df['velocity'].astype(float)
velocity = training_df['velocity']

print(f"The velocity values are quite evenly distributed: {velocity.value_counts()}")

# Read pitch from the DataFrame and make sure it is numeric, rather than
# relying on a `pitch` variable left behind by an earlier cell (which may
# still hold the column as it was before any numeric conversion).
pitch_values = pd.to_numeric(training_df['pitch'])
correlation_value = velocity.corr(pitch_values)
print(f"Correlation between velocity and pitch: {correlation_value}")


# Pitch density for each velocity level
plt.figure(figsize=(12, 6))
sns.violinplot(x="velocity", y="pitch", data=training_df)
plt.xlabel("Velocity")
plt.ylabel("Pitch")
plt.title("Pitch Density Across Velocity Levels")
plt.show()

# Visually, pitch and velocity are not correlated, and the distribution of pitch is similar across all velocity levels




In [None]:
# Want to see whether the qualities differ between instrument families

# Expand the per-sample qualities list into one column per list position.
# Building the frame from the list of lists gives the same columns as
# .apply(pd.Series) without creating a Series object for every row.
# (assumes each entry of "qualities" is a list of 0/1 flags — TODO confirm)
qualities_df = pd.DataFrame(training_df["qualities"].tolist(), index=training_df.index)
qualities_df.columns = [f"quality_{i}" for i in range(qualities_df.shape[1])]

# Add the instrument family column for comparison
qualities_df["instrument_family"] = training_df["instrument_family"]

# Sum each quality column within every instrument family
quality_counts = qualities_df.groupby("instrument_family").sum()


print(quality_counts)


# Run the chi-square test on the quality counts. chi2_contingency raises a
# ValueError when an expected frequency is zero, which happens when a whole
# row or column of the table is zero, so those are left out of the test.
# NOTE(review): a sample may carry several qualities at once, so the cells may
# not be independent counts — treat the p-value as indicative only.
rows_with_counts = quality_counts.sum(axis=1) > 0
columns_with_counts = quality_counts.sum(axis=0) > 0
testable_counts = quality_counts.loc[rows_with_counts, columns_with_counts]
if testable_counts.shape != quality_counts.shape:
    print("Rows/columns without any occurrence were left out of the chi-square test")

chi2_stat, p, dof, expected = chi2_contingency(testable_counts)

print(f"Chi-Square Test Statistic: {chi2_stat:.4f}")
print(f"Chi-Square Test p-value: {p:.4e}")

# Interpretation
if p < 0.05:
    print("There is a significant relationship between qualities and instrument families ❌")
else:
    print("No significant relationship between qualities and instrument families ✅")


In [None]:
# Show the training DataFrame as the cell output for a final visual check.
training_df

In [None]:
# NOTE(review): prints a fixed greeting and nothing else; this looks like a
# leftover smoke-test cell — confirm whether it can be removed.
print("Hello")