In [24]:
import pandas as pd

# Load the dataset
file_path = "music_mental.csv"  # Replace with your actual file path
df = pd.read_csv(file_path)

# Drop unnecessary columns
df.drop(columns=["Timestamp", "Permissions"], inplace=True, errors='ignore')

# Convert categorical ordinal variables to numerical values
ordinal_mapping = {
    "Never": 0,
    "Rarely": 1,
    "Sometimes": 2,
    "Very frequently": 3
}

# Apply the mapping to frequency columns
frequency_columns = [col for col in df.columns if "Frequency" in col]
df[frequency_columns] = df[frequency_columns].replace(ordinal_mapping)

# Convert Yes/No columns to binary
binary_columns = ["While working", "Instrumentalist", "Composer", "Exploratory", "Foreign languages"]
df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})

# Encode categorical columns (Primary streaming service and Fav genre) using one-hot encoding
df = pd.get_dummies(df, columns=["Primary streaming service", "Fav genre"], drop_first=True,dtype = int)

# Convert Music effects column to numerical values
music_effects_mapping = {
    "No effect": 0,
    "Improve": 1,
    "Worsen": -1
}
df["Music effects"] = df["Music effects"].replace(music_effects_mapping)


# Convert numerical columns to proper types
numeric_columns = ["Age", "Hours per day", "BPM", "Anxiety", "Depression", "Insomnia", "OCD"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Fill missing values (impute with median for numerical, mode for categorical)
df.fillna(df.median(numeric_only=True), inplace=True)

# Save the cleaned dataset
df.to_csv("cleaned_dataset.csv", index=False)

# Display summary of cleaned data
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 49 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Age                                                          736 non-null    float64
 1   Hours per day                                                736 non-null    float64
 2   While working                                                736 non-null    float64
 3   Instrumentalist                                              736 non-null    float64
 4   Composer                                                     736 non-null    float64
 5   Exploratory                                                  736 non-null    int64  
 6   Foreign languages                                            736 non-null    float64
 7   BPM                                                          736 non-null    flo

  df[frequency_columns] = df[frequency_columns].replace(ordinal_mapping)
  df[binary_columns] = df[binary_columns].replace({"Yes": 1, "No": 0})
  df["Music effects"] = df["Music effects"].replace(music_effects_mapping)
