In [None]:
import os

import pandas as pd
from pyspark.sql import SparkSession
# Let pandas render up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

In [None]:
# Step 1: Create a Spark session
# getOrCreate() hands back an already-running session when one exists, so this
# cell can be re-run; "local[*]" is the master URL configured for the session.
spark = (
    SparkSession.builder
    .appName("Read Parquet Example")
    .config("spark.master", "local[*]")
    .getOrCreate()
)

In [None]:
# Step 2: Define the Parquet file path
# The location can be overridden with the PARQUET_FILE_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
parquet_file_path = os.environ.get(
    'PARQUET_FILE_PATH',
    '/Users/trevor/trevorscholz1/spotify/music-feed-examples/python_example/apple_music_songs/115_part_song_2025-05-14T16-01',
)

In [None]:
# Step 3: Read the Parquet file
# Loads the dataset at `parquet_file_path` through the Spark session's Parquet
# reader; `df` is a Spark DataFrame at this point.
df = spark.read.parquet(parquet_file_path)

In [None]:
# Pull a bounded sample of the Spark DataFrame into pandas.
# `df` is rebound from a Spark DataFrame to a pandas DataFrame here, so the
# guard keeps this cell re-runnable: once `df` is already a pandas frame there
# is nothing left to convert (and pandas frames have no `.limit`).
SAMPLE_ROWS = 1000  # maximum number of rows brought into local memory
if not isinstance(df, pd.DataFrame):
    df = df.limit(SAMPLE_ROWS).toPandas()

In [None]:
# Shut down the Spark session; the rows used downstream were already converted
# to a pandas DataFrame by the toPandas() call in the previous cell.
spark.stop()

In [None]:
# Keep only the columns used for feature engineering.
# The explicit .copy() makes `train` an independent frame, so the column
# assignments in later cells modify `train` itself rather than a slice of `df`
# (avoids pandas' SettingWithCopyWarning and view-vs-copy ambiguity).
train = df[[
    'nameDefault',
    'parentalAdvisoryType',
    'primaryArtists',
    'album',
    'releaseDate',
    'durationInMillis',
    'genres',
]].copy()

In [None]:
def clean_data(dirty):
    """Extract the text following the last ``name='`` marker in ``dirty``.

    ``dirty`` is first converted with ``str()``. The result is the text
    between the final occurrence of ``name='`` and the first ``')`` that
    follows it. When a marker is absent, nothing is trimmed on that side, so
    a value containing neither marker comes back as its plain string form.

    Args:
        dirty: Any object; only its ``str()`` representation is inspected.

    Returns:
        str: The extracted text.
    """
    text = str(dirty)
    # Everything after the last "name='" (or the whole text if it never occurs).
    _, _, after_marker = text.rpartition("name='")
    # Everything before the first "')" (or all of it if that never occurs).
    value, _, _ = after_marker.partition("')")
    return value

def get_date(dirty):
    """Return the value stored under the ``'default'`` key of ``dirty``.

    Args:
        dirty: An object supporting ``dirty['default']`` lookup.

    Returns:
        Whatever value is stored under ``'default'``.
    """
    return dirty['default']

def get_genres(dirty):
    """Extract a name from each element of ``dirty``.

    Each element is converted with ``str()``; the result for that element is
    the text between the last ``name='`` marker and the next single quote.
    Elements without the marker are kept up to their first single quote (or
    whole, if they contain none).

    Args:
        dirty: An iterable of objects; only their ``str()`` form is inspected.

    Returns:
        list[str]: One extracted name per element, in input order.
    """
    return [
        str(entry).rpartition("name='")[2].partition("'")[0]
        for entry in dirty
    ]

In [None]:
# Replace each raw column with its cleaned form. The mapping lists the columns
# in the order they are processed, together with the cleaner applied to each.
column_cleaners = {
    'primaryArtists': clean_data,
    'album': clean_data,
    'releaseDate': get_date,
    'genres': get_genres,
}
for column, cleaner in column_cleaners.items():
    train[column] = train[column].apply(cleaner)

In [None]:
# Display the frame as the cell output to inspect the result of the cleaning step.
train

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# 1. Prepare data

# Parse releaseDate once, store the parsed values back on the frame, and
# derive the calendar year and month from the same parsed series.
release_dates = pd.to_datetime(train['releaseDate'])
train['releaseDate'] = release_dates
train['year'] = release_dates.dt.year
train['month'] = release_dates.dt.month

# One encoder per categorical column; they stay available afterwards so the
# integer codes can be mapped back to the original labels.
le_advisory, le_artist = LabelEncoder(), LabelEncoder()

# Integer-code the parental advisory type.
train['parentalAdvisory_encoded'] = le_advisory.fit_transform(train['parentalAdvisoryType'])

# Integer-code primaryArtists (optional, can be high cardinality).
train['artist_encoded'] = le_artist.fit_transform(train['primaryArtists'])

# genres holds a list per row, so it is expanded into one indicator column per
# distinct genre with MultiLabelBinarizer rather than a single integer code.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(train['genres'])

# Put it all together into a feature matrix.
# These columns are taken from `train`, so `features` carries train's index.
features = pd.DataFrame({
    'durationInMillis': train['durationInMillis'],
    'year': train['year'],
    'month': train['month'],
    'parentalAdvisory': train['parentalAdvisory_encoded'],
    'artist': train['artist_encoded']
})

# Add the genre one-hot columns.
# pd.concat(axis=1) aligns rows by index label, not by position. Giving the
# genre frame train's index keeps every genre row attached to its own song
# even if `train` does not have a plain 0..n-1 index; without it, mismatched
# labels would produce NaN-filled or extra rows.
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=train.index)
features = pd.concat([features, genres_df], axis=1)

# Optional: scale the features so each column contributes on a comparable
# scale to the distances KMeans computes.
scaler = StandardScaler()
X = scaler.fit_transform(features)
# NOTE(review): KMeans rejects NaN inputs; this presumes releaseDate and
# durationInMillis are fully populated for the sampled rows — confirm.

# 2. Run KMeans
N_CLUSTERS = 5  # choose clusters number as you see fit
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases (10 -> 'auto'), so leaving it implicit gives
# version-dependent cluster assignments and a FutureWarning on some versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
train['cluster'] = kmeans.fit_predict(X)

# Now your train dataframe has a 'cluster' column showing cluster assignment

In [None]:
# Display the frame again, now including the engineered columns and the
# 'cluster' assignment added by the clustering cell.
train