In [0]:
# This notebook was for acquring the track details for all the pickels in the dataset

# Setting up general imports and boto3

In [0]:
# General Setup
import pandas as pd
import numpy as np
# !pip install boto3
import boto3
import os
import time
import pickle

# AWS Credentials and Settings
access_key = 'ACCESS_KEY'
secret_key = 'SECRET_ACCESS_KEY'

os.environ['AWS_ACCESS_KEY_ID'] = access_key
os.environ['AWS_SECRET_ACCESS_KEY'] = secret_key
encoded_secret_key = secret_key.replace("/", "%2F").replace("+", "%2B")

aws_region = 'us-east-1'

s3 = boto3.client(
    service_name='s3',
    region_name=aws_region,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)



#Setting up Spark

In [0]:
# Spark Setup
# Setting up Spark
# !pip install pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession, Row, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, explode, col, collect_list, regexp_replace, split, expr, length, concat_ws, count, size, first
from pyspark.sql.types import ArrayType, StringType, IntegerType, FloatType, DoubleType
from pyspark import sql
import pyspark.pandas as ps

# Set up Spark
spark = SparkSession.builder \
    .appName("PicklesPlus") \
    .config("spark.hadoop.fs.s3a.access.key", access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
    .config("spark.hadoop.fs.s3a.endpoint","s3." + aws_region + ".amazonaws.com") \
    .config("spark.executor.memory", "15g") \
    .config("spark.executor.cores", "2") \
    .config("spark.default.parallelism", "4") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

#Importing ML stuff

In [0]:
# Importing ML libraries
from pyspark.ml.stat import Correlation, ChiSquareTest, Summarizer

from pyspark.ml.feature import StringIndexer, Tokenizer, HashingTF, IDF, VectorAssembler, StandardScaler

from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Load in data

In [0]:
main = spark.read.parquet('s3a://kagglespotify6k/raw/main_dataset.parquet/')

tracks_parq = spark.read.parquet('s3a://kagglespotify6k/raw/pickles/pickle_tracks_test.parquet/')

# main contains some duplicate columns
columns_to_drop = ['key', 'loudness', 'mode', 'tempo', 'time_signature']
main_merge = main.drop(*columns_to_drop)

# Join the 2 dataframes
merged_df = tracks_parq.join(main_merge, tracks_parq['track_name'] == main_merge['track_uri'])

# Getting all the pickles

In [0]:
all_tracks = main.select(col('track_uri')).collect()
track_list = [row['track_uri'] for row in all_tracks]
present_tracks = merged_df.select(col('track_uri')).collect()
present_list = [row2['track_uri'] for row2 in present_tracks]

missing_tracks = [track for track in track_list if track not in present_list]

print(len(track_list))
print(len(present_list))
print(len(missing_tracks))
print(len(track_list) - len(present_list))

277923
30544
247379
247379


In [0]:
# test_set = missing_tracks[0:5000]

track_attr = ['duration','loudness','tempo','tempo_confidence','time_signature','time_signature_confidence','key','key_confidence','mode','mode_confidence']

# Attempt 1
def get_track(pk_file):
    s3 = boto3.client('s3')
    curr_pickle = s3.get_object(Bucket='kagglespotify6k', Key=f'landing/Cleaned Analyses/Cleaned Analyses/{pk_file}.pickle')
    pk_read = pickle.loads(curr_pickle['Body'].read())
    track_data = pk_read['track']
    filtered_track = {key: track_data[key] for key in track_attr if key in track_data}
    filtered_track['track_uri'] = pk_file
    return(filtered_track)

# test_rdd = spark.sparkContext.parallelize(test_set)
# track_gets = test_rdd.map(get_track)
# fin_track = track_gets.collect()
# rows = [Row(**data) for data in fin_track]
# df = spark.createDataFrame(rows)
# df.show()

# Took 4 seconds for 10 records
# 14.78 seconds for 100 records - 6.76 per second
# 1.61 minutes for 1000 records - 10.35 per second
# 25.41 minutes for 10,000 records - 6.55 per second
# 7.74 minutes for 5000 records - 10.76 per second

# Instead of making a data frame each time or shoving it into a dataframe for each iteration
# Store it all in a dictionary or list, then create a dataframe at the en

In [0]:
missing_section_1 = [missing_tracks[0:5000], missing_tracks[5000:10000], missing_tracks[10000:15000], missing_tracks[15000:20000], missing_tracks[20000:25000], missing_tracks[25000:30000]]

missing_section_2 = [missing_tracks[30000:35000], missing_tracks[35000:40000], missing_tracks[40000:45000], missing_tracks[45000:50000], missing_tracks[50000:55000], missing_tracks[55000:60000]]

missing_section_3 = [missing_tracks[60000:65000], missing_tracks[65000:70000], missing_tracks[70000:75000], missing_tracks[75000:80000], missing_tracks[80000:85000], missing_tracks[85000:90000]]

missing_section_4 = [missing_tracks[90000:95000], missing_tracks[95000:100000], missing_tracks[100000:105000], missing_tracks[105000:110000], missing_tracks[110000:115000], missing_tracks[115000:120000]]

missing_section_5 = [missing_tracks[120000:125000], missing_tracks[125000:130000], missing_tracks[130000:135000], missing_tracks[135000:140000], missing_tracks[140000:145000], missing_tracks[145000:150000]]

missing_section_6 = [missing_tracks[150000:155000], missing_tracks[155000:160000], missing_tracks[160000:165000], missing_tracks[165000:170000], missing_tracks[170000:175000], missing_tracks[175000:180000]]

missing_section_7 = [missing_tracks[180000:185000], missing_tracks[185000:190000], missing_tracks[190000:195000], missing_tracks[195000:200000], missing_tracks[200000:205000], missing_tracks[205000:210000]]

missing_section_8 = [missing_tracks[210000:215000], missing_tracks[220000:225000], missing_tracks[230000:235000], missing_tracks[235000:240000], missing_tracks[240000:245000], missing_tracks[245000:247379]]

In [0]:
fin_track = []

In [0]:


for section in missing_section_1:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 47.31 minutes - ~30,000 records

In [0]:
for section in missing_section_2:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 46.60 minutes - ~30,000 records

In [0]:
for section in missing_section_3:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 47.61 minutes - ~30,000 records

In [0]:
rows = [Row(**data) for data in fin_track]
missing_tracks_df = spark.createDataFrame(rows)
missing_tracks_df.show(5)

+---------+--------+-------+----------------+--------------+-------------------------+---+--------------+----+---------------+--------------------+
| duration|loudness|  tempo|tempo_confidence|time_signature|time_signature_confidence|key|key_confidence|mode|mode_confidence|           track_uri|
+---------+--------+-------+----------------+--------------+-------------------------+---+--------------+----+---------------+--------------------+
|217.57333|  -8.029|170.044|            0.05|             4|                      1.0|  0|         0.601|   0|          0.574|2CY92qejUrhyPUASa...|
|    253.0|  -4.999| 75.003|           0.195|             4|                    0.627|  0|         0.494|   1|          0.473|73xsMXuRNB3yqLeNc...|
| 216.1868|  -4.842| 83.571|           0.564|             4|                     0.99|  7|         0.061|   1|          0.405|6TwrBbgTaB5gpl06Y...|
| 123.6298|  -4.233|182.912|           0.143|             4|                      1.0|  7|         0.712|   0|  

In [0]:
missing_tracks_df.write.mode('overwrite').parquet('s3a://kagglespotify6k/trusted/tracks_data_0_to_90k')

In [0]:
for section in missing_section_4:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 49.03 minutes - 30k records
# 47.24 minutes - 30k records
# 46.18 minutes

In [0]:
for section in missing_section_5:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 56.86 minutes

In [0]:
 for section in missing_section_6:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 51.50 minutes

In [0]:
for section in missing_section_7:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 47.49 minutes

In [0]:
for section in missing_section_8:
    list_rdd = spark.sparkContext.parallelize(section)
    track_gets = list_rdd.map(get_track)
    fin_track.extend(track_gets.collect())

# 46.10 minutes

In [0]:
len(fin_track)

Out[16]: 147379

In [0]:
rows2 = [Row(**data) for data in fin_track]
missing_tracks_df2 = spark.createDataFrame(rows2)
missing_tracks_df2.show(5)

+---------+--------+-------+----------------+--------------+-------------------------+---+--------------+----+---------------+--------------------+
| duration|loudness|  tempo|tempo_confidence|time_signature|time_signature_confidence|key|key_confidence|mode|mode_confidence|           track_uri|
+---------+--------+-------+----------------+--------------+-------------------------+---+--------------+----+---------------+--------------------+
|425.33334|  -6.448|123.557|            0.29|             4|                    0.974|  3|         0.458|   1|          0.638|28Ri5diNoXNl0s3Ew...|
|379.35126|  -4.573|171.812|           0.134|             4|                    0.732|  9|         0.752|   1|          0.503|6P5inyUmrJuslHrIN...|
|213.70667|  -6.182|136.034|           0.857|             4|                    0.989|  6|         0.631|   1|           0.43|3yAw3cFOPzUPkklui...|
|199.83887|  -4.064|115.825|           0.163|             4|                      1.0|  9|         0.682|   1|  

In [0]:
missing_tracks_df2.write.mode('overwrite').parquet('s3a://kagglespotify6k/trusted/tracks_data_90k_to_270k')