In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.linear_model import LinearRegression, LogisticRegression
import duckdb
import requests
from bs4 import BeautifulSoup
import time
import unicodedata

In [90]:
# Load the two raw inputs from CSV files in the working directory:
#   bill.csv -> Billboard chart table, data.csv -> Spotify track table.
billboard = pd.read_csv('bill.csv')
spotify = pd.read_csv('data.csv')


In [91]:
# Fold a featured artist into the main 'Artist' string:
#   "A feat. B" / "A ft. B"  ->  "A, B"
# This is done with one regex substitution (first occurrence only) rather than
# str.split(expand=True) into a temporary 'FeatureArtist' column. The two-column
# assignment raises ValueError whenever no row contains a feature marker (for
# example when this cell is re-run on already-cleaned data), and the row-wise
# apply(axis=1) is not needed to join the two halves back together.
billboard['Artist'] = billboard['Artist'].str.replace(
    r'\s+(?:feat\.|ft\.)\s+', ', ', n=1, regex=True
)

# Strip "(feat. ...)" / "(ft. ...)" credits from Spotify track names.
# The earlier patterns r'\(feat.\..*?\)' and r'\(ft.\..*?\)' had an unescaped
# '.' in front of the escaped one, so they needed two characters after
# "feat"/"ft" and never matched the ordinary "(feat. X)" form.
spotify['name'] = (
    spotify['name']
    .str.replace(r'\((?:feat|ft)\..*?\)', '', regex=True)
    .str.strip()
)

# Flatten the 'artists' column: trim surrounding square brackets, drop single
# quotes, lower-case, then delete the listed punctuation characters.
# The comma is intentionally absent from the punctuation list here, so it
# survives this step.
# NOTE(review): presumably each value is a stringified list such as
# "['a', 'b']" -- confirm against data.csv.
spotify['artists'] = (
    spotify['artists']
    .str.strip("[]")
    .str.replace("'", "")
    .str.lower()
    .str.translate(str.maketrans('', '', "!\"#$%&'()*+-./:;<=>?@[\\]^_`{|}~"))
    .str.strip()
)



In [92]:
def remove_accents(text):
    """Return ``text`` with accents stripped and non-ASCII characters removed.

    Strings are NFKD-normalised (so an accented letter decomposes into a base
    letter plus combining marks) and then encoded to ASCII with errors ignored,
    which drops the combining marks along with any character that has no ASCII
    form. Non-string values (e.g. NaN or None) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

def _normalize_key(column, separators_to_space=False):
    """Normalise a text column so it can be used as an exact-match join key.

    Steps, in order:
      1. strip accents / non-ASCII characters via ``remove_accents``;
      2. lower-case;
      3. delete parenthesised segments such as "(remastered)";
      4. optionally turn '-', '/' and '&' into spaces;
      5. delete every character that is not a-z or whitespace;
      6. collapse whitespace runs to a single space and trim the ends.

    Step 4 has to run before step 5. Previously the separator replacement was
    applied after the a-z filter had already deleted those characters, so it
    never had any effect ("rock-a-bye" became "rockabye", not "rock a bye").
    Step 6 stops keys from differing only by the gaps that deleted characters
    leave behind ("a & b" -> "a  b" would not equal "a b").

    Parameters
    ----------
    column : pandas.Series
        Text values; non-string entries (e.g. NaN) pass through as missing.
    separators_to_space : bool, default False
        When True, '-', '/' and '&' become spaces instead of being deleted.

    Returns
    -------
    pandas.Series
        The normalised values, aligned to the index of ``column``.
    """
    cleaned = (
        column.apply(remove_accents)
        .str.lower()
        .str.replace(r'\(.*?\)', '', regex=True)
    )
    if separators_to_space:
        cleaned = cleaned.str.replace(r'[-/&]', ' ', regex=True)
    return (
        cleaned.str.replace(r'[^a-z\s]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Billboard join keys.
billboard['Artist'] = _normalize_key(billboard['Artist'])
billboard['Song'] = _normalize_key(billboard['Song'], separators_to_space=True)

# Spotify join keys ('artists' and 'name'), normalised exactly like their
# Billboard counterparts so that equal titles/artists compare equal.
spotify['artists'] = _normalize_key(spotify['artists'])
spotify['name'] = _normalize_key(spotify['name'], separators_to_space=True)

In [93]:
# Last pass over the Spotify join keys: lower-case and trim both columns together.
# (To see how many distinct (name, artists) pairs exist relative to the row count,
# compare spotify.shape with spotify.groupby(['name', 'artists']).ngroups.)
spotify[['name', 'artists']] = spotify[['name', 'artists']].apply(
    lambda column: column.str.lower().str.strip()
)




In [94]:
# Attach Spotify audio features to every Billboard chart entry.
# The LEFT JOIN keeps chart rows that have no Spotify match (their feature
# columns come back NULL/NaN); MEDIAN collapses multiple Spotify rows sharing
# the same (name, artists) key into one value per chart entry.
# The result is converted to a DataFrame and de-duplicated in one chain.
top_music = (
    duckdb.sql("""
                        SELECT billboard.Year, billboard.Rank, billboard.Artist, billboard.Song, 
                               MEDIAN(spotify.valence) AS valence, MEDIAN(spotify.danceability) AS danceability, 
                               MEDIAN(spotify.energy) AS energy, MEDIAN(spotify.tempo) AS tempo
                        FROM billboard
                        LEFT JOIN spotify ON billboard.Song = spotify.name AND billboard.Artist = spotify.artists
                        GROUP BY billboard.Year, billboard.Artist, billboard.Song, billboard.Rank
                        ORDER BY billboard.Year, billboard.Rank
                    """)
    .df()
    .drop_duplicates()
)

In [95]:
# Diagnostics: which chart entries still have no Spotify features?
# A row counts as unmatched when its aggregated 'valence' is missing.
no_match = top_music.loc[top_music['valence'].isna()]

# Preview the identifying columns of the unmatched rows, then their total count.
print(no_match.loc[:, ['Year', 'Rank', 'Artist', 'Song']].head())
print(no_match.shape)

# Count the unmatched entries whose Year falls in 1970-1979.
after = duckdb.sql("""
                       SELECT COUNT(*) 
                       FROM no_match
                       WHERE Year < 1980 AND Year >=1970   
                    """).df()

print(after)

   Year  Rank           Artist                       Song
0  1960     1      percy faith  theme from a summer place
2  1960     3  everly brothers               cathys clown
3  1960     4   johnny preston               running bear
4  1960     5     mark dinning                 teen angel
7  1960     8      jimmy jones                  handy man
(2124, 8)
   count_star()
0           425


In [None]:
# Write the results to CSV without the index column:
# the joined chart table, the cleaned Spotify table, and the unmatched rows.
top_music.to_csv('top.csv', index=False)
spotify.to_csv('spot.csv', index=False)
no_match.to_csv('no_match.csv', index=False)