In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import regex as re

In [2]:
# 2. Store the URL of the page to scrape (the popvortex.com top-100 songs chart).
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
# 3. Download the HTML with a GET request.
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)

In [4]:
response.status_code # expect 200 (OK) before parsing the response body

200

In [5]:
#response.content

In [6]:
# 4.1. Parse the downloaded bytes into a BeautifulSoup tree (the 'soup')
#      using the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
# 4.2. check that the html code looks like it should
#soup

In [8]:
# 5. Sanity check: count the elements matched by the CSS selector copied from the
#    browser, i.e. the direct <div> children of the chart wrapper.
#    NOTE(review): presumably one <div> per chart entry plus a few extra blocks — confirm.
len(soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div "))

105

In [9]:
# Collect one (song title, artist name) tuple per chart entry on the page.
song_info = []

for entry in soup.find_all('p', class_='title-artist'):
    title_tag = entry.find('cite', class_='title')
    artist_tag = entry.find('em', class_='artist')
    # find() returns None when the element is absent; skip such entries
    # instead of crashing on None.get_text().
    if title_tag is None or artist_tag is None:
        continue
    song_info.append((title_tag.get_text(), artist_tag.get_text()))

In [10]:
# Each (title, artist) tuple in song_info becomes one row; the two tuple
# positions become the 'Song' and 'Artist' columns.
top_100 = pd.DataFrame(song_info, columns=['Song', 'Artist'])

top_100

Unnamed: 0,Song,Artist
0,Lovin On Me,Jack Harlow
1,Lil Boo Thang,Paul Russell
2,I Remember Everything (feat. Kacey Musgraves),Zach Bryan
3,White Horse,Chris Stapleton
4,Save Me (with Lainey Wilson),Jelly Roll
...,...,...
95,23,Chayce Beckham
96,This Wish,Ariana DeBose & Disney
97,Everything I Love,Morgan Wallen
98,G6,2 Chainz & Lil Wayne


In [11]:
# Fetch the top-100 albums chart page.
url = "https://www.popvortex.com/music/charts/top-100-albums.php"
# The timeout prevents an indefinite hang on an unresponsive server.
response = requests.get(url, timeout=10)
# Raise on a 4xx/5xx reply so an error page is never parsed as if it were the chart.
response.raise_for_status()

In [12]:
# Parse the most recently downloaded response; `soup` is rebound to this new tree.
soup = BeautifulSoup(response.content, "html.parser")

In [13]:
# Collect one (album title, artist name) tuple per chart entry on the page.
album_info = []

for entry in soup.find_all('p', class_='title-artist'):
    title_tag = entry.find('cite', class_='title')
    artist_tag = entry.find('em', class_='artist')
    # find() returns None when the element is absent; skip such entries
    # instead of crashing on None.get_text().
    if title_tag is None or artist_tag is None:
        continue
    album_info.append((title_tag.get_text(), artist_tag.get_text()))

In [14]:
# Each (title, artist) tuple in album_info becomes one row of the frame.
# NOTE(review): the first column holds album titles but is labelled 'Song' —
# confirm whether that label is intended before relying on it.
top_album = pd.DataFrame(album_info, columns=['Song', 'Artist'])

#top_album

Unnamed: 0,Song,Artist
0,Rockstar,Dolly Parton
1,Higher,Chris Stapleton
2,New Blue Sun,André 3000
3,1989 (Taylor's Version) [Deluxe],Taylor Swift
4,Hannah Waddingham: Home For Christmas (Soundtr...,Hannah Waddingham
...,...,...
95,The Lost Christmas Eve,Trans-Siberian Orchestra
96,Chronicle: The 20 Greatest Hits,Creedence Clearwater Revival
97,The Classic Christmas Album,Elvis Presley
98,Hot Rocks 1964-1971,The Rolling Stones


### Wikipedia - List of Songs Recorded by Maroon 5

In [18]:
# Fetch and parse the Wikipedia list of songs recorded by Maroon 5.
url = "https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Maroon_5"
# The timeout prevents an indefinite hang on an unresponsive server.
response = requests.get(url, timeout=10)
# Raise on a 4xx/5xx reply so an error page is never parsed as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [19]:
# 4.1. Parse the html (create the 'soup').
# NOTE(review): this repeats the parse already done at the end of the previous
# cell on the same response, so it yields an equivalent tree.
soup = BeautifulSoup(response.content, "html.parser")
#soup

In [20]:
# Take the first <table> with class "wikitable" on the page
# (find() returns None if no such table exists).
table = soup.find('table', class_='wikitable')
#print(table)

In [21]:
# Read the song title from the first cell of every row after the header row.
maroon5_songs = []

for row in table.find_all('tr')[1:]:
    cells = row.find_all(['td', 'th'])
    # A row without any cells has no title to read; skip it instead of
    # failing on cells[0].
    if not cells:
        continue
    # Trim surrounding whitespace, then remove apostrophes and double quotes.
    maroon5_songs.append(cells[0].text.strip().replace('\'', '').replace('"', ''))

# The scalar 'Maroon 5' is broadcast to every row as the artist.
maroon5 = pd.DataFrame({'Song': maroon5_songs, 'Artist': 'Maroon 5'})
maroon5

Unnamed: 0,Song,Artist
0,Animals,Maroon 5
1,Back At Your Door,Maroon 5
2,Beautiful Goodbye,Maroon 5
3,Beautiful Mistakes (featuring Megan Thee Stall...,Maroon 5
4,Better That We Break,Maroon 5
...,...,...
103,Whiskey (featuring ASAP Rocky),Maroon 5
104,Who I Am (featuring LunchMoney Lewis),Maroon 5
105,Wipe Your Eyes,Maroon 5
106,Woman,Maroon 5


### Wikipedia - List of Most Streamed Songs in the UK

In [22]:
# Fetch and parse the Wikipedia list of most-streamed songs in the United Kingdom.
url = "https://en.wikipedia.org/wiki/List_of_most-streamed_songs_in_the_United_Kingdom"
# The timeout prevents an indefinite hang on an unresponsive server.
response = requests.get(url, timeout=10)
# Raise on a 4xx/5xx reply so an error page is never parsed as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [23]:
# Take the first <table> with class "wikitable" on the page
# (find() returns None if no such table exists).
table = soup.find('table', class_='wikitable')

In [24]:
# The table ends with a non-conforming row that holds comments, so only the first
# MAX_UK_ROWS rows after the header are read. 48 was found by trial and error;
# ranks 15 and 16 are missing from the table on Wikipedia.
MAX_UK_ROWS = 48

songs = []
uk_artists = []

# Slice off the header row and cap the number of rows in one step.
for row in table.find_all('tr')[1:1 + MAX_UK_ROWS]:
    cells = row.find_all(['td', 'th'])
    # The song is read from the second cell and the artist from the third;
    # skip any row too short to hold both instead of raising IndexError.
    if len(cells) < 3:
        continue
    songs.append(cells[1].text.strip().replace('"', ''))
    uk_artists.append(cells[2].text.strip())

uk_songs = pd.DataFrame({'Song': songs, 'Artist': uk_artists})

In [25]:
# Stack the three sources into one catalogue. Without ignore_index each source
# keeps its own index labels here, so labels repeat across sources.
top_songs = pd.concat([top_100, maroon5, uk_songs])
top_songs.shape

(256, 2)

In [26]:
# Give the combined catalogue a fresh 0..n-1 index and discard the old labels
# in a single step, then preview the last rows.
top_songs = top_songs.reset_index(drop=True)
top_songs.tail()

Unnamed: 0,Song,Artist
251,Vossi Bop,Stormzy
252,Photograph,Ed Sheeran
253,Sorry,Justin Bieber
254,Closer,The Chainsmokers featuring Halsey
255,Do I Wanna Know?,Arctic Monkeys


In [27]:
# Re-check the shape after the index reset.
top_songs.shape

(256, 2)

### Song Recommender First Iteration
- Ask the user for a song title and an artist name
- If the song is present in the list, recommend another song from the list
- If not, print a polite message

In [28]:
def song_recommender(title, artist):
    """Print a random recommendation if the given song is in the catalogue.

    The title and artist are normalised (lower-cased, cleaned with
    ``remove_special_characters``, surrounding whitespace stripped) and compared
    with the same normalisation of the ``Song`` and ``Artist`` columns of the
    module-level ``top_songs_lower`` frame. On a match, the matching rows are
    removed from ``top_songs`` and one of the remaining rows is sampled at
    random and printed. If nothing matches, or no other song is left to
    recommend, an apology is printed instead.

    Parameters
    ----------
    title : str
        Song title as typed by the user.
    artist : str
        Artist name as typed by the user.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    sorry_message = 'We are sorry but we cannot provide a recommendation right now. We are working on it!'

    def _normalise(text):
        # One normalisation for both the user input and the catalogue values.
        return remove_special_characters(str(text).lower()).strip()

    title_input = _normalise(title)
    artist_input = _normalise(artist)

    # Normalise the catalogue columns the same way as the input; otherwise an
    # entry containing punctuation (e.g. "&" in an artist name) could never
    # equal the cleaned input.
    is_match = (
        top_songs_lower['Song'].map(_normalise).eq(title_input)
        & top_songs_lower['Artist'].map(_normalise).eq(artist_input)
    )

    if not is_match.any():
        print(sorry_message)
        return

    # Never recommend the song the user just entered.
    indices_to_remove = top_songs_lower.index[is_match.to_numpy()]
    candidates = top_songs.drop(indices_to_remove)

    # sample() raises on an empty frame, so handle the "nothing left" case.
    if candidates.empty:
        print(sorry_message)
        return

    rec_song = candidates.sample(n=1).iloc[0]
    print(f"You might also like: {rec_song['Song']} by {rec_song['Artist']}")

def remove_special_characters(text):
    """Return ``text`` with every character dropped that is neither alphanumeric nor whitespace."""
    kept = [ch for ch in text if ch.isalnum() or ch.isspace()]
    return ''.join(kept)

In [1]:
# Build the normalised lookup copy of the catalogue: every column is cast to str,
# lower-cased and cleaned with remove_special_characters. Both 'Song' and 'Artist'
# are cleaned because song_recommender cleans both user inputs before comparing.
top_songs_lower = top_songs.apply(
    lambda col: col.astype(str).str.lower().apply(remove_special_characters)
)

NameError: name 'top_songs' is not defined

In [30]:
# Prompt for a song title and an artist name, then print a recommendation
# (or an apology) for that pair.
title_input = input('Please Enter the Song Title: ')
artist_input = input('Please Enter the Artist Name: ')
song_recommender(title_input, artist_input)

KeyboardInterrupt: Interrupted by user

In [31]:
# Save the combined catalogue to top_songs.csv without writing the index as a column.
top_songs.to_csv('top_songs.csv', index=False)