# Objective: To Build a song Recommender

# Dataset used: Million Songs Dataset 
    
Source: http://labrosa.ee.columbia.edu/millionsong/ 

# Q1. Load Required Libraries and music data

In [1]:
import pandas as pd
import numpy as np

In [9]:
triplets_file = pd.read_csv('https://static.turi.com/datasets/millionsong/10000.txt',delimiter='\t',header=None)

In [10]:
triplets_file.head()

Unnamed: 0,0,1,2
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [12]:
triplets_file.columns = ['user_id', 'song_id', 'listen_count']

In [13]:
triplets_file.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [11]:
songs_metadata_file = pd.read_csv('https://static.turi.com/datasets/millionsong/song_data.csv')

In [14]:
songs_metadata_file.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [17]:
song_df = pd.merge(triplets_file, songs_metadata_file.drop_duplicates(['song_id']), on="song_id", how="left") 

In [18]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


Load Music data:
    
Read userid-songid-listen_count triplets
This step might take time to download data from external sources

triplets_file 'https://static.turi.com/datasets/millionsong/10000.txt'

songs_metadata_file 'https://static.turi.com/datasets/millionsong/song_data.csv'

Hint:
Merge the two dataframes above to create input dataframe for recommender systems(triplet and metadata file)

# Q2. Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

# Length of the dataset

In [19]:
len(song_df.index)

2000000

In [20]:
song_df.shape

(2000000, 7)

# Q3. Create a subset of the dataset

a. Show the most popular songs in the dataset

b. Count number of unique users in the dataset

c. Count the number of unique songs in the dataset

In [21]:
song_df = song_df.head(10000)

In [22]:
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

In [23]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly - Foo Fighters


In [24]:
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()

In [25]:
song_grouped.head()

Unnamed: 0,song,listen_count
0,#40 - DAVE MATTHEWS BAND,1
1,& Down - Boys Noize,4
2,'97 Bonnie & Clyde - Eminem,2
3,'Round Midnight - Miles Davis,3
4,'Till I Collapse - Eminem / Nate Dogg,6


In [26]:
grouped_sum = song_grouped['listen_count'].sum()

In [28]:
grouped_sum

10000

In [29]:
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100

In [30]:
song_grouped.head()

Unnamed: 0,song,listen_count,percentage
0,#40 - DAVE MATTHEWS BAND,1,0.01
1,& Down - Boys Noize,4,0.04
2,'97 Bonnie & Clyde - Eminem,2,0.02
3,'Round Midnight - Miles Davis,3,0.03
4,'Till I Collapse - Eminem / Nate Dogg,6,0.06


In [31]:
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch - Harmonia,45,0.45
4678,Undo - Björk,32,0.32
5105,You're The One - Dwight Yoakam,32,0.32
1071,Dog Days Are Over (Radio Edit) - Florence + Th...,28,0.28
3655,Secrets - OneRepublic,28,0.28
...,...,...,...
5139,high fives - Four Tet,1,0.01
5140,in white rooms - Booka Shade,1,0.01
5143,paranoid android - Christopher O'Riley,1,0.01
5149,¿Lo Ves? [Piano Y Voz] - Alejandro Sanz,1,0.01


In [32]:
users = song_df['user_id'].unique()
len(users)

365

# Count number of unique users in the dataset

# Count the number of unique songs in the dataset

In [33]:
###Fill in the code here
songs = song_df['song'].unique()
len(songs)

5151

# Q4. Create a song recommender

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
train_data.head(5)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
7389,94d5bdc37683950e90c56c9b32721edb5d347600,SOXNZOW12AB017F756,2,Half Of My Heart,Battle Studies,John Mayer,0,Half Of My Heart - John Mayer
9275,1012ecfd277b96487ed8357d02fa8326b13696a5,SOXHYVQ12AB0187949,1,The Beautiful People,Antichrist Superstar (Ecopac Explicit),Marilyn Manson,0,The Beautiful People - Marilyn Manson
2995,15415fa2745b344bce958967c346f2a89f792f63,SOOSZAZ12A6D4FADF8,1,Sanctify Yourself,Glittering Prize 81/92,Simple Minds,1985,Sanctify Yourself - Simple Minds
5316,ffadf9297a99945c0513cd87939d91d8b602936b,SOWDJEJ12A8C1339FE,4,Heart Cooks Brain,Everything Is Nice: The Matador Records 10th A...,Modest Mouse,1997,Heart Cooks Brain - Modest Mouse
356,5a905f000fc1ff3df7ca807d57edb608863db05d,SOAMPRJ12A8AE45F38,20,Rorol,Identification Parade,Octopus Project,2002,Rorol - Octopus Project


# Q5. Build Popularity Recommender model. (Non-personalised)

a. Count of user_id for each unique song as recommendation score 

b. Sort the songs on recommendation score 

c. Get the top 5 recommendations

In [36]:
train_data_grouped = train_data.groupby('song_id').agg({'user_id': 'count'}).reset_index()

In [37]:
train_data_grouped.head()

Unnamed: 0,song_id,user_id
0,SOAAFAC12A67ADF7EB,2
1,SOAATLI12A8C13E319,1
2,SOAAUKC12AB017F868,1
3,SOAAVUV12AB0186646,4
4,SOAAWEE12A6D4FBEC8,2


In [38]:
train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
train_data_grouped.head()

Unnamed: 0,song_id,score
0,SOAAFAC12A67ADF7EB,2
1,SOAATLI12A8C13E319,1
2,SOAAUKC12AB017F868,1
3,SOAAVUV12AB0186646,4
4,SOAAWEE12A6D4FBEC8,2


In [39]:
#Sort the songs on recommendation score 
train_data_sort = train_data_grouped.sort_values(['score', 'song_id'], ascending = [0,1]) 
      
#Generate a recommendation rank based upon score 
train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first') 
          
#Get the top 5 recommendations 
popularity_recommendations = train_data_sort.head(5) 
popularity_recommendations 

Unnamed: 0,song_id,score,Rank
1006,SOFRQTD12A81C233C0,37,1.0
146,SOAUWYT12A81C206F1,27,2.0
166,SOAXGDH12A8C13F8A1,24,3.0
286,SOBONKR12A58A7A7E0,24,4.0
2481,SONYKOW12AB01849C9,21,5.0


# Q6. Use popularity based recommender model to make predictions and find recommendations for random list of users with inferences

In [40]:
# Use popularity based recommender model to make predictions
def recommend(user_id):     
    user_recommendations = popularity_recommendations 
          
    #Add user_id column for which the recommendations are being generated 
    user_recommendations['userID'] = user_id 
      
    #Bring user_id column to the front 
    cols = user_recommendations.columns.tolist() 
    cols = cols[-1:] + cols[:-1] 
    user_recommendations = user_recommendations[cols] 
          
    return user_recommendations 

In [41]:
find_recom = [15,121,53]   # This list is user choice.
for i in find_recom:
    print("Here is the recommendation for the userId: %d\n" %(i))
    print(recommend(i))    
    print("\n") 

Here is the recommendation for the userId: 15

      userID             song_id  score  Rank
1006      15  SOFRQTD12A81C233C0     37   1.0
146       15  SOAUWYT12A81C206F1     27   2.0
166       15  SOAXGDH12A8C13F8A1     24   3.0
286       15  SOBONKR12A58A7A7E0     24   4.0
2481      15  SONYKOW12AB01849C9     21   5.0


Here is the recommendation for the userId: 121

      userID             song_id  score  Rank
1006     121  SOFRQTD12A81C233C0     37   1.0
146      121  SOAUWYT12A81C206F1     27   2.0
166      121  SOAXGDH12A8C13F8A1     24   3.0
286      121  SOBONKR12A58A7A7E0     24   4.0
2481     121  SONYKOW12AB01849C9     21   5.0


Here is the recommendation for the userId: 53

      userID             song_id  score  Rank
1006      53  SOFRQTD12A81C233C0     37   1.0
146       53  SOAUWYT12A81C206F1     27   2.0
166       53  SOAXGDH12A8C13F8A1     24   3.0
286       53  SOBONKR12A58A7A7E0     24   4.0
2481      53  SONYKOW12AB01849C9     21   5.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Since this is a popularity-based recommender model, recommendations remain the same for all users

We predict the products based on the popularity. It is not personalized to particular user