# Joining both dataframes

## 1. Import libraries and set  options for visualization:

In [1]:
import numpy as np
import pandas as pd
import os

# Setting PATH
PATH = os.getcwd()

# Setting options for DataFrame visualization:
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 122)

## 2. Import both dataframes

In [2]:
sp_songs = pd.read_csv('%s/output/spotify_songs_filtered.csv'%PATH)

In [3]:
wp_hits = pd.read_csv('%s/output/wiki_hits_spain.csv'%PATH)

## 3. Exploring and preparing df for joining

In [4]:
sp_songs.head()

Unnamed: 0,track_popularity,track_name,track_artists,track_album_name,track_href,track_album_href,track_id,track_album_id,track_album_release_date,track_type,track_album_type,track_is_local,owner_id
0,62.0,E.T.,['Katy Perry'],Katy Perry - Teenage Dream: The Complete Confe...,https://api.spotify.com/v1/tracks/4z8ssgZfs5Tz...,https://api.spotify.com/v1/albums/5BvgP623rtvl...,4z8ssgZfs5TzKiO8HaGWXO,5BvgP623rtvlc0HDcpzquz,2012-03-12,track,album,False,angelus_
1,66.0,Teenage Dream,['Katy Perry'],Katy Perry - Teenage Dream: The Complete Confe...,https://api.spotify.com/v1/tracks/55qBw1900pZK...,https://api.spotify.com/v1/albums/5BvgP623rtvl...,55qBw1900pZKfXJ6Q9A2Lc,5BvgP623rtvlc0HDcpzquz,2012-03-12,track,album,False,angelus_
2,70.0,Firework,['Katy Perry'],Katy Perry - Teenage Dream: The Complete Confe...,https://api.spotify.com/v1/tracks/4lCv7b86sLyn...,https://api.spotify.com/v1/albums/5BvgP623rtvl...,4lCv7b86sLynZbXhfScfm2,5BvgP623rtvlc0HDcpzquz,2012-03-12,track,album,False,angelus_
3,15.0,Roar,['Katy Perry'],PRISM (Deluxe),https://api.spotify.com/v1/tracks/3bDGwl0X3EjQ...,https://api.spotify.com/v1/albums/4lFDt4sVpCni...,3bDGwl0X3EjQmIyFD1uif5,4lFDt4sVpCni9DRHRmDjgG,2013-01-01,track,album,False,angelus_
4,0.0,Legendary Lovers,['Katy Perry'],PRISM (Deluxe),https://api.spotify.com/v1/tracks/4kkDQNeYyLe9...,https://api.spotify.com/v1/albums/4lFDt4sVpCni...,4kkDQNeYyLe9MxFAncbkoa,4lFDt4sVpCni9DRHRmDjgG,2013-01-01,track,album,False,angelus_


In [5]:
sp_songs.shape

(10659, 13)

In [6]:
wp_hits.head()

Unnamed: 0,year,week,date,track,artists,url
0,2011,1,1 de enero,Loca,"['Shakira', 'El Cata']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
1,2011,2,8 de enero,Barbie de extrarradio,['Melendi'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
2,2011,3,15 de enero,Love the Way You Lie,"['Eminem', 'Rihanna']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
3,2011,4,22 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
4,2011,5,29 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...


In [7]:
wp_hits.shape

(447, 6)

### Dropping some of the columns of sp_songs:

In [8]:
sp_songs = sp_songs[['track_popularity', 'track_name', 'track_artists', 'track_album_name', 
                    'track_href', 'track_album_release_date', 'owner_id']]

In [9]:
sp_songs.shape

(10659, 7)

### Joining both dataframes on basis of song names:

#### An inner join is performed, to check for the tracks that are inside the playlists and in the hit lists:

In [10]:
# left_on and right_on are used because the name of the columns are different:
common_df = pd.merge(wp_hits,
                    sp_songs,
                    left_on='track',
                    right_on='track_name',
                    how='inner')

In [11]:
common_df.head()

Unnamed: 0,year,week,date,track,artists,url,track_popularity,track_name,track_artists,track_album_name,track_href,track_album_release_date,owner_id
0,2011,4,22 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,2.0,The Time (Dirty Bit),['The Black Eyed Peas'],The Time (Dirty Bit),https://api.spotify.com/v1/tracks/34q1KaLX8h73...,2010-01-01,angelus_
1,2011,5,29 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,2.0,The Time (Dirty Bit),['The Black Eyed Peas'],The Time (Dirty Bit),https://api.spotify.com/v1/tracks/34q1KaLX8h73...,2010-01-01,angelus_
2,2011,6,5 de febrero,Hold My Hand,"['Michael Jackson', 'Akon']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,57.0,Hold My Hand,['Jess Glynne'],I Cry When I Laugh,https://api.spotify.com/v1/tracks/1eOJAiCKFuMd...,2013,42416345k
3,2011,7,12 de febrero,Hold My Hand,"['Michael Jackson', 'Akon']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,57.0,Hold My Hand,['Jess Glynne'],I Cry When I Laugh,https://api.spotify.com/v1/tracks/1eOJAiCKFuMd...,2013,42416345k
4,2011,15,9 de abril,Born This Way,['Lady Gaga'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,46.0,Born This Way,['Lady Gaga'],Born This Way (International Standard Version),https://api.spotify.com/v1/tracks/209NkbzmsUa9...,2011-01-01,angelus_


In [12]:
common_df.shape

(261, 13)

As the merge was performed using only the track name, now we have to check for all those entries with different artists, in order to manipulate the data accordingly (modify or drop):

In [13]:
differences = common_df.loc[common_df.artists != common_df.track_artists]
differences.shape

(121, 13)

In [14]:
differences.head()

Unnamed: 0,year,week,date,track,artists,url,track_popularity,track_name,track_artists,track_album_name,track_href,track_album_release_date,owner_id
2,2011,6,5 de febrero,Hold My Hand,"['Michael Jackson', 'Akon']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,57.0,Hold My Hand,['Jess Glynne'],I Cry When I Laugh,https://api.spotify.com/v1/tracks/1eOJAiCKFuMd...,2013,42416345k
3,2011,7,12 de febrero,Hold My Hand,"['Michael Jackson', 'Akon']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,57.0,Hold My Hand,['Jess Glynne'],I Cry When I Laugh,https://api.spotify.com/v1/tracks/1eOJAiCKFuMd...,2013,42416345k
22,2011,51,17 de diciembre,Titanium,"['David Guetta', 'Sia']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,46.0,Titanium,['Jasmine Thompson'],Bundle of Tantrums,https://api.spotify.com/v1/tracks/406dEJjjP5f5...,2013-09-06,dannig012
23,2011,52,24 de diciembre,Titanium,"['David Guetta', 'Sia']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,46.0,Titanium,['Jasmine Thompson'],Bundle of Tantrums,https://api.spotify.com/v1/tracks/406dEJjjP5f5...,2013-09-06,dannig012
26,2012,24,16 de junio,Somebody That I Used to Know,"['Gotye', 'Kimbra']",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,39.0,Somebody That I Used to Know,['Walk Off the Earth'],Somebody That I Used to Know,https://api.spotify.com/v1/tracks/0VVMuMXtkASP...,2012-02-06,angelus_


In [15]:
# Only these indexes share the same artist, the rest are really different songs/versions:
ind = [63, 74, 75, 207, 208]
for i in ind:
    common_df['artists'].iloc[i] = common_df['track_artists'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
common_df[['artists', 'track_artists']].iloc[63]

artists          ['P!nk']
track_artists    ['P!nk']
Name: 63, dtype: object

In [17]:
# Checking that the substitutions were correctly made:
differences = common_df.loc[common_df.artists != common_df.track_artists]
differences.shape

# Making a definitive dataframe where only the coincidental tracks and artists are included:
final_df = common_df.loc[common_df.artists == common_df.track_artists]
final_df.shape

(143, 13)

## 4. Exploring and exporting final dataframe

In [18]:
final_df.head()

Unnamed: 0,year,week,date,track,artists,url,track_popularity,track_name,track_artists,track_album_name,track_href,track_album_release_date,owner_id
0,2011,4,22 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,2.0,The Time (Dirty Bit),['The Black Eyed Peas'],The Time (Dirty Bit),https://api.spotify.com/v1/tracks/34q1KaLX8h73...,2010-01-01,angelus_
1,2011,5,29 de enero,The Time (Dirty Bit),['The Black Eyed Peas'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,2.0,The Time (Dirty Bit),['The Black Eyed Peas'],The Time (Dirty Bit),https://api.spotify.com/v1/tracks/34q1KaLX8h73...,2010-01-01,angelus_
4,2011,15,9 de abril,Born This Way,['Lady Gaga'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,46.0,Born This Way,['Lady Gaga'],Born This Way (International Standard Version),https://api.spotify.com/v1/tracks/209NkbzmsUa9...,2011-01-01,angelus_
5,2011,15,9 de abril,Born This Way,['Lady Gaga'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,68.0,Born This Way,['Lady Gaga'],Born This Way (International Special Edition V...,https://api.spotify.com/v1/tracks/30XU4suKzCeo...,2011-01-01,dannig012
6,2011,15,9 de abril,Born This Way,['Lady Gaga'],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...,6.0,Born This Way,['Lady Gaga'],Born This Way,https://api.spotify.com/v1/tracks/6wAdqKcL7uES...,2011-01-01,popersea


In [19]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 0 to 243
Data columns (total 13 columns):
year                        143 non-null int64
week                        143 non-null int64
date                        143 non-null object
track                       143 non-null object
artists                     143 non-null object
url                         143 non-null object
track_popularity            143 non-null float64
track_name                  143 non-null object
track_artists               143 non-null object
track_album_name            143 non-null object
track_href                  143 non-null object
track_album_release_date    143 non-null object
owner_id                    143 non-null object
dtypes: float64(1), int64(2), object(10)
memory usage: 15.6+ KB


### Top 5 people according to amount of #1's in their playlists:

In [20]:
final_df.groupby('owner_id')['track'].count().sort_values(ascending=False)[:5]

owner_id
dannig012       39
angelus_        22
elsapato        15
nestorviolin    14
aechos          10
Name: track, dtype: int64

### Exporting dataframe

In [21]:
final_df_export = final_df.to_csv('%s/output/final_df.csv'%PATH, index=False)