In [2]:
import pandas as pd
import numpy as np
import names

import random 
import string

In [3]:
def generate_id(length=28):
    """Return a random identifier made of lowercase ASCII letters and digits.

    Args:
        length: Number of characters to generate. Defaults to 28, the length
            that used to be hard-coded.

    Returns:
        A string of ``length`` characters drawn with replacement from
        ``string.ascii_lowercase + string.digits``.

    Raises:
        ValueError: If ``length`` is negative.

    Note:
        The characters come from the ``random`` module, so the result is
        reproducible under ``random.seed`` and must not be used as a secret.
    """
    if length < 0:
        raise ValueError("length must be non-negative")
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choices(alphabet, k=length))

def _is_whole_number_column(column):
    """Return True when every value of ``column`` is an exact integer that fits in int64."""
    if not pd.api.types.is_float_dtype(column.dtype):
        # Integer and boolean dtypes can only hold whole numbers.
        return True
    values = column.to_numpy(dtype=np.float64)
    if not np.isfinite(values).all():
        return False
    int64_limits = np.iinfo(np.int64)
    if values.size and (values.min() <= int64_limits.min or values.max() >= int64_limits.max):
        # Too large to round-trip through an integer dtype; keep it floating point.
        return False
    # Element-wise check: summing signed residuals would let +0.5 and -0.5 cancel out.
    return bool((values == np.trunc(values)).all())


def _smallest_integer_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range contains [mn, mx], or None."""
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= mn and mx <= limits.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    Boolean, integer and float columns are processed; every other dtype is
    left untouched. Missing values are replaced by ``column minimum - 1`` so
    that the column can be stored as an integer. Columns holding only whole
    numbers are cast to the narrowest (un)signed integer dtype that contains
    their values; all remaining float columns are cast to ``float32``.

    Args:
        props: DataFrame to shrink. It is modified in place.

    Returns:
        A ``(props, NAlist)`` tuple: the same DataFrame object, and the list
        of column labels whose missing values were filled.

    Notes:
        * The value range is measured *after* the sentinel is filled in, so a
          non-negative column with missing values becomes signed instead of
          wrapping the ``-1`` sentinel around in an unsigned dtype.
        * Columns that are empty or entirely missing are left unchanged,
          because no sentinel can be derived for them.
        * Memory usage before and after is printed to stdout.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        column = props[col]
        dtype = column.dtype
        if not (
            pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_integer_dtype(dtype)
            or pd.api.types.is_float_dtype(dtype)
        ):
            continue  # strings, datetimes, categoricals, ... are not downcast

        mn = column.min()
        if pd.isna(mn):
            continue  # empty or all-missing column

        # Integer does not support NA, therefore, NA needs to be filled.
        # Assign the result back instead of using a chained in-place fillna,
        # which does not reach the parent frame under copy-on-write.
        if column.isna().any():
            NAlist.append(col)
            column = column.fillna(mn - 1)

        if _is_whole_number_column(column):
            target = _smallest_integer_dtype(int(column.min()), int(column.max()))
            props[col] = column if target is None else column.astype(target)
        else:
            # Make float datatypes 32 bit
            props[col] = column.astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist


In [5]:
# Read the track table from data.csv, then downcast its numeric columns.
# reduce_mem_usage assigns the converted columns back onto the frame it is
# given and returns that frame together with its NAlist (columns whose
# missing values were filled).
df = pd.read_csv("data.csv")
df, nalist = reduce_mem_usage(df)

Memory usage of properties dataframe is : 212.425950050354  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  119.41788482666016  MB
This is  56.21624137651404 % of the initial size


In [6]:
# Preview the first rows of the frame after downcasting.
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,0,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.905998,210133,4,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,1,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,0,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.748993,298893,4,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,1,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.751999,213640,4,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,0,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.058998,205600,4,1999,1999-11-02


In [7]:
# Show the frame's column labels.
df.columns

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [8]:
# Attach a synthetic owner (id + name) to every track.
N_FAKE_USERS = 1000
SEED = 42

# Seed the generators so a fresh Restart & Run All produces the same assignment.
# NOTE(review): `names` is presumed to draw from the stdlib `random` module,
# which is why that generator is seeded too -- confirm.
random.seed(SEED)
rng = np.random.default_rng(SEED)

fake_names = [names.get_full_name() for _ in range(N_FAKE_USERS)]
fake_ids = list(range(N_FAKE_USERS))

df["user_id"] = rng.choice(fake_ids, size=len(df))
# user ids double as positions in fake_names, so one vectorised take replaces
# the per-row lambda lookup.
df["user_name"] = np.asarray(fake_names, dtype=object)[df["user_id"].to_numpy()]

In [None]:
# create a knn model that can recommend users to other users based on the song features
from sklearn.neighbors import NearestNeighbors

# Select the audio features by name rather than by position (previously
# df.iloc[:, 9:20]), so the selection cannot silently shift when a column is
# added, dropped or reordered upstream.
FEATURE_COLUMNS = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]
song_features = df[FEATURE_COLUMNS]

# create a model
# NOTE(review): the features are used unscaled and the recorded cosine
# distances are all around 1e-6; large-valued columns such as tempo may be
# dominating the metric -- consider standardising the features first.
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(song_features)

In [None]:
# Feature rows of every track assigned to user 0.
user_features = df.loc[
    df["user_id"] == 0,
    [
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
    ],
]

# get nearest neighbors users
distances, indices = model.kneighbors(user_features, n_neighbors=5)

# Pair each query row's neighbour indices with the matching distances, order
# the pairs by their distance lists, then reverse the order and drop the pair
# that sorted first.
# NOTE(review): `[:0:-1]` discards a whole query row's result, not the
# self-match inside each row -- confirm this is what is wanted.
raw_recommends = sorted(
    zip(indices.squeeze().tolist(), distances.squeeze().tolist()),
    key=lambda pair: pair[1],
)[:0:-1]

In [42]:
from pprint import pprint
# Keep only the first 20 entries of raw_recommends for inspection.
recs = raw_recommends[:20]

# Each entry is an (indices, distances) pair built by the zip above.
pprint(recs)

[([768570, 208965, 1149537, 1014200, 832268],
  [1.7881393432617188e-07,
   7.152557373046875e-06,
   9.000301361083984e-06,
   9.059906005859375e-06,
   1.0192394256591797e-05]),
 ([862665, 1119018, 524964, 718224, 638627],
  [1.7881393432617188e-07,
   5.424022674560547e-06,
   6.9141387939453125e-06,
   8.046627044677734e-06,
   8.046627044677734e-06]),
 ([384790, 1041510, 56543, 1106434, 19672],
  [1.7881393432617188e-07,
   2.562999725341797e-06,
   2.6226043701171875e-06,
   3.3974647521972656e-06,
   3.4570693969726562e-06]),
 ([849457, 295418, 561946, 479794, 866606],
  [1.7881393432617188e-07,
   1.9669532775878906e-06,
   2.1457672119140625e-06,
   2.3245811462402344e-06,
   2.5033950805664062e-06]),
 ([247413, 675459, 804249, 694271, 863748],
  [1.7881393432617188e-07,
   1.3709068298339844e-06,
   2.0265579223632812e-06,
   2.384185791015625e-06,
   2.4437904357910156e-06]),
 ([505899, 168210, 752180, 312437, 1026328],
  [1.7881393432617188e-07,
   1.2516975402832031e-06,
 

In [41]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed; item order is preserved.
    """
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged


# merge the first index in tuple of all recs
row_idxs = flatten([rec[0] for rec in recs])

# Fetch the "id" value of every neighbour row with a single positional take
# instead of materialising one full row per index.
# NOTE(review): the name says user ids, but "id" appears to be the track id
# column (user ids are stored in "user_id") -- confirm which one is intended.
user_ids = set(df["id"].iloc[row_idxs])



#print('Recommendations for user 0 ')
#for i, (idx, dist) in enumerate(raw_recommends):
#    print('{0}: {1}, with distance '
#            'of {2}'.format(i+1, idx, dist))

{'4sEGyZEQKRkznbNWlbX4eT', '1OsymWyTXGnP4hRnK9uo1D', '5VoKSE88kXQQlxywMMpD18', '4YEEsSKwY8TE6OU3XJmylr', '1yBxHgW619c6V9YhJk52yZ', '0dtEAwStWUy4L1mh0f4ITe', '039jlbBLIRwLc4qYi9ebE6', '0zPhy3NsUf7hrYLqQPZDnY', '4FxsH7x3UAf9LF4sZ8Lafk', '4cyN7ryIs2ndFaIMgB4JyV', '02pRwCxVCQbB1jOYHQGaSA', '3JsMAn6uorNkrrj34SSDbB', '0KquYlxfgzdyVbKgYfDgEx', '3eTfQREx6QzkqVHAooVJH0', '5GgzVKrpw1QRFifElSttM8', '5fihSggqWM5vABDBrKxGhE', '6oG5JcSusIxE0JNejLgjAL', '5iaskvB5DrOS6v42m1Uwqc', '0bbCEbuZ7K751i7sdQbPfR', '0IdPLOtt88wY37l0iHHlyX', '1gdkzvXvKprGRjdiIW81ml', '4ZdSq4nzjlpscn8PT6dxLb', '0W0riaIxiuIp7LLyGzx8eS', '7FuRNTc4IV31HVuQBZVoK1', '38nXaLhPrTndHn5UAl0djH', '6KN0KsQOZDbcGDjwrPxBt4', '6NT0SisXl8Xfw0zPUBhuE1', '0wsyjvXeeQX3m2BJ1f0iAZ', '7wNFFvF82NiypOielyAFKy', '20taS8gd6OJdH4mRtF013G', '43xqktKoH5TTuAJntOWhti', '6RoHoRe1O0fMfnJ558ge0S', '1Y8KllH6zJ8jlfa8PUEMGX', '7tUOJ8InYT7qw5BCUfLY4M', '6ZTtnyITHMbTpVu6LbCkNw', '7EDHKZEXrdxGM4alVJmSGc', '5YVRk2QVGmitcGQkMFPNeY', '0eErPAnW4Ib29A8NPmc4ug', '6AUTQ0NV03

In [32]:
# given user_id, get username and 5 songs that the user likes
# (hand-written placeholder showing the shape of the expected payload)

user_id = 0

data = [
    {
        "user_id": 0,
        "user_name": "john",
        "songs": ["song1", "song2", "song3", "song4", "song5"],
    }
]

SyntaxError: invalid syntax (3400048881.py, line 5)

In [25]:
# Inspect the neighbour rows of the first recommendation. The row positions
# are read from `recs` instead of being pasted from an earlier output, so the
# cell stays valid when the neighbour search is re-run.
df.iloc[recs[0][0], :]

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,user_id,user_name
768570,1Z5raqP55Rf3hJW1cLPgtV,Tears for a Son - Live,Welcome to Atlanta Live 2014,3v33I8IQL13YdNzVXZ0fD5,['Seventh Wonder'],['30nUQaYp91iDcRyhr1oNn0'],7,1,0,0.295,...,8.8e-05,0.336,0.188,73.418999,110000,3,2016,2016-09-23,0,Ann Eytcheson
208965,5kb8hqeLEXspWJZ0LrJPqE,Happy Days Are Here Again,The Barbra Streisand Album: Arranged and Condu...,6lJaBPh4nMloLfWh2SQI6x,['Barbra Streisand'],['7jmTilWYlKOuavFfmQAcu6'],7,1,0,0.363,...,3e-06,0.181,0.199,73.734001,183973,3,1963,1963-02-25,902,Richard Conley
1149537,0W0riaIxiuIp7LLyGzx8eS,When Pushed From A High Branch,My Oh My Avalanche,5bFdPQUMhV8o59HUjhKBpD,['Snowblink'],['6oYtsBDA5uQzV8FdGDvXZM'],3,1,0,0.37,...,0.000431,0.157,0.177,83.0,264147,4,2006,2006-12-21,882,Cory Smith
1014200,5YVRk2QVGmitcGQkMFPNeY,In the Garden,Nature's Glory: Inspirational Hymns,1CewxgtbX1OOKwwsIHbg0V,"[""Dan Gibson's Solitudes""]",['0Wv8mXdyQifZvmp6SKXRwr'],3,1,0,0.187,...,0.0785,0.125,0.195,82.650002,217227,3,2013,2013-11-21,706,Ellen Valverde
832268,11QABJKFCiRcxfHdfXxjBF,Sarno Grabs Whitey,Prehysteria! (Original Motion Picture Soundtrack),6d1sGHCoiQ8TPLOnPYqIk6,['Richard Band'],['0TXK10MEUmsJGH4lMosMiT'],10,1,0,0.411,...,0.0129,0.0814,0.244,82.138,79187,4,1993,1993,568,Daryl Crosby


In [10]:
# Turn the cosine distances of the first query row into 0-100 scores.
# Cosine distance can range over [0, 2], so the similarity is clipped to
# [0, 1] to keep the score from going negative.
scores = np.clip(1 - distances[0], 0, 1) * 100
print(scores)

[99.999985 99.999855 99.99981  99.9998   99.99977 ]


TODO 

- compute a score for nearest users from 0 to 100
- figure out a good way to evaluate the model (e.g. by genre: neighbours that share a genre count as a good match)
- try other algorithms (SVD, NMF, etc.) and see if they work better

In [17]:
def get_nearest_users(song_features, n_neighbors=5):
    """Describe the owners of the tracks nearest to the first query row.

    Args:
        song_features: Feature rows to query the fitted ``model`` with, using
            the same columns the model was fitted on.
        n_neighbors: Number of neighbours to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of ``n_neighbors`` dicts with the keys ``"name"`` (the
        ``user_name`` of the neighbour row), ``"song1"`` (the neighbour
        track's name), ``"song2"`` and ``"score"`` (cosine similarity scaled
        and clipped to 0-100).

    Notes:
        ``"song2"`` is taken from the neighbours of the *second* query row,
        as before, and is ``None`` when only one row is passed (this used to
        raise ``IndexError``).
        NOTE(review): that track is not necessarily owned by the user in
        ``"name"`` -- confirm the intended meaning of ``song2``.
    """
    distances, indices = model.kneighbors(song_features, n_neighbors=n_neighbors)

    nearest_rows = df.iloc[indices[0]]
    # Named user_names so the imported `names` module is not shadowed.
    user_names = nearest_rows["user_name"].values
    song1 = nearest_rows["name"].values
    if len(indices) > 1:
        song2 = df.iloc[indices[1]]["name"].values
    else:
        song2 = [None] * len(song1)

    # Cosine distance can reach 2, so clip the similarity to keep scores in 0-100.
    scores = np.clip(1 - distances[0], 0, 1) * 100

    return [
        {"name": user_name, "song1": first, "song2": second, "score": score}
        for user_name, first, second, score in zip(user_names, song1, song2, scores)
    ]

print(get_nearest_users(user_features))

[{'name': 'Jennifer Rivera', 'song1': 'Chattanooga Choo Choo', 'song2': 'See Right Through You', 'score': 99.999985}, {'name': 'Phylis Westley', 'song1': 'Step to My Girl (Acoustic)', 'song2': 'You Want It All (Feat. Johnny Chimes)', 'score': 99.999855}, {'name': 'Robert Windle', 'song1': 'A Job Well Done', 'song2': 'Fuck Just To Fuck', 'score': 99.99981}, {'name': 'Angeline Cochran', 'song1': 'If You Care', 'song2': 'Mira para Arriba Mira para Abajo - En Vivo', 'score': 99.9998}, {'name': 'William Cannon', 'song1': "He's a Good Lad", 'song2': 'All the Hype', 'score': 99.99977}]
