In [11]:
import pandas as pd
import numpy as np
import names

import random 
import string

In [16]:
def generate_id():
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k = 28))

def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings

            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()

            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all():
                NAlist.append(col)
                props[col].fillna(mn - 1, inplace=True)

            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = props[col] - asint
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)

            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist


In [26]:
df = pd.read_csv("data.csv")

df, nalist = reduce_mem_usage(df)

Memory usage of properties dataframe is : 212.425950050354  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  119.41788482666016  MB
This is  56.21624137651404 % of the initial size


In [27]:
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,0,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.905998,210133,4,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,1,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,0,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.748993,298893,4,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,1,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.751999,213640,4,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,0,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.058998,205600,4,1999,1999-11-02


In [28]:
df.columns

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [30]:
fake_names = [names.get_full_name() for _ in range(1000)]
fake_ids = [i for i in range(1000)]

df["user_id"] = np.random.choice(fake_ids, size=len(df))
df["user_name"] = df["user_id"].apply(lambda x : fake_names[x])

In [4]:
# create a knn model that can recommend users to other users based on the song features

song_features = df.iloc[:, 9:20]

from sklearn.neighbors import NearestNeighbors

# create a model
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(song_features)

# get the song features for a user
user_features = df[df["user_id"] == 0][["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]]

# get nearest neighbors users
distances, indices = model.kneighbors(user_features, n_neighbors=5)

# get names of nearest users
names = df.iloc[indices[0]]["name"].values
print(names)

['Victor Glenn' 'Lucile Ramirez' 'Louis Quint' 'Johnny Granata'
 'Jacquelyn Medel']


TODO 

- compute a score for nearest users from 0 to 100
- figure out a good way to evaluate the model (use genre? if similar genre then good classification) 
- try other algorithms (SVD, NMF, etc), see if they work better