In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Read the raw CSV tables. Columns named in a dtype mapping are kept as
# strings instead of letting pandas infer a type for them.
members_dtypes = {'city': str}
songs_dtypes = {'genre_ids': str, 'language': str}

train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
members = pd.read_csv('dataset/members.csv', dtype=members_dtypes)
songs = pd.read_csv('dataset/songs.csv', dtype=songs_dtypes)
extra = pd.read_csv('dataset/song_extra_info.csv')


In [81]:
#Reduce the number of features for the baseline models

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
songs = songs.drop(columns=['composer', 'lyricist'], errors='ignore')
members = members.drop(
    columns=['registered_via', 'registration_init_time', 'expiration_date'],
    errors='ignore',
)


In [82]:
#Replace missing values with "unknown" or infer the value based on other features

# The language column is loaded as strings, so the filled-in value must be a
# string too; writing the float 31.0 would leave mixed types in the column.
songs.loc[605127, 'language'] = '31.0'

#Create lists of genre ids in the genre id column

# Only split entries that are still pipe-delimited strings. Missing values and
# already-split lists pass through unchanged, so re-running the cell is safe
# (calling .str.split on lists would turn every entry into NaN).
songs['genre_ids'] = songs['genre_ids'].map(
    lambda ids: ids.split('|') if isinstance(ids, str) else ids
)


In [None]:
#Create a dataframe that stores genre IDs across multiple columns (one genre per column)

# Each list in genre_ids is expanded into its own row of columns; add_prefix
# names them genre_0, genre_1, ... Songs with fewer genres get NaN padding.
genres = songs['genre_ids'].apply(pd.Series).add_prefix('genre_')
# Write next to the other data files: the following cell reloads the table
# from 'dataset/genres.csv'.
genres.to_csv('dataset/genres.csv')


In [85]:
# Reload the saved genre table, reading every column as a string.
genres = pd.read_csv(
    'dataset/genres.csv',
    dtype=str,
)

In [20]:
# Rows whose genre_ids entry has a length greater than two.
genre_ids_length = songs['genre_ids'].str.len()
songs.loc[genre_ids_length.gt(2)]

Unnamed: 0,song_id,song_length,genre_ids,artist_name,language
5,kKJ2JNU5h8rphyW21ovC+RZU+yEHPM+3w85J37p7vEQ=,235520,"[864, 857, 850, 843]",貴族精選,17.0
111,Dm3IIPOFbrvJp5yuJIlZI0pfdWyMAB0F9b2J6rIW0bA=,194272,"[139, 125, 109]",Solange,52.0
392,YmLP0tRXZsdlgW9my+EA3k5qWavSN8DBtt9mOEmQ9M0=,60511,"[864, 857, 850, 843]",軒轅劍 電玩原聲帶,-1.0
533,g0jP5lxmTYmWc0spF4/KEbNy0RxKsMXHZ4kDKPUaZTY=,241673,"[139, 125, 109]",Donell Jones,52.0
598,HhXTTJEnv4oNADxVvwzmTzjvFy8ZVQFDIFUFLLoaLQQ=,267912,"[786, 2086, 374]",洪億展,-1.0
634,kR+iNjotdUty+3L9jxpIgIXvL6ZTmpvLexxlht9smBY=,110294,"[864, 857, 850, 843]",軒轅劍 電玩原聲帶,-1.0
682,anJ+jo1TFI3eOFErE/BSbVr3izwYpCF+LeDMLQMM7/Y=,270791,"[139, 125, 109]",Whitney Houston,52.0
702,acUENdvvZWfRZRTsg4FuK/egicr9kpOAeR4dA1IAoGQ=,210604,"[139, 125, 109]",John Legend,52.0
716,P+kcPEzuzWgXwaL71CTcI2fAFauIZ/Eu4pLsEK4HZUE=,369162,"[139, 125, 109]",MISIA,17.0
846,Z3R13SqM+GN6L0+t12E2EXK29XQ9dcTRSoRrla5Egsc=,285419,"[139, 125, 109]",InstaHit Crew,52.0


In [51]:

# One-hot encode the first genre column (one indicator column per distinct value).
first_genre = genres['genre_0']
pd.get_dummies(first_genre)

Unnamed: 0,1000,1007,1011,1019,1026,1033,1040,1047,1054,1061,...,94,940,947,95,958,965,972,979,986,993
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Display the genres frame (one genre_N column per genre ID of a song).
genres


Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,465,,,,,,,
1,444,,,,,,,
2,465,,,,,,,
3,465,,,,,,,
4,726,,,,,,,
5,864,857,850,843,,,,
6,458,,,,,,,
7,465,,,,,,,
8,465,,,,,,,
9,352,1995,,,,,,


In [100]:
#Load feature columns into Tensorflow


def _vocabulary(values):
    """Return the distinct, non-missing entries of ``values`` as a list of strings.

    Missing values are dropped and everything else is cast to ``str`` so the
    resulting vocabulary is a clean, duplicate-free list usable with
    ``dtype=tf.string``.
    """
    return pd.Series(values).dropna().astype(str).unique().tolist()


def _vocabulary_column(key, values):
    """Build a string categorical column for ``key`` from the entries of ``values``.

    ``default_value`` of a vocabulary-list column must be an integer ID, so a
    string such as 'unknown' is not valid there. A single out-of-vocabulary
    bucket is used instead, which acts as the "unknown" category.
    """
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=_vocabulary(values),
        dtype=tf.string,
        num_oov_buckets=1)


target = tf.feature_column.categorical_column_with_identity(key='target', num_buckets=2)

gender = _vocabulary_column('gender', ('female', 'male'))
city = _vocabulary_column('city', members['city'])
language = _vocabulary_column('language', songs['language'])
artist = _vocabulary_column('artist_name', songs['artist_name'])
tab = _vocabulary_column('source_system_tab', train['source_system_tab'])
screen = _vocabulary_column('source_screen_name', train['source_screen_name'])
source_type = _vocabulary_column('source_type', train['source_type'])

length = tf.feature_column.numeric_column(key='song_length',
                                          default_value=-1,
                                          dtype=tf.int32)

#Bucket member age into age ranges. Negative values fall into the bucket below 0,
#values of 80 and above into the last bucket; a missing age defaults to 0.

age = tf.feature_column.numeric_column(key='bd',
                                       default_value=0,
                                       dtype=tf.int32)

age_bucket = tf.feature_column.bucketized_column(age, boundaries=[0, 14, 20, 30, 40, 50, 80])

#Bucket the following features using a hash table with a size of approximately (n/0.8)*2

msno = tf.feature_column.categorical_column_with_hash_bucket(key='msno',
                                                             hash_bucket_size=90000,
                                                             dtype=tf.string)

song_id = tf.feature_column.categorical_column_with_hash_bucket(key='song_id',
                                                                hash_bucket_size=6000000,
                                                                dtype=tf.string)

# The genre_ids feature can hold several genres per song, so the vocabulary is
# collected from every genre_N column rather than from genre_0 alone.
genre = _vocabulary_column(
    'genre_ids',
    genres.filter(regex=r'^genre_\d+$').to_numpy().ravel())

indicator_genre = tf.feature_column.indicator_column(genre)
embedded_genre = tf.feature_column.embedding_column(genre, dimension=10)
hashed_genre = tf.feature_column.categorical_column_with_hash_bucket(key='genre_ids',
                                                                     hash_bucket_size=3000,
                                                                     dtype=tf.string)


In [None]:
# Placeholder for crossed (interaction) features; nothing is appended to it in this cell.
interactions = []
# Candidate crosses (the original list named "genre x age bucket" twice; it is listed once here):
#   age bucket x city
#   age bucket x artist
#   gender x age bucket
#   genre x age bucket
#   language x age bucket
#   language x city
#   genre x city
#   genre x gender


# NOTE: the bare string below is an expression statement, not a comment; it has
# no effect on any variable.
"""create new features:
how many songs has a user listened to in the dataset?
create # of songs user listened to feature (continuous)
average length of songs listened to by the user
number of different languages a user listens to music in
number of different genres a user listens to
"""

In [103]:
# Display the songs frame.
# NOTE(review): the output recorded under this cell still lists the composer and
# lyricist columns and shows pipe-delimited genre_ids, so it appears to have been
# captured from a freshly loaded frame rather than after the cleaning cells -- confirm.
songs

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0
5,kKJ2JNU5h8rphyW21ovC+RZU+yEHPM+3w85J37p7vEQ=,235520,864|857|850|843,貴族精選,Joe Hisaishi,Hayao Miyazaki,17.0
6,N9vbanw7BSMoUgdfJlgX1aZPE1XZg8OS1wf88AQEcMc=,226220,458,伍佰 & China Blue,Jonathan Lee,,3.0
7,GsCpr618xfveHYJdo+E5SybrpR906tsjLMeKyrCNw8s=,276793,465,光良 (Michael Wong),光良,彭資閔,3.0
8,oTi7oINPX+rxoGp+3O6llSltQTl80jDqHoULfRoLcG4=,228623,465,林俊傑 (JJ Lin),JJ Lin,Wu Qing Feng,3.0
9,btcG03OHY3GNKWccPP0auvtSbhxog/kllIIOx5grE/k=,232629,352|1995,Kodaline,Stephen Garrigan| Mark Prendergast| Vincent Ma...,Stephen Garrigan| Mark Prendergast| Vincent Ma...,52.0
