# Preprocessing data

## Setup

Проводим все нужные import'ы и объявления вспомогательных функций, запустив код из `Setup.ipynb`.

Код объемный и не хочется засорять основные IPython-ноутбуки ячейками, которые будут замыливать глаз и мешать. Будем соблюдать модульность. Тем более IPython-ноутбук с настройками можно дополнять по мере необходимости, а также использовать в других проектах.

In [1]:
run Setup.ipynb

Imports done!


## Persons

Считаем данные об артистах(авторах). В этих данных у нас имеется 5 столбцов.

In [2]:
persons_path = '../Data/entities/persons.idomaar'
persons_df = pd.read_csv(persons_path, sep='\t', header=None)

print(persons_df.shape)
persons_df.head()

(595049, 5)


Unnamed: 0,0,1,2,3,4
0,person,145148,-1,"{""MBID"":null, ""name"":""Everything+Is+Illuminated""}",{}
1,person,297899,-1,"{""MBID"":null, ""name"":""Robin+O%27Brien""}",{}
2,person,250429,-1,"{""MBID"":null, ""name"":""Nicholas+Gunn++(2012)""}",{}
3,person,32765,-1,"{""MBID"":null, ""name"":""Aspasia+Stratigou""}",{}
4,person,18689,-1,"{""MBID"":null, ""name"":""Allison+Veltz""}",{}


In [4]:
print(persons_df[0].unique())
print(persons_df[2].unique())
print(persons_df[4].unique())

['person']
[-1]
['{}']


Столбцы `0`, `2`, `4` не несут полезной информации, т.к. для всех записей таблицы они имеют одно и тоже значение. Поэтому от них мы сможем избавиться.

Столбец `1` является полем `id`, т.е. уникальным номером исполнителя

Столбец `3` представляет наибольшую ценность, в нем содержится информация об `MBID` и `name` данного автора. Этот столбец мы и разберем, создав новые колонки. 

In [8]:
persons_df['MBID'] = persons_df[3].apply(get_attr, attr='MBID')

persons_df['artist_name'] = persons_df[3].apply(get_attr, attr='name')
persons_df['artist_name'] = persons_df['artist_name'].apply(unquote)

persons_df.rename({1:'artist_id'}, axis=1, inplace=True)

persons_df.drop([0, 2, 3, 4], axis=1, inplace=True)
persons_df = persons_df.sample(frac=1.0).reset_index(drop=True)

print(persons_df.shape)
persons_df.head()

(595049, 3)


Unnamed: 0,artist_id,MBID,artist_name
0,208236,d61e77cc-80b3-4586-bf4e-91fe7f13165e,Luciano+Berio
1,332384,,Starpool
2,50834,,Black+Salvation
3,383635,,Tross
4,404407,a0813d18-e89f-48b5-a9d7-2315d59917c2,Wooh+Da+Kid


В итоге получаем датасет из более чем 500 тысяч исполнителей

## Tracks

In [4]:
# Reading raw tracks data from 30Music

tracks_path = '../Data/entities/tracks.idomaar'
tracks_df = pd.read_csv(tracks_path, sep='\t', header=None)
tracks_df = tracks_df.iloc[:, 1:].drop_duplicates() # Removing duplicates (1.1 million)

print(f'DataFrame shape: {tracks_df.shape}')
tracks_df.head()

DataFrame shape: (4544646, 4)


Unnamed: 0,1,2,3,4
0,0,-1,"{""duration"":-1,""playcount"":4,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":0}],""albums""..."
1,1,-1,"{""duration"":-1,""playcount"":495,""MBID"":null,""na...","{""artists"":[{""type"":""person"",""id"":1}],""albums""..."
2,2,-1,"{""duration"":-1,""playcount"":2,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":2}],""albums""..."
3,3,-1,"{""duration"":-1,""playcount"":2,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":3}],""albums""..."
4,4,-1,"{""duration"":-1,""playcount"":1,""MBID"":null,""name...","{""artists"":[{""type"":""person"",""id"":4}],""albums""..."


Convert the rubbish above from `3` and `4` columns using json to some Python-like data

In [4]:
tracks_df['playcount'] = tracks_df[3].apply(get_attr, attr='playcount').fillna(0)
tracks_df['MBID'] = tracks_df[3].apply(get_attr, attr='MBID')
tracks_df['artist'] = tracks_df[4].apply(get_array_attr, attr='artists')
tracks_df['artist'] = tracks_df['artist'].apply(lambda a: a[0])
tracks_df.rename({1: 'track_id'}, axis=1,inplace=True)

tracks_df.drop([2, 3, 4], axis=1, inplace=True) # remove redundant columns
tracks_df = tracks_df.sample(frac=1.0).reset_index(drop=True)

print(tracks_df.shape)
tracks_df.head()

NameError: name 'tracks_df' is not defined

Let's see what happens with artists distribution

In [3]:
artists_counts = tracks_df['artist'].value_counts()
artists_counts_names = [get_artist_name(id_, persons_df)
                        for id_ in artists_counts.index[:10]]

print('№', 'Artist', 'Count', sep='\t')
print(*zip(artists_counts.index[:10], artists_counts_names, artists_counts[:10]), sep='\n')

NameError: name 'tracks_df' is not defined

In [2]:
import pprint

<div class="alert alert-warning">

As we can see, `129` is just a `Original Soundtrack` we can treat it as a placeholder, it means that we can just pop it, because it tells us nothing about track.

</div>

In [7]:
artists_ids_to_stay = artists_counts[artists_counts >= 100].index[:1999]
tracks_df = tracks_df[tracks_df['artist'].isin(artists_ids_to_stay)]

persons_df = persons_df[persons_df['artist_id'].isin(artists_ids_to_stay)]

new_artists_counts = tracks_df['artist'].value_counts()

print(f'Number of unique authors: {tracks_df.artist.nunique()}')
print(f'Shape of track DataFrame: {tracks_df.shape}\n')
print(new_artists_counts[:10])

Number of unique authors: 1999
Shape of track DataFrame: (899799, 4)

37589     7109
240350    5397
76474     4033
276125    3402
43125     3384
56683     3357
140693    2949
42398     2939
110337    2909
27650     2596
Name: artist, dtype: int64


In [None]:
track_to_artist = {tracks_df.iloc[i, 0]: tracks_df.iloc[i, 3]
                   for i in tqdm_notebook(range(len(tracks_df)))}

with open('data/track_to_artist.pickle', 'wb') as f:
    pickle.dump(track_to_artist, f)

HBox(children=(IntProgress(value=0, max=899799), HTML(value='')))

In [None]:
tracks_df.to_csv('data/tracks.csv')
persons_df.to_csv('data/persons.csv')

del tracks_df
del persons_df


print('File persons.idomaar processed successfully!')
print('File tracks.idomaar processed successfully!')

## Playlists

In [22]:
playlist_path = '../Data/entities/playlist.idomaar'
playlist_df = pd.read_csv(playlist_path, sep='\t', header=None)

print(playlist_df.shape)
playlist_df.head()

(57561, 5)


Unnamed: 0,0,1,2,3,4
0,playlist,0,1216545588,"{""ID"":2973549,""Title"":""my_favorites"",""numtrack...","{""subjects"":[{""type"":""user"",""id"":41504}],""obje..."
1,playlist,1,1249326867,"{""ID"":5429703,""Title"":"""",""numtracks"":9,""durati...","{""subjects"":[{""type"":""user"",""id"":41504}],""obje..."
2,playlist,2,1257766688,"{""ID"":5926742,""Title"":""1989"",""numtracks"":16,""d...","{""subjects"":[{""type"":""user"",""id"":44542}],""obje..."
3,playlist,3,1248079275,"{""ID"":5353183,""Title"":""Fly me to the moon..."",...","{""subjects"":[{""type"":""user"",""id"":44542}],""obje..."
4,playlist,4,1175201268,"{""ID"":330569,""Title"":""The playlist who must no...","{""subjects"":[{""type"":""user"",""id"":44542}],""obje..."


In [23]:
print(playlist_df.iloc[0, 3], end='\n\n')
print(json.loads(playlist_df.iloc[0, 4]))

{"ID":2973549,"Title":"my_favorites","numtracks":27,"duration":6522}

{'subjects': [{'type': 'user', 'id': 41504}], 'objects': [{'type': 'track', 'id': 3006631}, {'type': 'track', 'id': 1885124}, {'type': 'track', 'id': 2548942}, {'type': 'track', 'id': 1350486}, {'type': 'track', 'id': 3734368}, {'type': 'track', 'id': 3559010}, {'type': 'track', 'id': 1843196}, {'type': 'track', 'id': 1480428}, {'type': 'track', 'id': 3287322}, {'type': 'track', 'id': 2289919}, {'type': 'track', 'id': 2279338}, {'type': 'track', 'id': 2279191}, {'type': 'track', 'id': 341161}, {'type': 'track', 'id': 2949622}, {'type': 'track', 'id': 1210794}, {'type': 'track', 'id': 3221713}, {'type': 'track', 'id': 2288588}, {'type': 'track', 'id': 2983307}, {'type': 'track', 'id': 3579068}, {'type': 'track', 'id': 228904}, {'type': 'track', 'id': 3427000}, {'type': 'track', 'id': 1854456}, {'type': 'track', 'id': 2522599}, {'type': 'track', 'id': 2805550}, {'type': 'track', 'id': 2949054}, {'type': 'track', 'id': 

In [24]:
playlist_df['tracks_ids'] = playlist_df[4].apply(get_array_attr, attr='objects')
playlist_df = playlist_df.where(playlist_df['tracks_ids'].apply(len) > 4).dropna()

playlist_df.drop([0, 1, 2, 3, 4], axis=1, inplace=True)
playlist_df = playlist_df.sample(frac=1.0).reset_index(drop=True)

print(playlist_df.shape)
playlist_df.head()

(39527, 1)


Unnamed: 0,tracks_ids
0,"[392743, 1367698, 3842061, 3738843, 429299, 17..."
1,"[1545373, 3674231, 4653038, 164951, 1931409, 4..."
2,"[1455561, 2991484, 1688389, 2448002, 412025, 3..."
3,"[3245000, 2182616, 1444718, 901521, 2698372, 6..."
4,"[2410569, 826164, 976247, 825195, 1074887, 239..."


**Move to author**

In [30]:
playlist_df['artists_ids'] = playlist_df['tracks_ids'].apply(track_array_to_artists)
playlist_df.drop(['tracks_ids'], axis=1, inplace=True)

print(playlist_df.shape)
playlist_df.head()

(39527, 1)


Unnamed: 0,artists_ids
0,"[48160, 309884, 156962, 237219, 84804, 255908,..."
1,"[257088, 31010, 165764, 223368, 345801, 21612,..."
2,"[447521, 306507, 50476, 86738, 211702, 370170]"
3,"[278529, 457347, 170245, 122374, 82955, 196876..."
4,"[53987, 188004, 292676, 31464, 227530, 56683, ..."


In [30]:
playlist_df.to_csv('playlist.csv')
del playlist_df

print('File playlist.idomaar processed successfully!')

## Sessions

In [69]:
sessions_path = '../Data/relations/sessions.idomaar'
sessions_df = pd.read_csv(sessions_path, sep='\t', header=None)

print(sessions_df.shape)
sessions_df.head()

(2764474, 4)


Unnamed: 0,0,1,2,3
0,event.session,287144,1390231051,"{""numtracks"":23,""playtime"":4547} {""subjects"":[..."
1,event.session,287145,1390241844,"{""numtracks"":11,""playtime"":2907} {""subjects"":[..."
2,event.session,287146,1390303249,"{""numtracks"":16,""playtime"":3191} {""subjects"":[..."
3,event.session,287147,1390481828,"{""numtracks"":5,""playtime"":1162} {""subjects"":[{..."
4,event.session,287140,1421443687,"{""numtracks"":2,""playtime"":250} {""subjects"":[{""..."


In [None]:
sessions_df[3] = sessions_df[3].apply(lambda s: s.split())

sessions_df['meta'] = sessions_df[3].apply(lambda x: x[0])
sessions_df['subjects'] = sessions_df[3].apply(lambda x: x[1])

print(sessions_df.shape)
sessions_df.head()

In [None]:
sessions_df['tracks_ids'] = sessions_df['subjects'].apply(get_array_attr, attr='objects')
sessions_df['play_ratios'] = sessions_df['subjects'].apply(get_array_attr,
                                                           attr='objects', id_='playratio')
sessions_df['play_ratios'] = sessions_df['play_ratios'].apply(replace_none, replacer=1.0)
sessions_df['artists_ids'] = sessions_df['tracks_ids'].apply(track_array_to_artists)
sessions_df['user'] = sessions_df['subjects'].apply(get_array_attr, attr='subjects')
sessions_df['user'] = sessions_df['user'].apply(lambda x: x[0])

sessions_df['numtracks'] = sessions_df['meta'].apply(get_attr, attr='numtracks')
sessions_df['playtime'] = sessions_df['meta'].apply(get_attr, attr='playtime')

sessions_df = sessions_df.where(sessions_df['numtracks'] > 4).dropna()

sessions_df.drop([0, 1, 2, 3, 'meta', 'subjects'], axis=1, inplace=True)
sessions_df = sessions_df.sample(frac=1.0).reset_index(drop=True)

print(sessions_df.shape)
sessions_df.head()

In [10]:
sessions_df.to_csv('sessions.csv')
del sessions_df

print('File sessions.idomaar processed successfully!')

File sessions.idomaar processed successfully!
