##Exploratory Data Analysis on the MXM Data

In [29]:
import pylab
%matplotlib inline
import pandas as pd
import matplotlib.cm as cm #for colormapping later
import matplotlib.pyplot as plt
import numpy as np 

In [44]:
trainfile = './mxm_data/mxm_dataset_train.txt'
testfile = './mxm_data/mxm_dataset_test.txt'

In [45]:
#Header info is in the first 17 lines for the train file (and same for test file)
ftrain = open(trainfile, 'r')
ftest = open(testfile,'r')
print('Header Info for Training Set:')
for i in range(17):
    print(ftrain.readline())
#Header info is in the first 17 lines for the test file as well
print('Header Info for Testing Set:')
for i in range(17):
    print(ftest.readline())
    
#line 18 for each one is the top words
train_top_words = ftrain.readline()
test_top_words = ftest.readline()
print('First few words of train top words : ')
print(train_top_words[0:100])
print('Should be i,the,you,to...')

print('First few words of test top words : ')
print(test_top_words[0:100])
print('Should be i,the,you,to...')  

#subsequent lines in the test and train sets should be "normal lines" 
#as described by the header text.
#read those in!
test = ftest.readlines() #this will pick up where we left off and read the remaining lines
train = ftrain.readlines() 
print('There are ', len(train), ' songs in the train set')
print('There are ', len(test), ' songs in the test set')

#since we read in all of the data, we can now close the files
ftest.close()
ftrain.close()

Header Info for Training Set:
# TRAINING SET

# MusiXmatch dataset, the official lyrics dataset

# of the Million Song Dataset

#    file created on Tue Mar 29 04:28:44 2011

#    contact: T. Bertin-Mahieux (Columbia University)

#             tb2332@columbia.edu

#    also: http://labrosa.ee.columbia.edu/millionsong/musixmatch

#          http://www.musixmatch.com

# FORMAT:

#     #   - comment, to ignore

#     %   - list of top words, comma-separated

#         - normal line, contains track_id, mxm track id,

#           then word count for each of the top words, comma-separated

#           word count is in sparse format -> ...,<word idx>:<cnt>,...

#           <word idx> starts at 1 (not zero!)

# All our work is done using UTF-8 encoding.

# enjoy!

Header Info for Testing Set:
# TESTING SET

# MusiXmatch dataset, the official lyrics dataset

# of the Million Song Dataset

#    file created on Tue Mar 29 04:28:44 2011

#    contact: T. Bertin-Mahieux (Columbia University)

#    

In [46]:
#are train_top_words and test_top_words identical? you'd think they should be
is_true = (train_top_words == test_top_words)
print('Train Top Words == Test Top Words ? : ', is_true)

Train Top Words == Test Top Words ? :  True


In [47]:
#they are they same so let's just call one of them "top words" and use that from now on:
top_words = test_top_words

In [48]:
len(top_words) #this is the number of characters, let's try to split them into words

29173

In [49]:
#notice that the first word has a % character in front of it.
#remove that character:
top_words = top_words[1:]
print('First few words of top words : ')
print(train_top_words[0:100])
print('Should be i,the,you,to...')

First few words of top words : 
%i,the,you,to,and,a,me,it,not,in,my,is,of,your,that,do,on,are,we,am,will,all,for,no,be,have,love,so,
Should be i,the,you,to...


In [50]:
#split into individual words
train_top_words = train_top_words.split(',')
test_top_words = test_top_words.split(',')
top_words = top_words.split(',')
print('First 10 words: ', top_words[0:10])

First 10 words:  ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in']


In [52]:
print('Most Common 100 words: ', top_words[0:100])

Most Common 100 words:  ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in', 'my', 'is', 'of', 'your', 'that', 'do', 'on', 'are', 'we', 'am', 'will', 'all', 'for', 'no', 'be', 'have', 'love', 'so', 'know', 'this', 'but', 'with', 'what', 'just', 'when', 'like', 'now', 'que', 'time', 'can', 'come', 'de', 'there', 'go', 'up', 'oh', 'la', 'one', 'they', 'out', 'down', 'get', 'she', 'was', 'see', 'if', 'got', 'never', 'from', 'he', 'feel', 'want', 'let', 'make', 'way', 'say', 'take', 'would', 'as', 'ca', 'day', 'at', 'babi', 'away', 'life', 'yeah', 'y', 'back', 'by', 'her', 'heart', 'here', 'how', 'could', 'night', 'need', 'our', 'look', 'where', 'en', 'eye', 'thing', 'world', 'more', 'caus', 'gonna', 'die', 'right', 'been', 'tell']


> ^^ It's strange that the word "ca" is in there.

In [53]:
# have a look at some of the song data:
print('First train data song : ')
print(train[0])

First train data song : 
TRAAAAV128F421A322,4623710,1:6,2:4,3:2,4:2,5:5,6:3,7:1,8:1,11:1,12:2,13:3,14:1,15:1,18:2,19:2,20:2,21:2,23:4,25:1,26:2,28:1,30:1,36:2,42:1,45:1,54:2,56:1,57:1,68:1,99:1,192:2,249:1,264:1,356:1,389:1,561:1,639:1,656:1,687:1,761:1,773:1,804:1,869:2,914:1,1035:1,1156:1,1221:1,1287:1,1364:1,1407:1,1533:2,1857:1,2096:1,2117:1,2482:2,2548:1,2705:1,2723:1,2868:2,2992:2,3455:1,3717:1,3851:1,4322:1,4382:1,4613:1,4713:1,4906:1



In [54]:
# we probably want to store this data in some way other than one long string for each song.
foo = train[0]
foo = foo.split(',')

In [55]:
foo[0]

'TRAAAAV128F421A322'

In [56]:
foo[1]

'4623710'

In [70]:
song_dict = {}
keys_and_vals = foo[2:]
print(keys_and_vals)
print(keys_and_vals[-1])
for entry in keys_and_vals:
    entry = entry.split(':')
    key = int(entry[0])
    val = int(entry[1])
    song_dict[key]=val

['1:6', '2:4', '3:2', '4:2', '5:5', '6:3', '7:1', '8:1', '11:1', '12:2', '13:3', '14:1', '15:1', '18:2', '19:2', '20:2', '21:2', '23:4', '25:1', '26:2', '28:1', '30:1', '36:2', '42:1', '45:1', '54:2', '56:1', '57:1', '68:1', '99:1', '192:2', '249:1', '264:1', '356:1', '389:1', '561:1', '639:1', '656:1', '687:1', '761:1', '773:1', '804:1', '869:2', '914:1', '1035:1', '1156:1', '1221:1', '1287:1', '1364:1', '1407:1', '1533:2', '1857:1', '2096:1', '2117:1', '2482:2', '2548:1', '2705:1', '2723:1', '2868:2', '2992:2', '3455:1', '3717:1', '3851:1', '4322:1', '4382:1', '4613:1', '4713:1', '4906:1\n']
4906:1



In [71]:
song_dict

{1: 6,
 2: 4,
 3: 2,
 4: 2,
 5: 5,
 6: 3,
 7: 1,
 8: 1,
 11: 1,
 12: 2,
 13: 3,
 14: 1,
 15: 1,
 18: 2,
 19: 2,
 20: 2,
 21: 2,
 23: 4,
 25: 1,
 26: 2,
 28: 1,
 30: 1,
 36: 2,
 42: 1,
 45: 1,
 54: 2,
 56: 1,
 57: 1,
 68: 1,
 99: 1,
 192: 2,
 249: 1,
 264: 1,
 356: 1,
 389: 1,
 561: 1,
 639: 1,
 656: 1,
 687: 1,
 761: 1,
 773: 1,
 804: 1,
 869: 2,
 914: 1,
 1035: 1,
 1156: 1,
 1221: 1,
 1287: 1,
 1364: 1,
 1407: 1,
 1533: 2,
 1857: 1,
 2096: 1,
 2117: 1,
 2482: 2,
 2548: 1,
 2705: 1,
 2723: 1,
 2868: 2,
 2992: 2,
 3455: 1,
 3717: 1,
 3851: 1,
 4322: 1,
 4382: 1,
 4613: 1,
 4713: 1,
 4906: 1}

In [76]:

song_dict.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 656, 2705, 18, 19, 20, 21, 23, 25, 26, 28, 30, 389, 2992, 1156, 2723, 804, 3717, 42, 1287, 4613, 45, 687, 2096, 561, 2482, 773, 4382, 54, 56, 57, 4906, 192, 1857, 1035, 68, 1221, 3851, 3455, 1364, 761, 36, 2117, 4322, 99, 356, 869, 4713, 914, 2548, 2868, 249, 1407, 1533, 264, 639])

In [77]:
song_dict.values()

dict_values([6, 4, 2, 2, 5, 3, 1, 1, 1, 2, 3, 1, 1, 1, 1, 2, 2, 2, 2, 4, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1])

##TO DO
### Above I converted one song's data into a dictionary, but we should do this for each song in the test and train datasets.
### Maybe make a Pandas DataFrame with columns : 'ID1','ID2','SongDict' ?
### Then, sum all the counts and plot rank vs. frequency to see if it follows a power law
### Could also use the total counts to make a word cloud where words are sized by their frequency
