# Preprocess data for machine learning

### October 11, 2017
### Tiffany Huang

In [1]:
import pandas as pd
import numpy as np

# Load the four source CSV files into pandas DataFrames.
# Files are read in the order listed; each result is bound to the matching name on the left.
members_data, song_data, raw_train_data, raw_test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('members.csv', 'songs.csv', 'train.csv', 'test.csv')
)

In [2]:
# Gather all the features together into one mega train and one mega test file.
# Left joins keep every row of the train/test frame even when the member or song
# lookup has no match; unmatched rows get NaN in the joined columns.
train_data = pd.merge(raw_train_data, members_data, how='left', on='msno')
train_data = pd.merge(train_data, song_data, how='left', on='song_id')

test_data = pd.merge(raw_test_data, members_data, how='left', on='msno')
# Merge the song columns onto the member-enriched frame. Merging onto
# raw_test_data here would discard the member columns added on the line above
# and leave the test set with a different schema than the train set.
test_data = pd.merge(test_data, song_data, how='left', on='song_id')

# Both frames in one list so later steps can apply the same transform to each.
train_and_test_data = [train_data, test_data]

In [3]:
# Show the training feature names, a preview of the first rows,
# and the list of columns that contain at least one missing value.
feature_names = train_data.columns
print(feature_names.values)

preview_rows = train_data.head()
print(preview_rows)

column_has_missing = train_data.isnull().any()
print(feature_names[column_has_missing].tolist())

['msno' 'song_id' 'source_system_tab' 'source_screen_name' 'source_type'
 'target' 'city' 'bd' 'gender' 'registered_via' 'registration_init_time'
 'expiration_date' 'song_length' 'genre_ids' 'artist_name' 'composer'
 'lyricist' 'language']
                                           msno  \
0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   
1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   

                                        song_id source_system_tab  \
0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   
1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   
2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   
3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   
4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   

    source_screen_name      source_typ

# Data Analysis

1. Incomplete Fields:

   - source_system_tab - Not worth filling in
   - source_screen_name - Not worth filling in
   - source_type - Maybe worth filling in
   - bd (age) - Need to fix outlier values like 0
   - gender - Should fill this in, how?
   - song_length - Maybe should fill this in, how?
   - genre_ids - Should fill this in, how?
   - artist_name - Maybe not worth filling in
   - composer - Maybe not worth filling in
   - lyricist - Maybe not worth filling in
   - language - Not worth filling in


2. Fields to Remove:
   - msno
   - song_id
   - source_system_tab
   - source_screen_name
   - registered_via
   - registration_init_time
   - expiration_date
   - artist_name? Without any prior knowledge of the listener's taste, this is probably not helpful
   - composer?
   - lyricist?
   - language


3. Creating New Features:
   - Change source_type to local or online -> 0 or 1
   - Change ages to ranges (quartile?) -> 0 - 4
   - Change gender to 0 or 1
   - Change song length to ranges
   

In [5]:
# Tally how many training rows carry each distinct 'bd' value.
bd_column = train_data['bd']
bd_column.value_counts()

 0       2940499
 25       261906
 24       258688
 26       257869
 22       255731
 27       253065
 28       232548
 23       228255
 21       222087
 29       211192
 30       203157
 20       189439
 31       161844
 32       151544
 19       141066
 33       127455
 18       124868
 34       115324
 35       112529
 17        87119
 36        82632
 37        79722
 38        75166
 40        54284
 39        52676
 41        51378
 16        42164
 43        35301
 44        33542
 45        31929
          ...   
 3           507
 82          447
 5           392
 73          376
 10          361
 13          345
 2           323
 68          311
 11          290
 931         247
 12          222
 95          203
 87          154
 144         152
 131         146
-43          142
 107          94
 112          90
-38           53
 102          46
 83           40
 70           33
 106          23
 90           15
 85           12
 89           11
 93           10
 78           