In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from time import time
%matplotlib inline

#### Load data

In [2]:
# Read each input table from its CSV file (paths are relative to the
# notebook's working directory) and bind it to its own DataFrame name.
csv_files = (
    "train.csv",
    "profiles.csv",
    "artists.csv",
    "artists_with_genres.csv",
    "user_median.csv",
    "global_median.csv",
)
(
    train_df,
    profiles_df,
    artists_df,
    artists_with_genres_df,
    user_median_df,
    global_median_df,
) = [pd.read_csv(csv_file) for csv_file in csv_files]

#### Calculate mean ages per gender, country combination

In [3]:
# overall mean age across all profiles (Series.mean skips missing ages,
# so no explicit null filter is needed)
mean_age = profiles_df["age"].mean()

######################################################################

# mean age of male profiles
males_only_df = profiles_df[profiles_df["sex"] == 'm']
mean_male_age = males_only_df["age"].mean()

######################################################################

# mean age of female profiles
females_only_df = profiles_df[profiles_df["sex"] == 'f']
mean_female_age = females_only_df["age"].mean()

######################################################################

# get list of unique countries (contains NaN if any profile has no country)
countries = profiles_df["country"].unique()


def _mean_age_by_country(frame):
    """Return {str(country): mean age within `frame`} for every entry of `countries`.

    One groupby replaces a full-frame scan per country. A country that has no
    rows in `frame`, or whose rows all lack an age, maps to NaN, which is what
    taking the mean of an empty or all-NaN selection yields. Keys are
    str(country), so a missing country (NaN) is stored under the key 'nan'.
    """
    observed = frame.groupby("country")["age"].mean().to_dict()
    return {str(country): observed.get(country, np.nan) for country in countries}


# mean age per country
mean_country_age = _mean_age_by_country(profiles_df)

# mean age per country, males only
# (grouping the already-filtered frame avoids indexing it with a boolean mask
# built from the unfiltered profiles_df)
mean_country_male_age = _mean_age_by_country(males_only_df)

# mean age per country, females only
mean_country_female_age = _mean_age_by_country(females_only_df)



#### Create training dataframe with indicator variable columns and missing age values filled in

In [None]:
# initialize training df
# Work on a copy: a plain assignment would alias profiles_df, so every column
# added below would also appear on profiles_df and re-running the mean-age
# cell would then average over imputed ages.
training_df = profiles_df.copy()

######################################################################

# create sex indicators (0/1 integers)
is_male = training_df["sex"] == 'm'
is_female = training_df["sex"] == 'f'
sex_missing = training_df["sex"].isnull()
training_df["male"] = is_male.astype(int)
training_df["female"] = is_female.astype(int)
training_df["sex_missing"] = sex_missing.astype(int)

######################################################################

# create age indicator
age_missing = training_df["age"].isnull()
training_df["age_missing"] = age_missing.astype(int)

# Look up the per-country means for every row. The dicts are keyed by
# str(country); rows with a missing country map to NaN instead of raising
# KeyError the way a direct dict lookup with NaN would.
user_country = training_df["country"]
country_avg = user_country.map(mean_country_age)
country_male_avg = user_country.map(mean_country_male_age)
country_female_avg = user_country.map(mean_country_female_age)

# Choose the replacement age per row, most specific mean first:
#   sex 'm'/'f' + country -> country+sex mean
#   sex 'm'/'f', no country -> sex mean
#   no sex + country       -> country mean
#   anything else          -> global mean
# A group mean can itself be NaN (e.g. nobody of that sex in that country
# reported an age). Writing it directly would leave the age missing, so each
# level falls back to the next less specific mean and finally to mean_age.
male_fill = country_male_avg.fillna(country_avg).fillna(mean_male_age)
female_fill = country_female_avg.fillna(country_avg).fillna(mean_female_age)

age_fill = pd.Series(mean_age, index=training_df.index)
age_fill.loc[sex_missing] = country_avg[sex_missing]
age_fill.loc[is_male] = male_fill[is_male]
age_fill.loc[is_female] = female_fill[is_female]
age_fill = age_fill.fillna(mean_age)

# only rows whose age was missing are touched
training_df.loc[age_missing, "age"] = age_fill[age_missing]

######################################################################

# create country indicators
training_df["country_missing"] = user_country.isnull().astype(int)
# one 0/1 column per country, named after the country
for country_name in countries:
    if pd.isnull(country_name):
        # missing country is already captured by "country_missing"
        continue
    training_df[country_name] = (training_df["country"] == country_name).astype(int)

#### Transfer per-artist play counts from train_df into training_df, one column per artist

In [None]:
# Add one column per distinct artist id found in train_df, every cell
# starting at 0 until play counts are written in.
artists = train_df["artist"].unique()
for artist_id in artists:
    training_df[artist_id] = 0
 
######################################################################

# create helper function to keep track of loop progress and est. time remaining
def progress(step,steps,t0):
    """Report progress each time another tenth of the work has been completed.

    step  -- number of iterations finished so far
    steps -- total number of iterations
    t0    -- value of time() taken when the loop started

    A message is written only when `step` is exactly i * (steps // 10) for
    i = 1..9; nothing is written on completion or when `steps` is below 10.
    The message starts with a carriage return and has no trailing newline, so
    successive reports overwrite each other on one line. The remaining time is
    extrapolated linearly from the time elapsed since `t0`.
    """
    # local import: sys is not among the imports visible at the top of the notebook
    import sys

    # floor division keeps the milestones integral on both Python 2 and 3
    tenth = steps // 10
    if tenth <= 0 or step % tenth != 0:
        return
    i = step // tenth
    if i < 1 or i > 9:
        return

    percent_complete = (100 * tenth * i) // steps
    seconds_remaining = int(((time() - t0) / i) * (10 - i))
    # sys.stdout.write works on Python 2 and 3, unlike the print statement
    sys.stdout.write("\r" + str(percent_complete) + "% complete " + str(seconds_remaining) + " seconds remaining")
    sys.stdout.flush()

######################################################################

# set index to user so play counts can be aligned on user id
training_df = training_df.set_index("user")

# Apart from "user", the first remaining train_df column is taken as the
# artist id and the second as the play count. These are the same positions
# the former row-by-row loop read as row[0] and row[1].
value_cols = [col for col in train_df.columns if col != "user"]
artist_col, plays_col = value_cols[0], value_cols[1]

# Build the user x artist table in one pass instead of one .loc write per
# train_df row (that loop took ~80 minutes).
#   - .last() keeps the final entry when a (user, artist) pair repeats, the
#     same "last write wins" outcome as the loop
#   - reindex restores training_df's users and the first-seen artist order
#   - pairs with no plays become 0, matching the zero-initialised columns
# NOTE(review): users present in train_df but absent from training_df are
# ignored here; the loop would have appended a new row for them. Confirm no
# such users exist.
# NOTE(review): the whole table is materialised in memory at once.
plays_wide = (
    train_df.groupby(["user", artist_col])[plays_col]
    .last()
    .unstack(artist_col)
    .reindex(index=training_df.index.unique(), columns=train_df[artist_col].unique())
    .fillna(0)
    .astype(train_df[plays_col].dtype)
)

# Replace any artist columns that already exist (e.g. zero-initialised ones)
# with the filled-in versions; the left join keeps training_df's row order.
stale_cols = [col for col in plays_wide.columns if col in training_df.columns]
training_df = training_df.drop(stale_cols, axis=1).join(plays_wide)

# save dataframe
training_df.to_pickle("training_df.pkl")
#training_df = pd.read_pickle("training_df.pkl")