In [1]:
import numpy
import pandas
import csv
from sklearn.preprocessing import Imputer
from sklearn.decomposition import IncrementalPCA   

In [2]:
train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'nmf.csv'  # NOTE(review): file name says NMF, but this notebook imports IncrementalPCA -- confirm the name is intended.

# Load the training data into a nested mapping: train_data[user][artist] = plays.
# Columns are read by position (0, 1, 2); the first row is skipped as a header.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]

        # setdefault creates the per-user dict the first time a user is seen.
        train_data.setdefault(user, {})[artist] = int(plays)

# Compute per-user median, minimum and maximum play counts.
# .items()/.values() are used instead of .iteritems() so the cell runs on
# both Python 2 and Python 3.
user_medians = {}
user_mins = {}
user_maxs = {}
for user, user_data in train_data.items():
    # Build the array once and reuse it for all three statistics.
    user_plays = numpy.array(list(user_data.values()))
    user_medians[user] = numpy.median(user_plays)
    user_mins[user] = numpy.min(user_plays)
    user_maxs[user] = numpy.max(user_plays)

In [3]:
# Preprocessing
# Outer keys of train_data (users) become columns and inner keys (artists)
# become the row index. Each missing user/artist cell is then filled with the
# median of its own column.
df = pandas.DataFrame.from_dict(train_data, orient='columns')
df = df.fillna(value=df.median(axis=0))

In [4]:
# Normalization: express every column relative to that user's median play
# count, i.e. plays / median - 1. The previous denominator was an empty tuple
# `()`, which raises TypeError. This form is the exact inverse of the
# `(df[user] + 1.) * user_medians[user]` step applied after reconstruction.
# NOTE(review): assumes no user has a median of 0 -- confirm against the data.
for user in df.columns:
    df[user] = df[user] / user_medians[user] - 1.
# Dense array copy of the normalized frame, used as input to the decomposition.
R = numpy.array(df.values)

In [5]:
# Decomposition
# Fit the PCA model on R, then replace R with its projection onto the fitted
# components.
# NOTE(review): IncrementalPCA() is built with default arguments, so the number
# of components is not restricted here -- confirm the later inverse transform
# is not a (near) lossless round trip.
model = IncrementalPCA()
model.fit(R)
R = model.transform(R)

In [6]:
# Map the component scores back into the feature space the model was fitted
# on, overwriting R with the reconstructed matrix.
R = model.inverse_transform(R)

In [7]:
# Reconstruction
# Wrap the reconstructed matrix in a frame carrying the same row and column
# labels as before, then release the raw array.
df = pandas.DataFrame(R, index=df.index, columns=df.columns)
del R
# Undo the per-user scaling: add 1 and multiply by that user's median.
for user in df.columns:
    df[user] = df[user].add(1.).mul(user_medians[user])

In [8]:
# Write out test solutions.
# For every test row (Id, user, artist -- read by position after skipping the
# header line) look up the value stored in df and emit an "Id,plays" row.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            # Named row_id rather than id so the builtin id() is not shadowed
            # in the notebook's global namespace for later cells.
            row_id = row[0]
            user   = row[1]
            artist = row[2]
            # Rows of df are artists and columns are users; a label missing
            # from df raises KeyError here.
            res    = df.loc[artist, user]
            soln_csv.writerow([row_id, res])