In [32]:
import json
import datetime
import pandas as pd

In [39]:
# read data from the json file
with open('/Users/miffyvo/Desktop/sc_data_science_challenge.json') as data_file:    
    data = json.load(data_file)

In [40]:
# print all the keys available
print data.keys()

# extract the column and data arrays
columns = data['columns']

# append skip column
columns.append("skip_ind")

# extract the data from json
data = data['data']

# initialize the skip index column
for d in data:
    d.append(0)

# number of data items
print "ncol:", len(columns)
print "nrow:", len(data)

[u'data', u'columns']
ncol: 16
nrow: 828169


Define <br/>
* $l\in L$ set of listeners <br/>
* $d_l$ signup date of listener $l$ <br/>
* $m\in M_l$ set of recommended tracks listened by listener $l$ <br/>
* $v_{m, l}$ client version of the software used by listener $l$ to listen to track $m$ <br/>
* $z_{m, l}$ country where listener $l$ listened to track $m$ <br/>
* $c_{m, l}$ top genre category identified at the time listener $l$ listened to track $m$ <br/>
* $\tau_{m, l}$ total listening time by listener $l$ last month <br/>
* $n_{m, l}$ average number of daily tracks listened by listener $l$ last month <br/>
* $t_{m, l}$ start time of track $m$ by listener $l$ <br/>
* $c_m$ genre category of track $m$ <br/>
* $u_m$ upload date of track $m$ <br/>
* $d_m$ duration of track $m$ <br/>
* $e_{m, l}$ elapsed/listening time of track $m$ by listener $l$ <br/>
* $w_{m, l}$ section of the SoundCloud web app where listener $l$ listened to the track $m$ <br/>
* $a_{m, l}$ algorithm used to recommend track $m$ to listener $l$ <br/>

A recommended track $m$ is recognized as skipped when the track has more than $20\%$ remaining ($d_{m,l} - e_{m,l} > 0.2 * d_{m,l}$).

In [41]:
i_track_duration = columns.index("track_duration")
i_listen_duration = columns.index("listen_duration")
i_skip = columns.index("skip_ind")
delta = 0.2

for d in data:
    d[i_skip] = 1 if d[i_track_duration] - d[i_listen_duration] > delta * d[i_track_duration] else 0

In [43]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets

In [254]:
pdata = pd.DataFrame(data = data, columns = columns)

In [255]:
Y = pdata["skip_ind"]
xvars = list(columns)

# remove listener_id from X as the maximum count of recommended tracks 
# for a user is only 0.6% of the entire track population
# print pdata["listener_id"].value_counts()
xvars.remove("listener_id")

# remove track_id from X as the maximum count of recommended tracks 
# for a track is only 0.15% of the entire track population
# print pdata["track_id"].value_counts()
xvars.remove("track_id")

# remove listen_duration from X as it is part of the Y
xvars.remove("listen_duration")

# extract X
X = pdata[xvars]

In [256]:
# handle country code
# only US and GB
# print pdata["country_code"].value_counts()
X['country_code'] = np.where(X['country_code'] == 'US', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [257]:
# handle client version
# set 1 for version 204.0.0 as it accounts for almost half of the population
# print pdata["client_version"].value_counts()
X['client_version'] = np.where(X['client_version'] == '204.0.0', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [258]:
# handle listening context
# need to convert them to dummy variables as they are quite distributed
# print pdata["listening_context"].value_counts()
dummy = pd.get_dummies(X["listening_context"], prefix="lc")
dummy.drop('lc_you', axis=1, inplace=True)

# add the dummy columns back to X
X = X.join(dummy)

# drop the origin column
X.drop('listening_context', axis=1, inplace=True)

In [259]:
# handle recommender algorithms
# print pdata["recommender_algorithm_name"].value_counts()
dummy = pd.get_dummies(X["recommender_algorithm_name"], prefix="ra")
dummy.drop('ra_fallback', axis=1, inplace=True)

# add the dummy columns back to X
X = X.join(dummy)

# drop the origin column
X.drop('recommender_algorithm_name', axis=1, inplace=True)

In [260]:
# handle track genre category
print pdata["track_genre_category"].value_counts()

X['tc_hiphop'] = np.where(X['track_genre_category'] == 'HipHop & R&B', 1, 0)
X['tc_dance'] = np.where(X['track_genre_category'] == 'Dance & Electronic', 1, 0)

# drop the origin column
X.drop('track_genre_category', axis=1, inplace=True)

HipHop & R&B          439394
Dance & Electronic    244999
Pop                    59622
Rock                   37748
World                  25095
Reggae                  5687
Speech                  4488
Latin                   2931
Classical               2807
Jazz                    1846
Metal                   1518
Country                 1123
Unknown                  911
Name: track_genre_category, dtype: int64


In [261]:
# handle listener_top_genre_category
print pdata["listener_top_genre_category_listened"].value_counts()

X['lt_hiphop'] = np.where(X['listener_top_genre_category_listened'] == 'HipHop & R&B', 1, 0)
X['lt_dance'] = np.where(X['listener_top_genre_category_listened'] == 'Dance & Electronic', 1, 0)

# drop the origin column
X.drop('listener_top_genre_category_listened', axis=1, inplace=True)

HipHop & R&B          542346
Dance & Electronic    188723
Pop                    35766
Rock                   21990
World                  21534
Speech                  5310
Latin                   2521
Reggae                  2498
Classical               1529
Metal                   1362
Jazz                     501
Country                  312
Name: listener_top_genre_category_listened, dtype: int64


In [262]:
def ts2date(s):
    return datetime.datetime.fromtimestamp(int(s)).strftime('%Y-%m-%d %H:%M:%S')

def ts2dow(s):
    return datetime.datetime.fromtimestamp(int(s)).strftime('%a')

def ts2year(s):
    return datetime.datetime.fromtimestamp(int(s)).strftime('%Y')

def ts2month(s):
    return datetime.datetime.fromtimestamp(int(s)).strftime('%m')

In [263]:
timecols = ["ts", "track_upload_date", "listener_signup_date"]
prefixes = ["ts", "ud", "sd"]

for i in range(0, len(timecols)):
    col = timecols[i]
    prefix = prefixes[i]
    dummy = X.apply(lambda x: ts2dow(x[col]), axis=1)
    dummy = pd.get_dummies(dummy, prefix=prefix + "_dow")
    
    # remove the last dummy variable
    t = dummy.columns.values
    t = t[len(t) - 1]
    dummy.drop(t, axis=1, inplace=True)

    # add the dummy columns back to X
    X = X.join(dummy)

In [264]:
for i in range(0, len(timecols)):
    col = timecols[i]
    prefix = prefixes[i]
    dummy = X.apply(lambda x: ts2month(x[col]), axis=1)
    dummy = pd.get_dummies(dummy, prefix=prefix + "_mon")
    
    # remove the last dummy variable
    t = dummy.columns.values
    t = t[len(t) - 1]
    dummy.drop(t, axis=1, inplace=True)
    
    # add the dummy columns back to X
    X = X.join(dummy)

In [265]:
# skip i = 0, as the years of all ts are the same
i = 1
col = timecols[i]
prefix = prefixes[i]
dummy = X.apply(lambda x: ts2year(x[col]), axis=1)
#dummy.value_counts()
dummy_2016 = pd.DataFrame(data = np.where(dummy == '2016', 1, 0), columns = [prefix + "_year_2016"])
dummy_2015 = pd.DataFrame(data = np.where(dummy == '2015', 1, 0), columns = [prefix + "_year_2015"])
X = X.join(dummy_2016)
X = X.join(dummy_2015)

In [266]:
# skip i = 0, as the years of all ts are the same
i = 2
col = timecols[i]
prefix = prefixes[i]
dummy = X.apply(lambda x: ts2year(x[col]), axis=1)
#dummy.value_counts()
dummy_2016 = pd.DataFrame(data = np.where(dummy == '2016', 1, 0), columns = [prefix + "_year_2016"])
dummy_2015 = pd.DataFrame(data = np.where(dummy == '2015', 1, 0), columns = [prefix + "_year_2015"])
dummy_2014 = pd.DataFrame(data = np.where(dummy == '2014', 1, 0), columns = [prefix + "_year_2014"])
dummy_2013 = pd.DataFrame(data = np.where(dummy == '2013', 1, 0), columns = [prefix + "_year_2013"])
X = X.join(dummy_2016)
X = X.join(dummy_2015)
X = X.join(dummy_2014)
X = X.join(dummy_2013)

In [267]:
# drop the origin timestamp columns
for i in timecols:
    X.drop(i, axis=1, inplace=True)

In [268]:
# convert to minutes
X['listener_prev_month_listening_time'] = X['listener_prev_month_listening_time'] / 1000 / 60

In [269]:
# convert to minutes
X['track_duration'] = X['track_duration'] / 1000 / 60

In [283]:
X['skip_ind'] = Y
X = X.dropna()
Y = X['skip_ind']
X.drop('skip_ind', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [286]:
X['skip_ind'] = Y
X.to_csv("/Users/miffyvo/Desktop/ds_full.txt", sep='\t', encoding='utf-8')

In [284]:
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X, Y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)