In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [3]:
df_train.columns

Index([u'genre_id', u'ts_listen', u'media_id', u'album_id', u'context_type',
       u'release_date', u'platform_name', u'platform_family',
       u'media_duration', u'listen_type', u'user_gender', u'user_id',
       u'artist_id', u'user_age', u'is_listened'],
      dtype='object')

In [4]:
df_train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1


In [5]:
y = df_train["is_listened"].values

In [6]:
categorical_cols = ["genre_id",
                   "media_id",
                   "album_id",
                   "context_type",
                   "release_date",
                   "platform_name",
                   "platform_family",
                   "listen_type",
                   "user_gender",
                   "user_id",
                   "artist_id",
                   "user_age"]

### Let's inspect the categorical variables!

In [7]:
for col in categorical_cols:
    print("{}: {}".format(col, len(df_train[col].unique())))

genre_id: 2922
media_id: 452975
album_id: 151471
context_type: 74
release_date: 8902
platform_name: 3
platform_family: 3
listen_type: 2
user_gender: 2
user_id: 19918
artist_id: 67142
user_age: 13


### Strategy
1. Apply onehot to categorical with few classes
2. Rank categories to artist id
3. target encoder for context and genre
4. discard media, album, release date and user_id

In [8]:
from greenpyce.feature_engineering import TargetEncoder
from greenpyce.feature_engineering import onehot
from greenpyce.feature_engineering import RankCategorical
from greenpyce.feature_engineering import LabelCount

In [9]:
cols_for_target_encoder = ["genre_id", "context_type"]
cols_for_onehot = ["platform_name", "platform_family", "listen_type", "user_gender", "user_age"]
cols_for_rank = ["artist_id"]

target = "is_listened"

In [10]:
te = TargetEncoder(cols_for_target_encoder, "is_listened")

In [11]:
te.fit(df_train)

In [12]:
te.transform(df_train)
te.transform(df_test)

In [13]:
rc = RankCategorical(cols_for_rank)
rc.fit(df_train)
rc.transform(df_train)
rc.transform(df_test)

In [None]:
# lc = RankCategorical(cols_for_count)
# lc.fit(df_train)
# lc.transform(df_train)
# lc.transform(df_test)

In [14]:
df_train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,0.688147,1480597215,222606,41774,0.695604,20040704,1,0,223,0,0,9241,66570,29,0
1,0.567614,1480544735,250467,43941,0.688532,20060301,2,1,171,0,0,16547,11629,30,1
2,0.744035,1479563953,305197,48078,0.573604,20140714,2,1,149,1,1,7665,4816,29,1
3,0.663155,1480152098,900502,71521,0.688532,20001030,0,0,240,0,1,1580,1115,30,0
4,0.663155,1478368974,542335,71718,0.688532,20080215,0,0,150,0,1,1812,3491,24,1


In [15]:
df_train = onehot(df_train, cols_for_onehot)
df_test = onehot(df_test, cols_for_onehot)

In [16]:
df_train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,media_duration,user_id,artist_id,is_listened,...,user_age_21,user_age_22,user_age_23,user_age_24,user_age_25,user_age_26,user_age_27,user_age_28,user_age_29,user_age_30
0,0.688147,1480597215,222606,41774,0.695604,20040704,223,9241,66570,0,...,0,0,0,0,0,0,0,0,1,0
1,0.567614,1480544735,250467,43941,0.688532,20060301,171,16547,11629,1,...,0,0,0,0,0,0,0,0,0,1
2,0.744035,1479563953,305197,48078,0.573604,20140714,149,7665,4816,1,...,0,0,0,0,0,0,0,0,1,0
3,0.663155,1480152098,900502,71521,0.688532,20001030,240,1580,1115,0,...,0,0,0,0,0,0,0,0,0,1
4,0.663155,1478368974,542335,71718,0.688532,20080215,150,1812,3491,1,...,0,0,0,1,0,0,0,0,0,0


In [17]:
#df_train = rank_categorical(df_train, cols_for_rank)

In [18]:
df_train.columns.values

array(['genre_id', 'ts_listen', 'media_id', 'album_id', 'context_type',
       'release_date', 'media_duration', 'user_id', 'artist_id',
       'is_listened', 'platform_name_1', 'platform_name_2',
       'platform_family_1', 'platform_family_2', 'listen_type_1',
       'user_gender_1', 'user_age_19', 'user_age_20', 'user_age_21',
       'user_age_22', 'user_age_23', 'user_age_24', 'user_age_25',
       'user_age_26', 'user_age_27', 'user_age_28', 'user_age_29',
       'user_age_30'], dtype=object)

In [19]:
### Drop
to_drop = ['ts_listen', 'media_id', 'album_id', 'release_date', 'media_duration', 'user_id', 'artist_id']
df_train.drop(to_drop, axis = 1,inplace=True)
df_test.drop(to_drop, axis = 1,inplace=True)

In [20]:
features = [col for col in df_train.columns.values if col != target]
features

['genre_id',
 'context_type',
 'platform_name_1',
 'platform_name_2',
 'platform_family_1',
 'platform_family_2',
 'listen_type_1',
 'user_gender_1',
 'user_age_19',
 'user_age_20',
 'user_age_21',
 'user_age_22',
 'user_age_23',
 'user_age_24',
 'user_age_25',
 'user_age_26',
 'user_age_27',
 'user_age_28',
 'user_age_29',
 'user_age_30']

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_train[features], df_train[target], test_size = 0.2)

In [23]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
model.score(X_test, y_test)

0.70351581956743336

0.70345496362865445

In [25]:
from sklearn.metrics import confusion_matrix

In [26]:
confusion_matrix(y_test, model.predict(X_test))

array([[118877, 359265],
       [ 88950, 944675]])

In [68]:
import gzip
import csv
import numpy as np

            
def build_kaggle_submission(preds):
    with gzip.open('my_submission.csv.gz', 'wt') as outf:
        fo = csv.writer(outf, lineterminator='\n')
        fo.writerow(["sample_id", "is_listened"])
        
        for i, pred in enumerate(preds):
            fo.writerow([i, pred])

In [69]:
p = model.predict_proba(df_test[features])

In [70]:
p = p[:, 1]

In [71]:
build_kaggle_submission2(p)