In [2]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [3]:
# Load only the first 10 rows of the training CSV into a DataFrame.
df_train = pd.read_csv(
    "data/train.csv",
    nrows=10,
)

In [16]:
# Show every loaded row as a plain tuple of its column values.
list(map(tuple, df_train.values))

[(25471,
  1480597215,
  222606,
  41774,
  12,
  20040704,
  1,
  0,
  223,
  0,
  0,
  9241,
  55164,
  29,
  0),
 (25571,
  1480544735,
  250467,
  43941,
  0,
  20060301,
  2,
  1,
  171,
  0,
  0,
  16547,
  55830,
  30,
  1),
 (16,
  1479563953,
  305197,
  48078,
  1,
  20140714,
  2,
  1,
  149,
  1,
  1,
  7665,
  2704,
  29,
  1),
 (7,
  1480152098,
  900502,
  71521,
  0,
  20001030,
  0,
  0,
  240,
  0,
  1,
  1580,
  938,
  30,
  0),
 (7,
  1478368974,
  542335,
  71718,
  0,
  20080215,
  0,
  0,
  150,
  0,
  1,
  1812,
  2939,
  24,
  1),
 (7,
  1478382544,
  542335,
  71718,
  0,
  20080215,
  0,
  0,
  150,
  0,
  1,
  1812,
  2939,
  24,
  1),
 (7,
  1478338409,
  542335,
  71718,
  0,
  20080215,
  0,
  0,
  150,
  0,
  1,
  1812,
  2939,
  24,
  1),
 (7,
  1478353709,
  542335,
  71718,
  1,
  20080215,
  1,
  0,
  150,
  1,
  1,
  10325,
  2939,
  29,
  1),
 (7,
  1479130924,
  542335,
  71718,
  0,
  20080215,
  0,
  0,
  150,
  0,
  1,
  1812,
  2939,
  24,
  1

In [7]:
# Display the column labels of the loaded training frame.
df_train.columns

Index([u'genre_id', u'ts_listen', u'media_id', u'album_id', u'context_type',
       u'release_date', u'platform_name', u'platform_family',
       u'media_duration', u'listen_type', u'user_gender', u'user_id',
       u'artist_id', u'user_age', u'is_listened'],
      dtype='object')

In [None]:
# Convert the epoch-second "ts_listen" values to datetimes in one vectorized call.
# pd.to_datetime(..., unit="s") yields naive UTC timestamps, so the result does not
# depend on the timezone of the machine running the notebook, and fixed-length
# time windows computed on it are not distorted by local daylight-saving jumps
# (both were the case with a row-wise datetime.fromtimestamp).
df_train["ts_datetime"] = pd.to_datetime(df_train["ts_listen"], unit="s")

In [None]:
# Distinct user ids, in order of first appearance, as a plain Python list.
users = df_train["user_id"].drop_duplicates().tolist()

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

def time_offset_rolling_df_count(data_df_ser, window_i_s, min_periods_i=1, center_b=False):
    """Count non-null observations inside a time-based rolling window.

    For every timestamp ``ts`` in the index of ``data_df_ser`` the rows whose
    index labels fall inside the window are selected and ``.count()`` is
    applied to them.

    Parameters
    ----------
    data_df_ser : pandas.Series or pandas.DataFrame
        Data indexed by datetimes. Rows are selected by label slicing, so the
        index is expected to be sorted.
    window_i_s : str or pandas offset
        Fixed-length window given as a frequency alias such as ``"120min"``.
    min_periods_i : int, default 1
        Minimum number of rows a window must contain; windows with fewer rows
        produce NaN instead of a count.
    center_b : bool, default False
        If False the window is ``(ts - window, ts]``; if True it is
        ``(ts - window/2, ts + window/2]``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        One entry per input row (a Series for Series input, one column per
        input column for DataFrame input), indexed like ``data_df_ser``.
    """
    # Resolve the window length once instead of on every row. ``pd.datetools``
    # no longer exists in pandas, so the alias is parsed with ``to_offset`` and
    # converted through its nanosecond length.
    window_td = pd.Timedelta(
        pd.tseries.frequencies.to_offset(window_i_s).nanos, unit="ns"
    )
    # Label slices include both endpoints; shifting the start by one
    # microsecond makes the left edge of the window exclusive.
    one_microsecond = pd.Timedelta(microseconds=1)
    if center_b:
        look_back = window_td / 2
        look_ahead = window_td / 2
    else:
        look_back = window_td
        look_ahead = pd.Timedelta(0)

    def calculate_count_ts(ts):
        """Count the observations in the window anchored at ``ts``."""
        window = data_df_ser.loc[ts - look_back + one_microsecond: ts + look_ahead]
        counts = window.count()
        if window.shape[0] < min_periods_i:
            # Multiplying by NaN keeps the scalar/Series shape of the result.
            return counts * np.nan
        return counts

    idx_ser = pd.Series(data_df_ser.index.to_pydatetime(), index=data_df_ser.index)
    return idx_ser.apply(calculate_count_ts)


def time_offset_rolling_df_sum(data_df_ser, window_i_s, min_periods_i=1, center_b=False):
    """Sum the observations inside a time-based rolling window.

    For every timestamp ``ts`` in the index of ``data_df_ser`` the rows whose
    index labels fall inside the window are selected and ``.sum()`` is
    applied to them.

    Parameters
    ----------
    data_df_ser : pandas.Series or pandas.DataFrame
        Data indexed by datetimes. Rows are selected by label slicing, so the
        index is expected to be sorted.
    window_i_s : str or pandas offset
        Fixed-length window given as a frequency alias such as ``"120min"``.
    min_periods_i : int, default 1
        Minimum number of rows a window must contain; windows with fewer rows
        produce NaN instead of a sum.
    center_b : bool, default False
        If False the window is ``(ts - window, ts]``; if True it is
        ``(ts - window/2, ts + window/2]``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        One entry per input row (a Series for Series input, one column per
        input column for DataFrame input), indexed like ``data_df_ser``.
    """
    # Resolve the window length once instead of on every row. ``pd.datetools``
    # no longer exists in pandas, so the alias is parsed with ``to_offset`` and
    # converted through its nanosecond length.
    window_td = pd.Timedelta(
        pd.tseries.frequencies.to_offset(window_i_s).nanos, unit="ns"
    )
    # Label slices include both endpoints; shifting the start by one
    # microsecond makes the left edge of the window exclusive.
    one_microsecond = pd.Timedelta(microseconds=1)
    if center_b:
        look_back = window_td / 2
        look_ahead = window_td / 2
    else:
        look_back = window_td
        look_ahead = pd.Timedelta(0)

    def calculate_sum_ts(ts):
        """Sum the observations in the window anchored at ``ts``."""
        window = data_df_ser.loc[ts - look_back + one_microsecond: ts + look_ahead]
        totals = window.sum()
        if window.shape[0] < min_periods_i:
            # Multiplying by NaN keeps the scalar/Series shape of the result.
            return totals * np.nan
        return totals

    idx_ser = pd.Series(data_df_ser.index.to_pydatetime(), index=data_df_ser.index)
    return idx_ser.apply(calculate_sum_ts)

In [None]:
# Rolling-window length; passed as `window_i_s` to the rolling helpers and
# embedded in the names of the generated feature columns.
last_x_min = "120min"

In [None]:
# Build, for every user, the rolling count and rolling sum of "is_listened"
# over the trailing `last_x_min` window, one output frame per user.
list_df_users = []
# A single groupby pass hands over each user's rows without re-scanning the
# whole frame with a boolean mask per user; sort=False keeps the users in
# order of first appearance.
for user_id, df_user in tqdm(df_train.groupby("user_id", sort=False)):
    df_user = (
        df_user
        .reset_index(level=0)       # keep the original row label as an "index" column
        .set_index("ts_datetime")   # the rolling helpers slice on a datetime index
        .sort_index()               # label slicing needs chronological order
    )
    # Only "is_listened" is needed, so roll over that Series rather than
    # over every column of the frame.
    listened = df_user["is_listened"]
    df_user["count_listened_{}".format(last_x_min)] = time_offset_rolling_df_count(
        listened, window_i_s=last_x_min
    )
    df_user["sum_listened_{}".format(last_x_min)] = time_offset_rolling_df_sum(
        listened, window_i_s=last_x_min
    )
    # Move ts_datetime back to a regular column before collecting the frame.
    list_df_users.append(df_user.reset_index(level=0))

In [None]:
# Stack the per-user frames into the final training frame. Each per-user frame
# had its index reset before being collected, so index labels repeat across
# users here; the original row label is carried in the "index" column.
df_treino_final = pd.concat(list_df_users)