In [11]:
import os
import numpy as np 
import pandas as pd 
from itertools import islice, cycle
from more_itertools import pairwise

print('Dataset:')
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Dataset:


# Preprocessing

In [12]:
df = pd.read_csv('../interactions.csv')
df.head()

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01
2,66991,198453,89,,2018-01-01
3,46791,83486,23,5.0,2018-01-01
4,79313,188770,88,5.0,2018-01-01


In [13]:
df['start_date'] = pd.to_datetime(df['start_date'])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1533078 entries, 0 to 1533077
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   user_id     1533078 non-null  int64         
 1   item_id     1533078 non-null  int64         
 2   progress    1533078 non-null  int64         
 3   rating      285356 non-null   float64       
 4   start_date  1533078 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 58.5 MB


## Duplicates

In [15]:
duplicates = df.duplicated(subset=['user_id', 'item_id'], keep=False)
duplicates.sum()

160

In [16]:
df_duplicates = df[duplicates].sort_values(by=['user_id', 'start_date'])
df = df[~duplicates]

In [17]:
df_duplicates = df_duplicates.groupby(['user_id', 'item_id']).agg({
    'progress': 'max',
    'rating': 'max',
    'start_date': 'min'
})
df_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 80 entries, (523, 49329) to (158041, 208145)
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   progress    80 non-null     int64         
 1   rating      35 non-null     float64       
 2   start_date  80 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 3.4 KB


In [18]:
df = df.append(df_duplicates.reset_index(), ignore_index=True)
df.info()

  df = df.append(df_duplicates.reset_index(), ignore_index=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532998 entries, 0 to 1532997
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   user_id     1532998 non-null  int64         
 1   item_id     1532998 non-null  int64         
 2   progress    1532998 non-null  int64         
 3   rating      285355 non-null   float64       
 4   start_date  1532998 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 58.5 MB


In [19]:
df.nunique()

user_id       151600
item_id        59599
progress         101
rating             5
start_date       730
dtype: int64

Memory saving

In [24]:
df_user_item = df[['user_id', 'item_id']].copy()

In [25]:
def num_bytes_format(num_bytes, float_prec=4):
    units = ['bytes', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb', 'Eb']
    for unit in units[:-1]:
        if abs(num_bytes) < 1000:
            return f'{num_bytes:.{float_prec}f} {unit}'
        num_bytes /= 1000
    return f'{num_bytes:.4f} {units[-1]}'

In [26]:
num_bytes_ints = df_user_item.memory_usage(deep=True).sum()
num_bytes_format(num_bytes_ints)

'24.5281 Mb'

In [27]:
df_user_item = df_user_item.astype('string')
num_bytes_str = df_user_item.memory_usage(deep=True).sum()
num_bytes_format(num_bytes_str)

'191.5619 Mb'

In [28]:
df_user_item = df_user_item.astype('category')
num_bytes_cat = df_user_item.memory_usage(deep=True).sum()
num_bytes_format(num_bytes_cat)

'31.7843 Mb'

In [29]:
print(f'Экономия category относительно string: {(1 - num_bytes_cat / num_bytes_str) * 100:.2f}%')
print(f'Экономия ints относительно category: {(1 - num_bytes_ints / num_bytes_cat) * 100:.2f}%')

Экономия category относительно string: 83.41%
Экономия ints относительно category: 22.83%


In [30]:
df_user_item = df_user_item.astype(np.int64).astype('category')
num_bytes_int_cat = df_user_item.memory_usage(deep=True).sum()
num_bytes_format(num_bytes_int_cat)

'20.2944 Mb'

In [31]:
print(f'Экономия category on int64 относительно category on string: {(1 - num_bytes_int_cat / num_bytes_cat) * 100:.2f}%')

Экономия category on int64 относительно category on string: 36.15%


In [32]:
df_user_item['user_id'].cat.codes.dtype

dtype('int32')

# Integer DType

In [34]:
ratings = df['rating'].astype(np.float32).copy()

In [35]:
num_bytes_float = ratings.memory_usage(deep=True)
num_bytes_format(num_bytes_float)

'6.1321 Mb'

In [36]:
ratings = ratings.astype(pd.Int32Dtype())
num_bytes_Int32 = ratings.memory_usage(deep=True)
num_bytes_format(num_bytes_Int32)

'7.6651 Mb'

In [37]:
ratings = ratings.astype(pd.Int8Dtype())
num_bytes_Int8 = ratings.memory_usage(deep=True)
num_bytes_format(num_bytes_Int8)

'3.0661 Mb'

In [38]:
ratings

0          <NA>
1          <NA>
2          <NA>
3             5
4             5
           ... 
1532993    <NA>
1532994    <NA>
1532995       5
1532996       5
1532997       4
Name: rating, Length: 1532998, dtype: Int8

In [39]:
print(f'Экономия Int8DType относительно float64: {(1 - num_bytes_Int8 / num_bytes_float) * 100:.2f}%')

Экономия Int8DType относительно float64: 50.00%


# Sparse type
Sparse Type - тип данных в pandas для работы с разреженными данными.

Идея - храним только "известные" значения, остальное не храним и имитируем константой.

Сам тип создается на основе двух значений:
- dtype - тип сохраняемых значений
- fill_value - константа для пропущенных значений

In [40]:
sparse_type = pd.SparseDtype(np.float32, np.nan)
ratings = ratings.astype(np.float32).astype(sparse_type)

In [41]:
ratings

0          NaN
1          NaN
2          NaN
3          5.0
4          5.0
          ... 
1532993    NaN
1532994    NaN
1532995    5.0
1532996    5.0
1532997    4.0
Name: rating, Length: 1532998, dtype: Sparse[float32, nan]

In [42]:
num_bytes_sparse = ratings.memory_usage(deep=True)
num_bytes_format(num_bytes_sparse)

'2.2830 Mb'

In [43]:
print(f'Экономия sparse относительно Int8DType: {(1 - num_bytes_sparse / num_bytes_Int8) * 100:.2f}%')
print(f'Экономия sparse относительно float32: {(1 - num_bytes_sparse / num_bytes_float) * 100:.2f}%')

Экономия sparse относительно Int8DType: 25.54%
Экономия sparse относительно float32: 62.77%


In [44]:
ratings.sparse.density

0.18614179535785436