## Exercise 01 - Basic operations


In [5]:
import pandas as pd


## Load views dataframe


In [6]:
# Read the headerless, tab-separated log; each line holds a timestamp and a user name.
# The separator is spelled as the escape sequence '\t' instead of a literal tab
# character: a literal tab is invisible in the source and is easily converted to
# spaces by editors and formatters, which would silently break the parsing.
views = pd.read_csv(
    '../data/feed-views.log',
    sep='\t',
    header=None,
    names=['datetime', 'user'],
)

# The timestamps arrive as strings; convert them so the .dt accessor can be used.
views['datetime'] = pd.to_datetime(views['datetime'])

views.head()


Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem


## Extract datetime components and daytime


In [7]:
# Break the timestamp into one integer column per calendar/clock component.
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    views[component] = getattr(views['datetime'].dt, component)

# Hour edges and their names. With right=False the intervals are left-closed:
# [0, 4), [4, 7), [7, 11), [11, 17), [17, 20), [20, 24).
labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']
bins = [0, 4, 7, 11, 17, 20, 24]

views['daytime'] = pd.cut(
    views['hour'], bins, right=False, labels=labels, include_lowest=True
)

# From here on the user name is the row index rather than a regular column.
views = views.set_index('user')

views.head()


Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon


## Counts and value_counts


In [8]:
# How often each daytime label occurs, most frequent first.
daytime_counts = views['daytime'].value_counts()

# Non-null entries: for a single column, then for every column at once.
n_elements = views['datetime'].count()
count_per_column = views.count()

count_per_column, n_elements, daytime_counts


(datetime    1076
 year        1076
 month       1076
 day         1076
 hour        1076
 minute      1076
 second      1076
 daytime     1076
 dtype: int64,
 np.int64(1076),
 daytime
 evening          509
 afternoon        252
 early evening    145
 night            129
 morning           36
 early morning      5
 Name: count, dtype: int64)

## Sorting and aggregations


In [9]:
# Order rows by clock time only (hour, then minute, then second); the date is ignored.
clock_columns = ['hour', 'minute', 'second']
views = views.sort_values(clock_columns)

views.head()


Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night


In [10]:
hours = views['hour']
daytimes = views['daytime']

# Overall range of hours and the most frequent daytime label.
min_hour = hours.min()
max_hour = hours.max()
daytime_mode = daytimes.mode()

# Latest hour still labelled 'night'.
is_night = daytimes == 'night'
max_hour_night = hours[is_night].max()

# Earliest hour labelled 'morning', and the rows that fall on that hour.
is_morning = daytimes == 'morning'
min_hour_morning = hours[is_morning].min()
morning_min_rows = views[is_morning & (hours == min_hour_morning)]

# The index holds user names; take the first one, or None when nothing matched.
if morning_min_rows.empty:
    example_morning_user = None
else:
    example_morning_user = morning_min_rows.index[0]

# Most frequent hour, followed by the daytime mode computed a second time.
hour_mode = hours.mode()
daytime_mode_again = daytimes.mode()

(min_hour, max_hour, daytime_mode, max_hour_night,
 min_hour_morning, example_morning_user, hour_mode, daytime_mode_again)


(np.int32(0),
 np.int32(23),
 0    evening
 Name: daytime, dtype: category
 Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening'],
 np.int32(3),
 np.int32(8),
 'alexander',
 0    22
 Name: hour, dtype: int32,
 0    evening
 Name: daytime, dtype: category
 Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening'])

In [11]:
# Columns used for ranking, and the columns kept in the result.
rank_by = ['hour', 'minute', 'second']
shown = rank_by + ['daytime']

# Three earliest and three latest clock times; reset_index turns the user
# index back into an ordinary column.
earliest_hours = (
    views
    .nsmallest(n=3, columns=rank_by)[shown]
    .reset_index()
)
latest_hours = (
    views
    .nlargest(n=3, columns=rank_by)[shown]
    .reset_index()
)

earliest_hours, latest_hours


(        user  hour  minute  second daytime
 0  valentina     0       0      13   night
 1  valentina     0       1       5   night
 2      pavel     0       1      27   night,
         user  hour  minute  second  daytime
 0  alexander    23      59      38  evening
 1  valentina    23      58      56  evening
 2      pavel    23      54      54  evening)

In [12]:
# Summary statistics for the numeric and datetime columns.
stats = views.describe()

# Interquartile range of the hour: third quartile minus first quartile.
hour_summary = stats['hour']
iqr = hour_summary['75%'] - hour_summary['25%']

stats, iqr


(                            datetime    year        month          day  \
 count                           1076  1076.0  1076.000000  1076.000000   
 mean   2020-05-10 09:00:41.211420672  2020.0     4.870818    13.552974   
 min       2020-04-17 12:01:08.463179  2020.0     4.000000     1.000000   
 25%       2020-05-10 01:13:49.857472  2020.0     5.000000    11.000000   
 50%    2020-05-11 22:48:35.302552832  2020.0     5.000000    13.000000   
 75%    2020-05-14 14:44:34.749530624  2020.0     5.000000    15.000000   
 max       2020-05-22 10:36:14.662600  2020.0     5.000000    30.000000   
 std                              NaN     0.0     0.335557     4.906567   
 
               hour       minute       second  
 count  1076.000000  1076.000000  1076.000000  
 mean     16.249071    29.629182    29.500929  
 min       0.000000     0.000000     0.000000  
 25%      13.000000    14.000000    14.000000  
 50%      19.000000    29.000000    30.000000  
 75%      22.000000    46.000000   