In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import os
import sys
import warnings
import fastparquet
from bokeh.plotting import figure, show, output_notebook
from datetime import datetime, date
import decimal
import time

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g.
# SettingWithCopyWarning) and deprecation notices raised by later cells --
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'fivethirtyeight' style sheet to all subsequent figures.
plt.style.use('fivethirtyeight')
# Configure Bokeh so that plots are rendered inline in the notebook.
output_notebook()

## Read data 

In [2]:
%%time

df = pd.read_parquet("joined.parquet.snappy")
# ddf = dd.from_pandas(df, npartitions=100)
df.head()

CPU times: user 3.93 s, sys: 2.8 s, total: 6.73 s
Wall time: 7.15 s


Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8


## Extract date

In [3]:
# Keep only the review id, its timestamp and the three vote counts.
# An explicit copy is taken because a later cell adds columns to this frame in
# place; without it pandas treats the frame as derived from `df` and raises
# SettingWithCopyWarning on those assignments.
review_date = df[["r_id","r_date","r_useful","r_funny", "r_cool"]].copy()
review_date

Unnamed: 0,r_id,r_date,r_useful,r_funny,r_cool
0,4,2015-01-04 00:01:03,1,0,1
1,9,2016-03-30 22:46:33,1,1,0
2,11,2015-06-21 14:48:06,2,0,0
3,18,2014-08-10 19:41:43,1,0,0
4,19,2016-03-07 00:02:18,2,0,1
...,...,...,...,...,...
2576650,6990276,2014-12-17 21:45:20,1,2,1
2576651,6990277,2021-03-31 16:55:10,2,1,2
2576652,6990278,2019-12-30 03:56:30,1,0,0
2576653,6990279,2022-01-19 18:59:27,1,0,0


In [4]:
# Most recent review timestamp in the data.
review_date.r_date.max()

Timestamp('2022-01-19 19:21:03')

In [5]:
# Earliest review timestamp in the data.
review_date.r_date.min()

Timestamp('2005-03-02 04:53:42')

## Normalize votes: each count divided by the number of months elapsed since the review

In [20]:
def test(review_date):
    
    end_date = max(review_date.r_date)
    # print((end_date- review_date))
    # print(((end_date- review_date.r_date) / np.timedelta64(1, 'M')).apply(np.ceil))
    
    # Max date - current date
    review_date["r_month_diff"] = ((end_date- review_date.r_date) / np.timedelta64(1, 'M')+0.000000001).apply(np.ceil)
    
    # Rating / month diff
    review_date['r_useful_nor'] = review_date['r_useful']/review_date['r_month_diff']
    review_date['r_funny_nor'] = review_date['r_funny']/review_date['r_month_diff']
    review_date['r_cool_nor'] = review_date['r_cool']/review_date['r_month_diff']

# Adds r_month_diff and the three *_nor columns to review_date in place,
# then displays the augmented frame.
test(review_date)
review_date

Unnamed: 0,r_id,r_date,r_useful,r_funny,r_cool,r_month_diff,r_useful_nor,r_funny_nor,r_cool_nor
0,4,2015-01-04 00:01:03,1,0,1,85.0,0.011765,0.000000,0.011765
1,9,2016-03-30 22:46:33,1,1,0,70.0,0.014286,0.014286,0.000000
2,11,2015-06-21 14:48:06,2,0,0,79.0,0.025316,0.000000,0.000000
3,18,2014-08-10 19:41:43,1,0,0,90.0,0.011111,0.000000,0.000000
4,19,2016-03-07 00:02:18,2,0,1,71.0,0.028169,0.000000,0.014085
...,...,...,...,...,...,...,...,...,...
2576650,6990276,2014-12-17 21:45:20,1,2,1,86.0,0.011628,0.023256,0.011628
2576651,6990277,2021-03-31 16:55:10,2,1,2,10.0,0.200000,0.100000,0.200000
2576652,6990278,2019-12-30 03:56:30,1,0,0,25.0,0.040000,0.000000,0.000000
2576653,6990279,2022-01-19 18:59:27,1,0,0,1.0,1.000000,0.000000,0.000000


In [7]:
# Keep the review id together with the three age-normalised vote columns.
final_data = review_date.loc[:, ["r_id", "r_useful_nor", "r_funny_nor", "r_cool_nor"]]
final_data

Unnamed: 0,r_id,r_useful_nor,r_funny_nor,r_cool_nor
0,4,0.011765,0.000000,0.011765
1,9,0.014286,0.014286,0.000000
2,11,0.025316,0.000000,0.000000
3,18,0.011111,0.000000,0.000000
4,19,0.028169,0.000000,0.014085
...,...,...,...,...
2576650,6990276,0.011628,0.023256,0.011628
2576651,6990277,0.200000,0.100000,0.200000
2576652,6990278,0.040000,0.000000,0.000000
2576653,6990279,1.000000,0.000000,0.000000


### Print summary

In [16]:
# One (column, header line) pair per normalised vote type. The header strings
# are printed verbatim above each row of statistics.
summary_headers = (
    ("r_useful_nor", "Mean.    SD       Median   Min      Max     Useful"),
    ("r_funny_nor", "Mean.    SD     Median Min     Max          Funny"),
    ("r_cool_nor", "Mean.    SD       Median   Min  Max         Cool"),
)

for column, header in summary_headers:
    values = final_data[column]
    print(header)
    # Series.min()/.max() are vectorised and skip NaN; the builtin min()/max()
    # iterate over every row in Python and give order-dependent results if a
    # NaN is present. Mean, SD, median and min are rounded to 6 decimals; the
    # max is printed unrounded.
    print(
        round(float(values.mean()), 6),
        round(float(values.std()), 6),
        round(float(values.median()), 6),
        round(float(values.min()), 6),
        float(values.max()),
    )

Mean.    SD       Median   Min      Max     Useful
0.094083 0.519241 0.032258 -0.027027 111.0
Mean.    SD     Median Min     Max          Funny
0.021494 0.217363 0.0 -0.019231 82.0
Mean.    SD       Median   Min  Max         Cool
0.046275 0.448567 0.0 -0.019231 112.0


## Verify the negative results

In [46]:
# Count reviews whose month difference is 0, to rule out the divisor as the
# source of the odd values. This checks the r_month_diff column that test()
# stored (the divisor actually used) rather than recomputing it with a
# different epsilon, and uses the vectorised Series.sum() instead of the
# builtin sum().
int((review_date["r_month_diff"] == 0).sum())

0

In [47]:
# Show the rows whose normalised "useful" score is below zero.
print(final_data.loc[final_data["r_useful_nor"] < 0])

           r_id  r_useful_nor  r_funny_nor  r_cool_nor
695010  1933675     -0.027027     0.000000    0.000000
728705  2008202     -0.013699     0.000000    0.000000
746119  2046382     -0.016393     0.000000    0.000000
810372  2201841     -0.019231    -0.019231   -0.019231


In [48]:
# Print the full source row for review id "1933675" (compared as a string).
print(df.loc[df["r_id"] == "1933675"])

           r_id   b_id    u_id  r_stars              r_date  \
695010  1933675  34127  829922        1 2018-12-23 01:35:31   

                                                   r_text  r_useful  r_funny  \
695010  This complaint has nothing to do with the qual...        -1        0   

        r_cool  b_stars  b_review_count  u_review_count     u_yelping_since  \
695010       0      2.0               6               2 2016-04-26 06:25:36   

        u_friends_count  
695010               96  
