In [2]:
import pandas as pd

In [3]:
# Load the review table. Because `names` is given, pandas does not treat the
# first line of the file as a header and applies these labels instead.
# NOTE(review): if the file carries one more column than the four names listed
# here, pandas uses the leading column as the index -- confirm against the file.
reviews = pd.read_csv(
    'fnl_tempe_with_date.csv',
    names=['business_id', 'date', 'stars', 'user_id'],
)
reviews.head()

Unnamed: 0,business_id,date,stars,user_id
0,UBv8heCQR0RPnUQG0zkXIQ,2016-09-23,1,NhOc64RsrTT1Dls50yYW8g
1,hdgYnadxg0GANhWOJabr2g,2014-08-23,5,NhOc64RsrTT1Dls50yYW8g
2,VfX7rhtX03yNg56ISz8gVQ,2016-06-08,4,0pf5VuzE4_1pwj5NJHG5TQ
3,TT4XW2WMG0PNyUIFBIINuA,2015-10-18,1,wn5sc78bQn-FpuR2u0rcCQ
4,x2NuIlQgvvl3Wtq8ipZC6Q,2015-10-23,5,jjSR3JpsYKIgMUSR5zHRsQ


In [4]:
# Calendar year of each review: the first four characters of the date string.
# The year is deliberately left as a string.
reviews['year'] = reviews['date'].str[:4]

# Number of distinct years in which each business received at least one review.
# Businesses seen in a single year only are filtered out here, so they get NaN
# in `years_opened` after the join below.
years_business = (
    reviews[['business_id', 'year']]
    .drop_duplicates()
    .groupby('business_id')
    .count()
)
years_business = years_business[years_business['year'] > 1]
years_business.columns = ['years_opened']

# Drop any `years_opened` column left by a previous execution of this cell;
# otherwise re-running it fails with "columns overlap but no suffix specified".
reviews = (
    reviews
    .drop(columns=['years_opened'], errors='ignore')
    .join(years_business, on='business_id')
)
reviews.head()

Unnamed: 0,business_id,date,stars,user_id,year,years_opened
0,UBv8heCQR0RPnUQG0zkXIQ,2016-09-23,1,NhOc64RsrTT1Dls50yYW8g,2016,2.0
1,hdgYnadxg0GANhWOJabr2g,2014-08-23,5,NhOc64RsrTT1Dls50yYW8g,2014,8.0
2,VfX7rhtX03yNg56ISz8gVQ,2016-06-08,4,0pf5VuzE4_1pwj5NJHG5TQ,2016,12.0
3,TT4XW2WMG0PNyUIFBIINuA,2015-10-18,1,wn5sc78bQn-FpuR2u0rcCQ,2015,13.0
4,x2NuIlQgvvl3Wtq8ipZC6Q,2015-10-23,5,jjSR3JpsYKIgMUSR5zHRsQ,2015,8.0


In [5]:
# Latest review year per business. `year` is a string column, so 'max' is a
# lexicographic maximum.
# NOTE(review): this equals the chronological maximum only while every year is
# a same-width digit string -- confirm for the full data set.
last_year_business = (
    reviews
    .groupby('business_id')
    .agg({'year': 'max'})
    .rename(columns={'year': 'last_year'})
)

# Drop any `last_year` column left by a previous execution of this cell so the
# join does not fail on an overlapping column name when the cell is re-run.
reviews = (
    reviews
    .drop(columns=['last_year'], errors='ignore')
    .join(last_year_business, on='business_id')
)
reviews.head()

Unnamed: 0,business_id,date,stars,user_id,year,years_opened,last_year
0,UBv8heCQR0RPnUQG0zkXIQ,2016-09-23,1,NhOc64RsrTT1Dls50yYW8g,2016,2.0,2017
1,hdgYnadxg0GANhWOJabr2g,2014-08-23,5,NhOc64RsrTT1Dls50yYW8g,2014,8.0,2018
2,VfX7rhtX03yNg56ISz8gVQ,2016-06-08,4,0pf5VuzE4_1pwj5NJHG5TQ,2016,12.0,2018
3,TT4XW2WMG0PNyUIFBIINuA,2015-10-18,1,wn5sc78bQn-FpuR2u0rcCQ,2015,13.0,2018
4,x2NuIlQgvvl3Wtq8ipZC6Q,2015-10-23,5,jjSR3JpsYKIgMUSR5zHRsQ,2015,8.0,2018


In [6]:
# One row per (business, year): mean star rating, number of reviews, and the
# business's last review year. Aggregating with a dict of lists yields
# two-level column labels, e.g. ('stars', 'mean') and ('last_year', 'max').
avg_rtng = (
    reviews
    .groupby(['business_id', 'year'])[['stars', 'last_year']]
    .agg({'stars': ['mean', 'count'], 'last_year': 'max'})
    .reset_index()
)

# A business-year is flagged successful (suc = 1) when it has more than two
# reviews and a mean rating of at least 4; every other row gets suc = 0.
mask = avg_rtng['stars']['count'].gt(2) & avg_rtng['stars']['mean'].ge(4)
avg_rtng.loc[:, 'suc'] = 0
avg_rtng.loc[mask, 'suc'] = 1
avg_rtng.head()

Unnamed: 0_level_0,business_id,year,last_year,stars,stars,suc
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,mean,count,Unnamed: 6_level_1
0,--9QQLMTbFzLJ_oT-ON3Xw,2013,2018,4.5,2,0
1,--9QQLMTbFzLJ_oT-ON3Xw,2014,2018,5.0,1,0
2,--9QQLMTbFzLJ_oT-ON3Xw,2015,2018,5.0,1,0
3,--9QQLMTbFzLJ_oT-ON3Xw,2016,2018,2.25,4,0
4,--9QQLMTbFzLJ_oT-ON3Xw,2017,2018,3.0,2,0


In [7]:
def count_transitions(sucs):
    """Count the transitions between consecutive success flags.

    Each element is paired with the one that follows it, in the order given.

    Parameters
    ----------
    sucs : sequence or pandas.Series of 0/1 flags
        Success flags of one business, one per row of ``avg_rtng``.

    Returns
    -------
    dict
        Counts keyed by transition kind, where ``n`` stands for flag 0 and
        ``p`` for flag 1: ``'nn'`` (0 -> 0), ``'np'`` (0 -> 1),
        ``'pn'`` (1 -> 0) and ``'pp'`` (1 -> 1). A sequence with fewer than
        two elements yields all zeros.
    """
    counts = {'nn': 0, 'np': 0, 'pn': 0, 'pp': 0}
    for prev, nxt in zip(sucs[:-1], sucs[1:]):
        unchanged = prev == nxt
        if prev == 0:
            counts['nn' if unchanged else 'np'] += 1
        elif prev == 1:
            counts['pp' if unchanged else 'pn'] += 1
    return counts


# One row per (business_id, transition kind). The helper has its own name so
# that binding the result to `transitions` does not overwrite the function.
# NOTE(review): avg_rtng only has rows for years with at least one review, so
# two "consecutive" flags may be more than one calendar year apart.
transitions = (
    avg_rtng
    .groupby('business_id')['suc']
    .apply(count_transitions)
    .to_frame()
    .reset_index()
)

# Total number of each transition kind across all businesses. Only `suc` is
# summed so the string `business_id` column is never aggregated.
transitions.groupby('level_1')[['suc']].sum()

Unnamed: 0_level_0,suc
level_1,Unnamed: 1_level_1
nn,9752
np,2061
pn,1981
pp,2593


In [9]:
# Label of each business = its success flag in the last year it was reviewed.
is_last_year = avg_rtng['last_year']['max'] == avg_rtng['year']
labels = avg_rtng[is_last_year][['business_id', 'suc']]

# avg_rtng carries two-level column labels; written as-is, the CSV would get a
# second header line made of empty strings. Keep only the top level so the
# file has the single header row `business_id,suc`.
labels.columns = labels.columns.get_level_values(0)
labels.to_csv("business_labels.csv", index=False)