In [1]:
import pandas as pd
# Load the training reviews; the feature cells below all derive from this frame.
reviews = pd.read_csv("yelp_academic_dataset_reviews_for_training.csv")

# Count the non-null `star` values per business (count() skips NaN entries).
stars_by_business = reviews.groupby("business_id")["star"]
num_prev_reviews = stars_by_business.count()
num_prev_reviews.head(10)

business_id
--9QQLMTbFzLJ_oT-ON3Xw     10
--ab39IjZR_xUf81WyTyHg     10
-0Sgh0QlUKVsWosCWJzGqQ     28
-2A9emZwBK8zYHPnAmM0hw      5
-2MFmbCTycnU4uPJ7jXfcA      2
-2X9U7v-Avoib-ki0y85bA     37
-4g68Hwm892_KPUuW5g1_Q     40
-4kOf3wcZp5bvxORgsW1gA     10
-82Z0wTA-nOCLUVD4XXIBA     30
-8QlV3b_9H4BAh6LgMIr1g    440
Name: star, dtype: int64

In [3]:
# Average number of reviews per year for each business:
#   1. count the non-null `star` values in every (business_id, year) pair,
#   2. average those yearly counts per business.
# Only years that appear in `reviews` for a business enter the average.
avg_review_count = (
    reviews
    .groupby(["business_id", "year"])[["star"]]
    .count()
    .reset_index()
    .groupby("business_id")["star"]
    .mean()
)
avg_review_count.head(10)

business_id
--9QQLMTbFzLJ_oT-ON3Xw      2.000000
--ab39IjZR_xUf81WyTyHg      2.000000
-0Sgh0QlUKVsWosCWJzGqQ      4.666667
-2A9emZwBK8zYHPnAmM0hw      2.500000
-2MFmbCTycnU4uPJ7jXfcA      1.000000
-2X9U7v-Avoib-ki0y85bA      6.166667
-4g68Hwm892_KPUuW5g1_Q      5.714286
-4kOf3wcZp5bvxORgsW1gA      2.000000
-82Z0wTA-nOCLUVD4XXIBA      3.000000
-8QlV3b_9H4BAh6LgMIr1g    110.000000
Name: star, dtype: float64

In [4]:
# Mean `star` value per business, taken over all of that business's reviews.
avg_rating = reviews["star"].groupby(reviews["business_id"]).mean()
avg_rating.head(10)

business_id
--9QQLMTbFzLJ_oT-ON3Xw    3.400000
--ab39IjZR_xUf81WyTyHg    4.200000
-0Sgh0QlUKVsWosCWJzGqQ    2.285714
-2A9emZwBK8zYHPnAmM0hw    2.200000
-2MFmbCTycnU4uPJ7jXfcA    5.000000
-2X9U7v-Avoib-ki0y85bA    4.594595
-4g68Hwm892_KPUuW5g1_Q    2.675000
-4kOf3wcZp5bvxORgsW1gA    3.500000
-82Z0wTA-nOCLUVD4XXIBA    3.166667
-8QlV3b_9H4BAh6LgMIr1g    4.231818
Name: star, dtype: float64

In [5]:
# Per-(business, year) review volume and mean `star` value.
ratings_count = reviews.groupby(["business_id", "year"]).agg({'star': ['count', 'mean']})

# A business-year is flagged as successful when it has more than 2 reviews
# and a mean rating of at least 4. The comparison is vectorised rather than a
# row-wise `apply` with a Python lambda: same 0/1 flags, no per-row Python
# call, and it still yields a Series when the frame is empty.
is_successful = (ratings_count[('star', 'count')] > 2) & (ratings_count[('star', 'mean')] >= 4)

# Long-format table with columns business_id, year, success (0/1 as int64).
successful_business_count = is_successful.astype("int64").rename("success").reset_index()
successful_business_count.columns = ["business_id", "year", "success"]

# Number of successful years per business.
succ_count = successful_business_count.groupby("business_id")["success"].sum()
succ_count.head(10)

business_id
--9QQLMTbFzLJ_oT-ON3Xw    0
--ab39IjZR_xUf81WyTyHg    1
-0Sgh0QlUKVsWosCWJzGqQ    0
-2A9emZwBK8zYHPnAmM0hw    0
-2MFmbCTycnU4uPJ7jXfcA    0
-2X9U7v-Avoib-ki0y85bA    4
-4g68Hwm892_KPUuW5g1_Q    0
-4kOf3wcZp5bvxORgsW1gA    0
-82Z0wTA-nOCLUVD4XXIBA    1
-8QlV3b_9H4BAh6LgMIr1g    4
Name: success, dtype: int64

In [7]:
# Success flag of the latest year present for each business.
# One global sort by year followed by `groupby(...).last()` replaces the
# per-group `apply` that sorted every group inside a Python lambda; it gives
# the same values without the per-group overhead or the pandas >= 2.2 warning
# about `apply` operating on the grouping columns.
# `last()` skips NaN, which is harmless here because `success` is always 0 or 1.
successful_last_year = (
    successful_business_count
    .sort_values("year")
    .groupby("business_id")["success"]
    .last()
    .rename(None)  # the original apply() result was an unnamed Series
)
successful_last_year.head(10)

business_id
--9QQLMTbFzLJ_oT-ON3Xw    0
--ab39IjZR_xUf81WyTyHg    0
-0Sgh0QlUKVsWosCWJzGqQ    0
-2A9emZwBK8zYHPnAmM0hw    0
-2MFmbCTycnU4uPJ7jXfcA    0
-2X9U7v-Avoib-ki0y85bA    1
-4g68Hwm892_KPUuW5g1_Q    0
-4kOf3wcZp5bvxORgsW1gA    0
-82Z0wTA-nOCLUVD4XXIBA    0
-8QlV3b_9H4BAh6LgMIr1g    1
dtype: int64

In [8]:
# Combine the per-business feature Series column-wise, aligned on business_id.
# `axis` must be passed by keyword: the positional form `pd.concat(objs, 1)`
# was deprecated in pandas 1.x and raises TypeError from pandas 2.0 onwards.
merged = pd.concat(
    [num_prev_reviews, avg_review_count, avg_rating, succ_count, successful_last_year],
    axis=1,
)
# Column names follow the order of the Series in the list above.
merged.columns = [
    "no_previous_reviews",
    "average_number_reviews_per_year",
    "average_rating",
    "no_years_success",
    "last_year_success",
]

# Persist the feature table; business_id is written as the index column.
merged.to_csv("reviews_features.csv")