In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy review log used to exercise the split functions: ten reviews spread
# over four products (A-D), with Timestamp running 1..10 in row order.
data = pd.DataFrame(
    {
        'asin': list('ABACBACDDB'),
        'reviewText': 'good bad good ok good excellent ok bad excellent good'.split(),
        'rating': [5, 2, 4, 3, 5, 5, 3, 1, 5, 4],
        'Timestamp': list(range(1, 11)),
    }
)
data



Unnamed: 0,asin,reviewText,rating,Timestamp
0,A,good,5,1
1,B,bad,2,2
2,A,good,4,3
3,C,ok,3,4
4,B,good,5,5
5,A,excellent,5,6
6,C,ok,3,7
7,D,bad,1,8
8,D,excellent,5,9
9,B,good,4,10


In [7]:

# Notebook-level split configuration. The values equal the default
# `test_ratio` / `seed` arguments of the split functions defined below.
# NOTE(review): presumably intended to be passed as `test_ratio=` and `seed=`
# to those functions -- confirm against the calls further down.
TEST_RATIO = 0.2  # share of rows held out as test data
RANDOM_STATE = 2002  # seed value for the shuffled (random) split

# Define the Random Popularity Split function
def popularity_based_random_split(data,
                                  item_column='asin',
                                  review_column='reviewText',
                                  rating_column='rating',
                                  test_ratio=0.2,
                                  seed=2002,
                                  test_set_type='both',
                                  popular_ratio=0.2):
    """Randomly split reviews into train/test, optionally filtering the test set by item popularity.

    Rows missing the item, review, or rating value are dropped first. Each item
    then gets a popularity score computed on the full (pre-split) data:

        0.5 * review_count + 0.5 * mean_rating * (max_review_count / max_mean_rating)

    i.e. the mean rating is rescaled to the review-count range so both terms
    carry equal weight. The top ``popular_ratio`` share of items by score is
    the "popular" set.

    Parameters
    ----------
    data : pandas.DataFrame
        Review table containing at least the three named columns.
    item_column, review_column, rating_column : str
        Column names for the item id, review text, and numeric rating.
    test_ratio : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int
        Forwarded to ``train_test_split`` as ``random_state``.
    test_set_type : str
        ``'popular'`` keeps only test rows of popular items, ``'unpopular'``
        keeps only the others; any other value (default ``'both'``) returns
        all test rows, popular ones first.
    popular_ratio : float
        Share of distinct items treated as popular (default 0.2). When the
        ratio is positive and there is at least one item, at least one item is
        always selected.

    Returns
    -------
    (train_data, test_set) : tuple of pandas.DataFrame
        ``train_data`` is never filtered by popularity; only ``test_set`` is.
    """
    data = data.dropna(subset=[item_column, review_column, rating_column])

    item_counts = data[item_column].value_counts()
    average_ratings = data.groupby(item_column)[rating_column].mean()
    popularity_score = (item_counts * 0.5) + (average_ratings * 0.5 * item_counts.max() / average_ratings.max())
    popularity_score = popularity_score.sort_values(ascending=False)

    # int() truncates toward zero, so a catalogue with fewer than 1/popular_ratio
    # items used to get an empty popular set (making the 'popular' test set
    # always empty). Guarantee at least one popular item in that case.
    n_popular = int(len(popularity_score) * popular_ratio)
    if popular_ratio > 0 and len(popularity_score) > 0:
        n_popular = max(1, n_popular)
    popular_items = popularity_score.head(n_popular).index

    train_data, test_data = train_test_split(data, test_size=test_ratio, random_state=seed)

    # Computed once and reused by every branch below.
    is_popular = test_data[item_column].isin(popular_items)

    if test_set_type == 'popular':
        test_set = test_data[is_popular]
    elif test_set_type == 'unpopular':
        test_set = test_data[~is_popular]
    else:
        # All test rows, ordered popular-first then unpopular.
        test_set = pd.concat([test_data[is_popular], test_data[~is_popular]])

    return train_data, test_set

# Define the Sequential Popularity Split function
def popularity_based_sequential_split(data,
                                      item_column='asin',
                                      review_column='reviewText',
                                      rating_column='rating',
                                      time_column='Timestamp',
                                      test_ratio=0.2,
                                      test_set_type='both',
                                      popular_ratio=0.2):
    """Chronologically split reviews into train/test, optionally filtering the test set by item popularity.

    Rows missing the item, review, or rating value are dropped first. The
    remaining rows are ordered by ``time_column`` (ascending, stable), the
    earliest ``1 - test_ratio`` share becomes the training set and the latest
    rows become the test candidates.

    Each item gets a popularity score computed on the full (pre-split) data:

        0.5 * review_count + 0.5 * mean_rating * (max_review_count / max_mean_rating)

    The top ``popular_ratio`` share of items by score is the "popular" set.

    Parameters
    ----------
    data : pandas.DataFrame
        Review table containing at least the item, review, and rating columns.
    item_column, review_column, rating_column : str
        Column names for the item id, review text, and numeric rating.
    time_column : str or None
        Column to order rows by before splitting. If it is ``None`` or not
        present in ``data``, the existing row order is used as-is. Rows whose
        time value is missing sort last (pandas default) and therefore fall on
        the test side.
    test_ratio : float
        Share of rows (after dropping incomplete ones) held out for testing.
    test_set_type : str
        ``'popular'`` keeps only test rows of popular items, ``'unpopular'``
        keeps only the others; any other value (default ``'both'``) returns
        all test rows, popular ones first.
    popular_ratio : float
        Share of distinct items treated as popular (default 0.2). When the
        ratio is positive and there is at least one item, at least one item is
        always selected.

    Returns
    -------
    (train_data, test_set) : tuple of pandas.DataFrame
        ``train_data`` is never filtered by popularity; only ``test_set`` is.
    """
    data = data.dropna(subset=[item_column, review_column, rating_column])

    # The split below is positional, so rows must be in chronological order
    # first; previously time_column was accepted but never applied. A stable
    # sort keeps the incoming order for rows that share a timestamp.
    if time_column is not None and time_column in data.columns:
        data = data.sort_values(time_column, kind='mergesort')

    item_counts = data[item_column].value_counts()
    average_ratings = data.groupby(item_column)[rating_column].mean()
    popularity_score = (item_counts * 0.5) + (average_ratings * 0.5 * item_counts.max() / average_ratings.max())
    popularity_score = popularity_score.sort_values(ascending=False)

    # int() truncates toward zero, so a catalogue with fewer than 1/popular_ratio
    # items used to get an empty popular set. Guarantee at least one popular item.
    n_popular = int(len(popularity_score) * popular_ratio)
    if popular_ratio > 0 and len(popularity_score) > 0:
        n_popular = max(1, n_popular)
    popular_items = popularity_score.head(n_popular).index

    # Earliest rows train, latest rows test.
    split_at = int(len(data) * (1 - test_ratio))
    train_data, test_data = data.iloc[:split_at], data.iloc[split_at:]

    # Computed once and reused by every branch below.
    is_popular = test_data[item_column].isin(popular_items)

    if test_set_type == 'popular':
        test_set = test_data[is_popular]
    elif test_set_type == 'unpopular':
        test_set = test_data[~is_popular]
    else:
        # All test rows, ordered popular-first then unpopular.
        test_set = pd.concat([test_data[is_popular], test_data[~is_popular]])

    return train_data, test_set
# Show the input table first so each split below can be compared against it.
print("Original Data")
print(data)
print("\n----------------------------------------")

# Perform Random Popularity Split.
# The notebook-level constants are passed explicitly so that editing
# TEST_RATIO / RANDOM_STATE actually changes the split (previously the calls
# silently relied on the functions' own defaults).
train_data_random, test_data_random = popularity_based_random_split(
    data,
    test_ratio=TEST_RATIO,
    seed=RANDOM_STATE,
)
print("Random Popularity Split - Train Data")
print(train_data_random)
print("\n----------------------------------------")
print("Random Popularity Split - Test Data")
print(test_data_random)
print("\n----------------------------------------")

# Perform Sequential Popularity Split (no seed argument: this split takes none).
train_data_sequential, test_data_sequential = popularity_based_sequential_split(
    data,
    test_ratio=TEST_RATIO,
)
print("\nSequential Popularity Split - Train Data")
print(train_data_sequential)
print("\n----------------------------------------")
print("Sequential Popularity Split - Test Data")
print(test_data_sequential)


Original Data
  asin reviewText  rating  Timestamp
0    A       good       5          1
1    B        bad       2          2
2    A       good       4          3
3    C         ok       3          4
4    B       good       5          5
5    A  excellent       5          6
6    C         ok       3          7
7    D        bad       1          8
8    D  excellent       5          9
9    B       good       4         10

----------------------------------------
Random Popularity Split - Train Data
  asin reviewText  rating  Timestamp
5    A  excellent       5          6
6    C         ok       3          7
7    D        bad       1          8
4    B       good       5          5
9    B       good       4         10
0    A       good       5          1
2    A       good       4          3
1    B        bad       2          2

----------------------------------------
Random Popularity Split - Test Data
  asin reviewText  rating  Timestamp
8    D  excellent       5          9
3    C         