In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os

# --- Configuration ---

# == For CF Test Data Generation ==
CF_RATINGS_CSV_PATH = 'cleaned_merged.csv'
CF_USER_COL = 'UserId'
CF_ITEM_COL = 'ProductId'
CF_RATING_COL = 'Score'
# These splits and random state MUST match those used in your train_cf.ipynb
CF_TEST_SPLIT_SIZE = 0.2
CF_VALIDATION_SPLIT_SIZE = 0.2 # This is validation split from the (1 - TEST_SPLIT_SIZE) part
CF_RANDOM_STATE = 42
CF_OUTPUT_CSV_PATH = 'cf_test_data.csv'

# == For CBF Product Sentiment Test Data Generation ==
CBF_INPUT_CSV_PATH = 'cleaned_merged.csv' 
CBF_PRODUCT_ID_COLUMN = 'ProductId'
CBF_REVIEW_SCORE_COLUMN = 'Score'
CBF_TEXT_COLUMN_FOR_PRODUCT = 'CleanedSummary'
CBF_TEST_SET_SIZE = 0.20  # 20% of unique products for the test set
CBF_RANDOM_STATE = 42     # For reproducibility
CBF_OUTPUT_CSV_PATH = 'cbf_product_sentiment_test.csv'

# == For App Data Generation (if you've added this cell) ==
APP_SOURCE_DATA_PATH = 'cleaned_merged.csv' # Use cleaned_merged.csv
APP_PRODUCT_TEXT_COLUMN = 'CleanedSummary' # This is key for app_data.csv

print("Configuration set.")
print(f"Current working directory: {os.getcwd()}")
print(f"Will try to load CF data from: {os.path.abspath(CF_RATINGS_CSV_PATH)}")
print(f"Will try to load CBF data from: {os.path.abspath(CBF_INPUT_CSV_PATH)}")

Configuration set.
Current working directory: c:\Users\mochi\OneDrive\Documents\MMU\Bachelors in Computer Science\FYP\code\code
Will try to load CF data from: c:\Users\mochi\OneDrive\Documents\MMU\Bachelors in Computer Science\FYP\code\code\cleaned_merged.csv
Will try to load CBF data from: c:\Users\mochi\OneDrive\Documents\MMU\Bachelors in Computer Science\FYP\code\code\cleaned_merged.csv


In [2]:
def generate_cf_test_data():
    print(f"\n--- Generating CF Test Data ({CF_OUTPUT_CSV_PATH}) ---")
    print(f"Loading CF ratings data from '{CF_RATINGS_CSV_PATH}'...")
    try:
        ratings_df = pd.read_csv(CF_RATINGS_CSV_PATH, usecols=[CF_USER_COL, CF_ITEM_COL, CF_RATING_COL])
        ratings_df.dropna(subset=[CF_USER_COL, CF_ITEM_COL, CF_RATING_COL], inplace=True)
        print(f"Loaded {len(ratings_df)} valid ratings.")
    except FileNotFoundError:
        print(f"Error: CF ratings file not found at '{CF_RATINGS_CSV_PATH}'. Cannot generate {CF_OUTPUT_CSV_PATH}.")
        return
    except Exception as e:
        print(f"Error loading CF ratings CSV: {e}")
        return

    if ratings_df.empty:
        print("Error: No data loaded from CF ratings file. Cannot generate test set.")
        return

    print(f"Splitting data: Test size={CF_TEST_SPLIT_SIZE}, Random state={CF_RANDOM_STATE}")

    # First split: Separate test set (this matches train_cf.ipynb logic)
    train_val_df, test_df = train_test_split(
        ratings_df,
        test_size=CF_TEST_SPLIT_SIZE,
        random_state=CF_RANDOM_STATE,
        # Stratification can be helpful if your ratings are imbalanced.
        # Consider adding stratify=ratings_df[CF_RATING_COL] if appropriate and matches train_cf.ipynb
    )

    if test_df.empty:
        print("Error: Generated CF test set is empty. Check data or split sizes.")
        return

    try:
        test_df.to_csv(CF_OUTPUT_CSV_PATH, index=False)
        print(f"Successfully created '{CF_OUTPUT_CSV_PATH}' with {len(test_df)} test samples.")
        print("Columns in output file:", list(test_df.columns))
        print("Sample of the CF test data:")
        print(test_df.head())
    except Exception as e:
        print(f"Error writing CF output CSV '{CF_OUTPUT_CSV_PATH}': {e}")

# Call the function
generate_cf_test_data()


--- Generating CF Test Data (cf_test_data.csv) ---
Loading CF ratings data from 'cleaned_merged.csv'...
Loaded 393560 valid ratings.
Splitting data: Test size=0.2, Random state=42
Successfully created 'cf_test_data.csv' with 78712 test samples.
Columns in output file: ['ProductId', 'UserId', 'Score']
Sample of the CF test data:
         ProductId          UserId  Score
114304  B000FKMNT6   AZ9YYDM4KCDI7      5
393085  B0006342ZU  A3RHY0HW2NTJXX      5
294769  B000261PI8  A1RHY943LNIPAO      5
226555  B005HG9ESG  A3RR2P5IS3DGPR      5
297501  B002NWIQQI  A382BBWTZMMH4Z      1


In [3]:
def assign_cbf_sentiment_label(avg_score):
    # Assigns a sentiment label based on the average score for CBF model.
    # 0: Negative (scores 1-2)
    # 1: Neutral (score 3)
    # 2: Positive (scores 4-5)
    if pd.isna(avg_score):
        return np.nan
    if avg_score <= 2:
        return 0
    elif avg_score == 3:
        return 1
    elif avg_score >= 4:
        return 2
    return np.nan

In [4]:
def generate_cbf_product_sentiment_test_data():
    print(f"\n--- Generating CBF Product Sentiment Test Data ({CBF_OUTPUT_CSV_PATH}) ---")
    print(f"Loading data from '{CBF_INPUT_CSV_PATH}'...")
    try:
        df = pd.read_csv(CBF_INPUT_CSV_PATH)
    except FileNotFoundError:
        print(f"Error: Input file '{CBF_INPUT_CSV_PATH}' not found. Cannot generate {CBF_OUTPUT_CSV_PATH}.")
        return
    except Exception as e:
        print(f"Error loading CSV for CBF data: {e}")
        return

    print("Processing product data for CBF test set...")

    required_cols_input = [CBF_PRODUCT_ID_COLUMN, CBF_REVIEW_SCORE_COLUMN, CBF_TEXT_COLUMN_FOR_PRODUCT]
    if not all(col in df.columns for col in required_cols_input):
        missing = [col for col in required_cols_input if col not in df.columns]
        print(f"Error: CBF Input CSV '{CBF_INPUT_CSV_PATH}' is missing required columns: {', '.join(missing)}")
        return

    # Convert scores to numeric
    df[CBF_REVIEW_SCORE_COLUMN] = pd.to_numeric(df[CBF_REVIEW_SCORE_COLUMN], errors='coerce')

    # Calculate average score per product
    product_avg_scores = df.groupby(CBF_PRODUCT_ID_COLUMN)[CBF_REVIEW_SCORE_COLUMN].mean().reset_index()
    product_avg_scores.rename(columns={CBF_REVIEW_SCORE_COLUMN: 'Score'}, inplace=True)

    # Assign sentiment labels
    product_avg_scores['True_Sentiment_Label'] = product_avg_scores['Score'].apply(assign_cbf_sentiment_label)
    product_avg_scores.dropna(subset=['True_Sentiment_Label'], inplace=True)
    product_avg_scores['True_Sentiment_Label'] = product_avg_scores['True_Sentiment_Label'].astype(int)

    # Get first non-null text per product
    product_texts = df.groupby(CBF_PRODUCT_ID_COLUMN)[CBF_TEXT_COLUMN_FOR_PRODUCT].first().reset_index()
    product_texts.rename(columns={CBF_TEXT_COLUMN_FOR_PRODUCT: 'CleanedSummary'}, inplace=True)
    product_texts.dropna(subset=['CleanedSummary'], inplace=True)
    product_texts = product_texts[product_texts['CleanedSummary'].str.strip() != '']

    # Merge data
    product_data_for_cbf = pd.merge(product_avg_scores, product_texts, on=CBF_PRODUCT_ID_COLUMN, how='inner')

    if product_data_for_cbf.empty:
        print("Error: No product data available for CBF after processing. Cannot create test set.")
        return

    unique_product_ids_for_cbf = product_data_for_cbf[CBF_PRODUCT_ID_COLUMN].unique()

    if len(unique_product_ids_for_cbf) < 2:
        print(f"Warning: Very few unique products for CBF ({len(unique_product_ids_for_cbf)}). Splitting might not be effective.")
        if len(unique_product_ids_for_cbf) == 1:
            test_product_ids_list_cbf = unique_product_ids_for_cbf
        else:
            print("Error: No products left for CBF test set.")
            return
    else:
        _, test_product_ids_list_cbf = train_test_split(
            unique_product_ids_for_cbf,
            test_size=CBF_TEST_SET_SIZE,
            random_state=CBF_RANDOM_STATE
        )

    test_product_ids_cbf = pd.Series(test_product_ids_list_cbf)
    test_df_cbf = product_data_for_cbf[product_data_for_cbf[CBF_PRODUCT_ID_COLUMN].isin(test_product_ids_cbf)]

    # Final output format
    output_df_cbf = test_df_cbf[[CBF_PRODUCT_ID_COLUMN, 'CleanedSummary', 'Score', 'True_Sentiment_Label']].copy()
    output_df_cbf.rename(columns={CBF_PRODUCT_ID_COLUMN: 'ProductId'}, inplace=True)
    output_df_cbf['ProductId'] = output_df_cbf['ProductId'].astype(str)

    if output_df_cbf.empty:
        print("Error: Generated CBF product sentiment test set is empty.")
        return

    try:
        output_df_cbf.to_csv(CBF_OUTPUT_CSV_PATH, index=False)
        print(f"Successfully created '{CBF_OUTPUT_CSV_PATH}' with {len(output_df_cbf)} test samples.")
        print("Columns in output file:", list(output_df_cbf.columns))
        print("Sample of the CBF product sentiment test data:")
        print(output_df_cbf.head())
        print("\nDistribution of True_Sentiment_Label in the CBF test set:")
        print(output_df_cbf['True_Sentiment_Label'].value_counts(normalize=True).sort_index())
    except Exception as e:
        print(f"Error writing CBF output CSV '{CBF_OUTPUT_CSV_PATH}': {e}")


In [5]:
print(generate_cbf_product_sentiment_test_data())


--- Generating CBF Product Sentiment Test Data (cbf_product_sentiment_test.csv) ---
Loading data from 'cleaned_merged.csv'...
Processing product data for CBF test set...
Successfully created 'cbf_product_sentiment_test.csv' with 11853 test samples.
Columns in output file: ['ProductId', 'CleanedSummary', 'Score', 'True_Sentiment_Label']
Sample of the CBF product sentiment test data:
     ProductId                         CleanedSummary  Score  \
4   9376674501                        terrific treats   5.00   
6   B00002NCJC                          thirty bucks?   4.50   
31  B00008434E             has given the gift of life   5.00   
34  B000084388  best dental protection for your dogs!   4.75   
35  B000084DVR            premium quality dog food!!!   5.00   

    True_Sentiment_Label  
4                      2  
6                      2  
31                     2  
34                     2  
35                     2  

Distribution of True_Sentiment_Label in the CBF test set:
True_Sen