In [None]:
# Required library
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch dataset id 911 from the UCI ML Repository
recipe_reviews_and_user_feedback = fetch_ucirepo(id=911)

# The fetched object exposes its tables under `.data`;
# keep the feature table and the target table under their usual names
uci_tables = recipe_reviews_and_user_feedback.data
X = uci_tables.features
y = uci_tables.targets

# Place features and targets side by side (column-wise) in a single
# frame so they can be inspected together
df = pd.concat(objs=[X, y], axis="columns")

# Preview the first five rows (last expression -> rich notebook display)
df.head(5)

In [None]:
# Required libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# --- Quick structural overview of the combined frame `df` ---

# Size of the frame as a (rows, columns) tuple
frame_shape = df.shape
print("Dataset shape:", frame_shape)

# Names of all columns, as a plain Python list
column_names = df.columns.tolist()
print("\nColumns:\n", column_names)

# dtype of each column
column_dtypes = df.dtypes
print("\nData types:\n", column_dtypes)

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)

# Frequency of each value in the `stars` column, ordered by the star value
# itself rather than by frequency
stars_distribution = df['stars'].value_counts().sort_index()
print("\nTarget variable (stars) distribution:\n", stars_distribution)

In [None]:
# Name of the column used as the prediction target
target = 'stars'

# Show every column currently in the frame before deciding what to remove
print("All columns:\n", df.columns.tolist())

# Columns excluded from modelling: identifiers, timestamps and
# unstructured text
drop_columns = [
    'recipe_number',
    'recipe_code',
    'recipe_name',
    'comment_id',
    'user_id',
    'user_name',
    'created_at',
    'text',
]

# Build a new frame without those columns; `df` itself is left untouched.
# A column missing from `df` raises KeyError here (drop's default behaviour).
df_cleaned = df.drop(labels=drop_columns, axis="columns")

# Confirm which columns are left
print("\nRemaining columns:\n", df_cleaned.columns.tolist())

In [None]:
# Target column on one side, every other remaining column on the other
y = df_cleaned[target]
X = df_cleaned.drop(columns=[target])

# Group feature columns by dtype so each group gets its own transformer.
# NOTE(review): a column whose dtype matches neither selector below lands in
# neither list; ColumnTransformer's default `remainder` setting should then
# leave it out of the output. Confirm that is intended.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool', 'category']

numeric_features = X.select_dtypes(include=numeric_dtypes).columns.tolist()
categorical_features = X.select_dtypes(include=categorical_dtypes).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# One transformer per column group: scaling for numeric columns, one-hot
# encoding for categorical columns (categories unseen at fit time are ignored)
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Assemble the per-group transformers into a single preprocessing step
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Hold out 20% of the rows as a test set.
# `stratify=y` preserves the class distribution in both parts, and the
# fixed `random_state` makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for label, subset in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(label, subset.shape)

# Fit the preprocessor on the training rows only and transform them ...
X_train_preprocessed = preprocessor.fit_transform(X_train)

# ... then reuse the already-fitted preprocessor on the test rows, so
# nothing is re-fitted on test data
X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
# How many rows fall under each star rating (the target variable)
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=y, ax=ax)
ax.set_title('Distribution of Stars (Target)')
ax.set_xlabel('Star Rating')
ax.set_ylabel('Count')
plt.show()

# Box plots of selected columns against the star rating.
# Each entry is (column in df_cleaned, figure title, y-axis label).
boxplot_specs = [
    ('thumbs_up', 'Thumbs Up by Star Rating', 'Number of Thumbs Up'),
    ('reply_count', 'Reply Count by Star Rating', 'Number of Replies'),
]

for column, title, y_label in boxplot_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=y, y=df_cleaned[column], ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Stars')
    ax.set_ylabel(y_label)
    plt.show()

In [None]:
# Final status line: reaching this cell in a top-to-bottom run means no
# earlier cell raised
completion_message = "Preprocessing and visualization steps completed successfully."
print(completion_message)