### LSE Data Analytics Online Career Accelerator 

# DA301: Advanced Analytics for Organisational Impact

## 1. Load and explore the data

In [None]:
# Import all the necessary packages

import warnings
import os
import string

from wordcloud import WordCloud
from nltk.probability import FreqDist
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from helper_functions import *
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from aquarel import load_theme
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import uniform

# Import specific functions from libraries
from IPython.core.magic import register_cell_magic

# Uncomment the next line to suppress warnings and keep the output clean
# warnings.filterwarnings('ignore')

# Handle on the running IPython shell; skip_if uses it to execute cell bodies
ipython = get_ipython()


# Custom cell magic used to skip whole cells on reruns
@register_cell_magic
def skip_if(line, cell):
    """Cell magic ``%%skip_if EXPR``: run the cell only when EXPR is falsy.

    ``line`` is the text after ``%%skip_if`` and is evaluated with ``eval``;
    ``cell`` is the cell body, handed to ``ipython.run_cell`` when the
    expression evaluates falsy. Nothing is returned either way.
    """
    # NOTE: eval() runs whatever expression was typed on the magic line
    should_skip = eval(line)
    if not should_skip:
        ipython.run_cell(cell)

# Skip flags read by the %%skip_if cell magic: set a flag to True to skip the
# cells guarded by it when re-running the notebook; leave it False to run them.
no_cleaning = False
no_regression = False
no_decision_tree = False
no_k_means = False
no_NLP = False
no_sentiment_calculation = False  # set to False for recalculation

# Widen the pandas display: no column-width truncation, 1000-character line width
pd.options.display.max_colwidth = None
pd.options.display.width = 1000

# Load the "scientific" theme and apply it
theme = load_theme("scientific")
theme.apply()

In [None]:
%%skip_if no_cleaning
# Load the reviews through a pickle cache.
# When the pickle is missing, the CSV is read once and written out as a pickle;
# in every case the data is then loaded from the pickle, so later runs (and
# kernel resets) never have to re-read the CSV.
pickle_path = 'turtle_reviews.pickle'
if not os.path.exists(pickle_path):
    pd.read_csv('turtle_reviews.csv').to_pickle(pickle_path)
reviews = pd.read_pickle(pickle_path)

# # View the DataFrame.
# print("Head \n", reviews.head())
# print("Types\n", reviews.dtypes)
# print("Shape\n", reviews.shape)


In [None]:
%%skip_if no_cleaning

# Rows with at least one missing value (kept in reviews_na for inspection)
has_missing = reviews.isna().any(axis=1)
reviews_na = reviews.loc[has_missing]

# Remove those rows from reviews in place
reviews.drop(index=reviews_na.index, inplace=True)

# Every row that takes part in a duplicate group (kept for inspection)
is_duplicated = reviews.duplicated(keep=False)
reviews_duplicates = reviews.loc[is_duplicated]

# Keep the first row of each duplicate group, then renumber the index from 0
reviews.drop_duplicates(keep='first', inplace=True)
reviews.reset_index(drop=True, inplace=True)

In [None]:
%%skip_if no_cleaning
# Store the three label-like columns with the pandas 'category' dtype
for categorical_column in ('gender', 'product', 'education'):
    reviews[categorical_column] = reviews[categorical_column].astype('category')


In [None]:
%%skip_if no_cleaning
# Basic descriptive statistics of the reviews DataFrame (shown as the cell output).
reviews.describe()


In [None]:
%%skip_if no_cleaning
# Keep only the columns used in the analysis, in this order.
columns_to_keep = [
    'gender', 'age', 'remuneration (k£)', 'spending_score (1-100)',
    'loyalty_points', 'education', 'product',
    'review', 'summary',
]
reviews = reviews[columns_to_keep]


In [None]:
%%skip_if no_cleaning
# Shorten two column headers, then multiply remuneration by 1000
# (the original header labels the column as k£).
column_renames = {
    'remuneration (k£)': 'remuneration',
    'spending_score (1-100)': 'spending_score',
}
reviews = reviews.rename(columns=column_renames)
reviews['remuneration'] = reviews['remuneration'] * 1000



In [None]:
%%skip_if no_cleaning
# Write the cleaned reviews to reviews_cleaned.csv; index=False leaves the row index out of the file.
reviews.to_csv('reviews_cleaned.csv', index=False)

In [None]:
# Read reviews_cleaned.csv into reviews_cleaned and print the dtype of every column.
reviews_cleaned = pd.read_csv('reviews_cleaned.csv')
print("Types\n", reviews_cleaned.dtypes)

In [None]:
# Load the dataset
reviews_cleaned = pd.read_csv('reviews_cleaned.csv')

# Split the dataset into training and testing sets: test_size=0.3 holds out 30%
# of the rows for testing, leaving 70% of the data in the training set
reviews_train, reviews_test = train_test_split(reviews_cleaned, test_size=0.3, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", reviews_train.shape)
print("Testing set shape:", reviews_test.shape)

## Linear regression

In [None]:
%%skip_if no_regression
# Fit loyalty_points against the listed predictors through the helper and keep
# what it returns (later cells index the result by predictor name).
single_predictors = ['spending_score', 'remuneration', 'age']
loyalty_models = get_regression_results_with_plot(
    data=reviews_cleaned,
    dep_var='loyalty_points',
    indep_vars=single_predictors,
    outliers_removed=False,
)


In [None]:
%%skip_if no_regression
# Assumption checks for the spending_score model
check_regression_assumptions(
    X=reviews_cleaned[['spending_score']],
    model_input=loyalty_models['spending_score'],
)

In [None]:
%%skip_if no_regression
# Assumption checks for the remuneration model
check_regression_assumptions(
    X=reviews_cleaned[['remuneration']],
    model_input=loyalty_models['remuneration'],
)

In [None]:
%%skip_if no_regression
# Add log1p-transformed copies of the target and the two predictors, then refit
# the helper on the transformed columns.
for source_column in ('loyalty_points', 'spending_score', 'remuneration'):
    reviews_cleaned[f'log_{source_column}'] = np.log1p(reviews_cleaned[source_column])

# NOTE: despite the variable name, the helper is called with weighted=False.
log_predictors = ['log_spending_score', 'log_remuneration']
loyalty_models_weighted = get_regression_results_with_plot(
    data=reviews_cleaned,
    dep_var='log_loyalty_points',
    indep_vars=log_predictors,
    outliers_removed=False,
    weighted=False,
)


In [None]:
%%skip_if no_regression
# Assumption checks for the log_spending_score model
check_regression_assumptions(
    X=reviews_cleaned[['log_spending_score']],
    model_input=loyalty_models_weighted['log_spending_score'],
)

In [None]:
%%skip_if no_regression
# Assumption checks for the log_remuneration model
check_regression_assumptions(
    X=reviews_cleaned[['log_remuneration']],
    model_input=loyalty_models_weighted['log_remuneration'],
)

In [None]:
# # Extract the estimated parameters.
# # Print R-squared value of the train data.
# print("R-squared value: ", loyalty_models['spending_score'].rsquared)
#
# # Print the intercept value.
# print("Intercept value: ", loyalty_models['spending_score'].params[0])
#
# # Print the coefficient value.
# print("Coefficient value: ", loyalty_models['spending_score'].params[1])
#
# # Extract the standard errors.
# print("Standard Errors: ", loyalty_models['spending_score'].bse)
#
# # Extract the predicted values.
# print("Predicted Values: ", loyalty_models['spending_score'].predict(reviews_train))

In [None]:
%%skip_if no_regression
# Show the summary() output of the spending_score model as the cell result
loyalty_models['spending_score'].summary()

In [None]:
# loyalty_models['remuneration'].summary()

In [None]:
# loyalty_models['age'].summary()

In [None]:
%%skip_if no_regression
# Two-predictor model of loyalty_points with a 3D plot; the view angles are
# passed through to the helper.
multi_predictors = ['spending_score', 'remuneration']
plot_view = {'elevation': 10, 'azimuth': -50}
loyalty_multi_var_model = multivariate_regression_3d_plot(
    data=reviews_cleaned,
    dep_var='loyalty_points',
    indep_vars=multi_predictors,
    outliers_removed=False,
    show_plot=True,
    **plot_view,
)

In [None]:
%%skip_if no_regression
# Assumption checks for the two-predictor model
check_regression_assumptions(
    X=reviews_cleaned[['spending_score', 'remuneration']],
    model_input=loyalty_multi_var_model,
    multi=True,
)

# Decision Trees

In [None]:
%%skip_if no_decision_tree
# Build the decision-tree input by dropping the free-text columns, the product
# column and the log-transformed columns added by the regression section.
# errors='ignore' keeps this cell runnable when the regression cells were
# skipped (no_regression = True) and the log_* columns were never created;
# without it DataFrame.drop raises KeyError for the missing labels.
reviews_dt = reviews_cleaned.drop(
    columns=[
        'review', 'summary', 'product',
        'log_loyalty_points', 'log_spending_score', 'log_remuneration',
    ],
    errors='ignore',
)

# Evaluate a depth-3 tree on the remaining predictors
metrics = evaluate_decision_tree_regressor(
    reviews_dt,
    use_columns=['age', 'remuneration', 'spending_score', 'gender', 'education'],
    depth=3,
)


In [None]:
%%skip_if no_decision_tree

# Evaluate a tree at every depth from 1 to 14 and record [depth, MSE, R-squared]
progress_check = []
for iter_tree in range(1, 15):
    metrics = evaluate_decision_tree_regressor(
        reviews_dt,
        use_columns=['age', 'remuneration', 'spending_score', 'gender', 'education'],
        depth=iter_tree,
        showplot=False,
        showstats=False,
    )
    progress_check.append([iter_tree, metrics['mean_squared_error'], metrics['r2_score']])

# Tabulate the results with the depth as index and display them
df = pd.DataFrame(
    progress_check,
    columns=['Depth', 'Mean Squared Error', 'R-squared'],
).set_index('Depth')
print(df)

# Percentage change between each depth and the one before it, starting at the
# second depth. The MSE change is reported as an absolute value; the R-squared
# change keeps its sign.
consecutive_pairs = list(zip(progress_check, progress_check[1:]))
depths = [current[0] for _, current in consecutive_pairs]
mse_changes = [
    abs((current[1] - previous[1]) / previous[1]) * 100
    for previous, current in consecutive_pairs
]
r2_changes = [
    ((current[2] - previous[2]) / previous[2]) * 100
    for previous, current in consecutive_pairs
]

# One panel per metric: line plot of the changes with a value label on each point
plt.figure(figsize=(14, 6))
panels = [
    (1, mse_changes, 'blue', 'MSE'),
    (2, r2_changes, 'orange', 'R-squared'),
]
for position, changes, colour, metric_name in panels:
    plt.subplot(1, 2, position)
    plt.plot(depths, changes, marker='o', color=colour)
    plt.title(f'Percentage Change in {metric_name} vs. Tree Depth')
    plt.xlabel('Tree Depth')
    plt.ylabel(f'Percentage Change in {metric_name}')
    plt.grid(True)
    for depth, change in zip(depths, changes):
        plt.text(depth, change, f'{change:.2f}%', fontsize=9, ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_decision_tree

# Evaluate a random forest limited to depth 3 on the same predictors
forest_columns = ['age', 'remuneration', 'spending_score', 'gender', 'education']
metrics = evaluate_random_forest_regressor(
    reviews_dt,
    use_columns=forest_columns,
    max_depth=3,
    showstats=True,
)


In [None]:
%%skip_if no_decision_tree

data = reviews_dt

# Predictors and target (copies, so the encoding below leaves data untouched)
predictor_columns = ['age', 'remuneration', 'spending_score', 'gender', 'education']
X = data.loc[:, predictor_columns].copy()
y = data.loc[:, 'loyalty_points'].copy()

# One-hot encode every object-dtype predictor
categorical_columns = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_columns)

# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [None]:

%%skip_if no_decision_tree

# Distributions sampled by the randomized search (scipy.stats distributions)
param_dist = dict(
    max_depth=randint(1, 7),  # integers from 1 up to, but not including, 7
    min_samples_split=randint(30, 1000),
    min_samples_leaf=randint(30, 100),
    ccp_alpha=uniform(0.01, 0.1),  # uniform(loc, scale): floats in [0.01, 0.11]
    min_impurity_decrease=uniform(0.1, 0.2),  # floats in [0.1, 0.3]
)

# Base estimator and a 100-draw randomized search with 5-fold cross-validation
dtree_reg = DecisionTreeRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=dtree_reg,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
)

# Run the search on the training split and keep the full CV results table
random_search.fit(X_train, y_train)
results_random_search = pd.DataFrame(random_search.cv_results_)

# Best estimator found by the randomized search, scored on the held-out test set
best_dtree_reg = random_search.best_estimator_
y_pred = best_dtree_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Best parameters and best CV score of the randomized search (the *_grid and
# un-suffixed names are kept as aliases of the same values)
best_params = best_params_random = random_search.best_params_
best_score_grid = best_score_random = random_search.best_score_

print(f"Best Parameters (Random Search): {best_params_random}")
print(f"Best Score (Random Search): {best_score_random}")
print(f"Test RMSE: {rmse}")

In [None]:
# Basic statistics of the test-set target
y_test_mean, y_test_min, y_test_max, y_test_std = (
    np.mean(y_test),
    np.min(y_test),
    np.max(y_test),
    np.std(y_test),
)

# RMSE normalized by the target's range and by its mean
nrmse_range = rmse / (y_test_max - y_test_min)
nrmse_mean = rmse / y_test_mean

print(f"Best Parameters (Random Search): {best_params_random}")
print(f"Best Score (Random Search): {best_score_random}")
print(f"Test RMSE: {rmse:.4f}")

print("\n--- Target Variable Statistics (Test Set) ---")
target_stats = [
    ("Mean", y_test_mean),
    ("Min", y_test_min),
    ("Max", y_test_max),
    ("Std Dev", y_test_std),
]
for stat_label, stat_value in target_stats:
    print(f"{stat_label}: {stat_value:.4f}")

print("\n--- Normalized RMSE ---")
for basis, nrmse_value in (("range", nrmse_range), ("mean", nrmse_mean)):
    print(f"NRMSE ({basis}): {nrmse_value:.4f}")

In [None]:
# Test-set residuals: actual minus predicted
residuals = y_test - y_pred

# Two panels side by side: residuals vs predictions, and the residual histogram
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, hist_ax = axes

# Left: predicted values against residuals, with a dashed zero line
scatter_ax.scatter(y_pred, residuals, alpha=0.6)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set(
    xlabel='Predicted values',
    ylabel='Residuals (Actual - Predicted)',
    title='Residual Plot',
)

# Right: distribution of the residuals with a KDE overlay
sns.histplot(residuals, kde=True, ax=hist_ax)
hist_ax.set(title='Residuals Distribution', xlabel='Residual')

plt.tight_layout()
plt.show()

In [None]:
# Hard-coded hyperparameters; this rebinds best_params_random to fixed values
best_params_random = dict(
    max_depth=6,
    min_samples_split=82,
    min_samples_leaf=37,
    ccp_alpha=0.01388347344294232,
    min_impurity_decrease=0.2074164854393311,
)

# Build the (not yet fitted) model from those hyperparameters
tree_model_random = DecisionTreeRegressor(**best_params_random)

# Evaluate the same model under three different seeds
for seed in (123, 2024, 99):
    evaluate_model_with_seed(tree_model_random, X, y, seed=seed)



In [None]:
# Fit on the full data set and render the resulting tree
tree_model_random.fit(X, y)

plt.figure(figsize=(40, 20))  # Large figure size for high resolution
# feature_names is passed as a plain list: X.columns is a pandas Index, which
# some scikit-learn releases reject because they required a list here.
plot_tree(tree_model_random,
          filled=True,
          feature_names=list(X.columns),
          fontsize=8)

# Save the plot as a high-quality PNG file
file_path = 'decision_tree_random_optimization.png'
plt.savefig(file_path, dpi=300, bbox_inches='tight')

# Close the figure so it is not also drawn inline; the cell shows the file path instead
plt.close()
file_path

In [None]:
%%skip_if no_decision_tree

# Objective for the Bayesian optimizer: mean cross-validated score of one tree
def dtree_cv(max_depth, min_samples_split, min_samples_leaf, ccp_alpha,min_impurity_decrease):
    """Return the mean 5-fold negative MSE of a tree with the given hyperparameters.

    The three size parameters are truncated to int and the two thresholds are
    cast to float before the tree is built. The score is computed on the
    notebook-level X_train / y_train; because it is the negated MSE, a larger
    return value means a smaller error.
    """
    hyperparameters = {
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split),
        'min_samples_leaf': int(min_samples_leaf),
        'ccp_alpha': float(ccp_alpha),
        'min_impurity_decrease': float(min_impurity_decrease),
    }
    candidate = DecisionTreeRegressor(random_state=2, **hyperparameters)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, scoring='neg_mean_squared_error', cv=5
    )
    return fold_scores.mean()

# Search bounds (lower, upper) for each hyperparameter
param_bounds = dict(
    max_depth=(1, 7),
    min_samples_split=(30, 100),
    min_samples_leaf=(30, 100),
    ccp_alpha=(0.01, 0.1),
    min_impurity_decrease=(0.1, 0.3),
)

# Bayesian optimization of dtree_cv: 5 initial points, then 25 iterations
optimizer = BayesianOptimization(f=dtree_cv, pbounds=param_bounds, random_state=1)
optimizer.maximize(init_points=5, n_iter=25)

# Best point found; the size parameters are truncated to int for the estimator
best_result = optimizer.max
best_params_bayes = best_result['params']
for integer_param in ('max_depth', 'min_samples_split', 'min_samples_leaf'):
    best_params_bayes[integer_param] = int(best_params_bayes[integer_param])

# Refit a tree with the best parameters on the training split
best_dtree_reg = DecisionTreeRegressor(random_state=42, **best_params_bayes)
best_dtree_reg.fit(X_train, y_train)

# Score it on the held-out test set
y_pred = best_dtree_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

best_score_bayes = best_result['target']

print(f"Best Parameters (Bayesian Optimization): {best_params_bayes}")
print(f"Best Score (Bayesian Optimization): {best_score_bayes}")
print(f"Test RMSE: {rmse}")

In [None]:
# Basic statistics of the test-set target
y_test_mean, y_test_min, y_test_max, y_test_std = (
    np.mean(y_test),
    np.min(y_test),
    np.max(y_test),
    np.std(y_test),
)

# RMSE normalized by the target's range and by its mean
nrmse_range = rmse / (y_test_max - y_test_min)
nrmse_mean = rmse / y_test_mean

print(f"Best Parameters (Bayesian Optimization): {best_params_bayes}")
print(f"Best Score (Bayesian Optimization): {best_score_bayes}")
print(f"Test RMSE: {rmse:.4f}")

print("\n--- Target Variable Statistics (Test Set) ---")
target_stats = [
    ("Mean", y_test_mean),
    ("Min", y_test_min),
    ("Max", y_test_max),
    ("Std Dev", y_test_std),
]
for stat_label, stat_value in target_stats:
    print(f"{stat_label}: {stat_value:.4f}")

print("\n--- Normalized RMSE ---")
for basis, nrmse_value in (("range", nrmse_range), ("mean", nrmse_mean)):
    print(f"NRMSE ({basis}): {nrmse_value:.4f}")

In [None]:
# Test-set residuals: actual minus predicted
residuals = y_test - y_pred

# Two panels side by side: residuals vs predictions, and the residual histogram
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, hist_ax = axes

# Left: predicted values against residuals, with a dashed zero line
scatter_ax.scatter(y_pred, residuals, alpha=0.6)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set(
    xlabel='Predicted values',
    ylabel='Residuals (Actual - Predicted)',
    title='Residual Plot',
)

# Right: distribution of the residuals with a KDE overlay
sns.histplot(residuals, kde=True, ax=hist_ax)
hist_ax.set(title='Residuals Distribution', xlabel='Residual')

plt.tight_layout()
plt.show()

In [None]:
# Hard-coded hyperparameters for the Bayesian-optimization model
best_params_bayesian = dict(
    max_depth=7,
    min_samples_split=66,
    min_samples_leaf=31,
    ccp_alpha=0.01,
    min_impurity_decrease=0.3,
)

# Build the (not yet fitted) model from those hyperparameters
tree_model_bayesian = DecisionTreeRegressor(**best_params_bayesian)

# Evaluate the same model under three different seeds
for seed in (123, 2024, 99):
    evaluate_model_with_seed(tree_model_bayesian, X, y, seed=seed)

In [None]:
# Fit on the full data set and render the resulting tree
tree_model_bayesian.fit(X, y)

plt.figure(figsize=(40, 20))  # Large figure size for high resolution
# feature_names is passed as a plain list: X.columns is a pandas Index, which
# some scikit-learn releases reject because they required a list here.
plot_tree(tree_model_bayesian,
          filled=True,
          feature_names=list(X.columns),
          fontsize=8)

# Save the plot as a high-quality PNG file
file_path = 'decision_tree_bayesian_optimization.png'
plt.savefig(file_path, dpi=300, bbox_inches='tight')

# Close the figure so it is not also drawn inline; the cell shows the file path instead
plt.close()
file_path

# K-Means

In [None]:
%%skip_if no_k_means

# Copy out the two clustering features so that columns added later do not end up in reviews_cleaned
clustering_columns = ['remuneration', 'spending_score']
reviews_k_map = reviews_cleaned.loc[:, clustering_columns].copy()

In [None]:
%%skip_if no_k_means

# Scatter plot of remuneration versus spending_score (drawn with Seaborn)
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(x='remuneration', y='spending_score', data=reviews_k_map)
scatter_ax.set(
    title='Scatter Plot of Remuneration vs Spending Score',
    xlabel='Remuneration',
    ylabel='Spending Score',
)
scatter_ax.grid(True)
plt.show()


In [None]:
%%skip_if no_k_means
# Pair plot of the clustering features (drawn with Seaborn)
pairplot_options = {'height': 3.5, 'aspect': 1.2}
sns.pairplot(reviews_k_map, **pairplot_options)

# Title placed slightly above the grid (y=1.02)
plt.suptitle('Pair Plot of Remuneration and Spending Score', y=1.02)
plt.show()

In [None]:
%%skip_if no_k_means

# Cluster the raw (unscaled) features into 8 groups and store the labels
cluster_features = ['remuneration', 'spending_score']
kmeans = KMeans(n_clusters=8, random_state=42)
reviews_k_map['cluster'] = kmeans.fit_predict(reviews_k_map[cluster_features])

# Scatter of the points coloured by cluster label
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='remuneration',
    y='spending_score',
    hue='cluster',
    data=reviews_k_map,
    palette='viridis',
    s=50,
)

# Overlay the centroids as red crosses
centers = kmeans.cluster_centers_
centroid_x, centroid_y = centers[:, 0], centers[:, 1]
plt.scatter(centroid_x, centroid_y, c='red', s=200, marker='X', label='Centroids')

# Titles, axis labels and legend
plt.title('K-means Clustering (k=8)')
plt.xlabel('Remuneration')
plt.ylabel('Spending Score')
plt.legend(title='Cluster', loc='best')
plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_k_means

# Standardize the data before clustering
features = reviews_cleaned[['remuneration', 'spending_score']].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Fit KMeans on the standardized values
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
# Map the centroids back to the original units so they can be drawn on the raw data
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

# Add results back for plotting
features['cluster'] = clusters
centers = kmeans.cluster_centers_  # centroids in standardized units (not plotted here)

# Plot in the ORIGINAL feature units: `features` holds the unscaled values and
# the centroids were inverse-transformed, so the axes are labelled accordingly.
# Only the clustering itself was done on standardized features.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='remuneration',
    y='spending_score',
    data = features,
    hue=clusters,
    palette='viridis',
    s=50
)
plt.scatter(centers_original[:, 0], centers_original[:, 1], c='red', s=200, marker='X', label='Centroids')
plt.title('K-means Clustering (k=8) on Standardized Features')
plt.xlabel('Remuneration')
plt.ylabel('Spending Score')
plt.legend(title='Cluster', loc='best')
plt.tight_layout()
plt.show()


In [None]:
%%skip_if no_k_means

# Fit KMeans for k = 1..9 on the standardized data and record each inertia
inertia = []
K = range(1, 10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    inertia.append(kmeans.fit(X_scaled).inertia_)

# Difference between consecutive inertia values; the first difference belongs to k=2
inertia_change_raw = np.diff(inertia)
K_change = K[1:]

# Two stacked panels: the elbow curve on top, the raw inertia change below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

# Elbow curve
ax1.plot(K, inertia, 'bo-')
ax1.set(
    xlabel='Number of Clusters',
    ylabel='Inertia',
    title='Elbow Method For Optimal Number of Clusters',
)
ax1.set_xticks(K)
ax1.grid(True)

# Raw change in inertia, each point labelled with its value
ax2.plot(K_change, inertia_change_raw, 'ro-')
for cluster_count, delta in zip(K_change, inertia_change_raw):
    ax2.annotate(
        f"{delta:.2f}",
        (cluster_count, delta),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
    )
ax2.set(
    xlabel='Number of Clusters',
    ylabel='Raw Change in Inertia',
    title='Raw Change in Inertia',
)
ax2.set_xticks(K_change)
ax2.grid(True)

plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_k_means

# Silhouette method: average silhouette score for k = 2..9
# (k starts at 2 because the score is not defined for a single cluster)
K = range(2, 10)
silhouette_scores = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Line plot of the score against the number of clusters
fig, silhouette_ax = plt.subplots(figsize=(10, 6))
silhouette_ax.plot(K, silhouette_scores, 'bo-')
silhouette_ax.set(
    xlabel='Number of Clusters',
    ylabel='Silhouette Score',
    title='Silhouette Method For Optimal Number of Clusters',
)
silhouette_ax.set_xticks(K)
silhouette_ax.grid(True)
plt.show()

In [None]:
%%skip_if no_k_means

# 3x2 grid of panels, flattened so each k from 3 to 8 gets one axis
fig, axs = plt.subplots(3, 2, figsize=(18, 16))
axs = axs.ravel()

for ax, k in zip(axs, range(3, 9)):
    # Cluster the standardized data and map the centroids back to original units
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

    # Store the labels and count the points per cluster, ordered by label
    features['cluster'] = clusters
    cluster_counts = features['cluster'].value_counts().sort_index()

    # One viridis colour per cluster
    palette = sns.color_palette("viridis", k)

    # Points in original units (features holds the unscaled values), coloured by cluster
    sns.scatterplot(
        data=features,
        x='remuneration',
        y='spending_score',
        hue='cluster',
        palette=palette,
        s=50,
        ax=ax,
    )

    # Centroids as unlabelled red crosses
    ax.scatter(centers_original[:, 0], centers_original[:, 1], c='red', s=200, marker='X')

    # Legend entries showing the cluster number and its size, placed right of the panel
    handles, labels = ax.get_legend_handles_labels()
    custom_labels = []
    for i, count in cluster_counts.items():
        custom_labels.append(f"Cluster {i}: {count} points")
    ax.legend(
        handles=handles[:len(custom_labels)],
        labels=custom_labels,
        title='Clusters',
        loc='center left',
        bbox_to_anchor=(1, 0.5),
    )

    ax.set(
        title=f'K-means Clustering (k={k})',
        xlabel='Remuneration',
        ylabel='Spending Score',
    )

# Adjust layout
plt.tight_layout()
plt.show()

# NLP

In [None]:
%%skip_if no_NLP

# Take the two text columns and replace missing entries with empty strings
text_columns = ['review', 'summary']
reviews_nlp = reviews_cleaned[text_columns].fillna('')


In [None]:
%%skip_if no_NLP

# Lower-case both text columns
for text_column in ('review', 'summary'):
    reviews_nlp[text_column] = reviews_nlp[text_column].str.lower()

In [None]:
%%skip_if no_NLP

# Work on a copy: the following cells modify reviews_nlp_copy and leave reviews_nlp as it is here
reviews_nlp_copy = reviews_nlp.copy()

In [None]:
%%skip_if no_NLP

# Helper that strips punctuation from a string
def remove_punctuation(text):
    """Return *text* with every character listed in string.punctuation deleted."""
    # A translation table mapping each punctuation character to None deletes it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Strip punctuation from both text columns
for text_column in ('review', 'summary'):
    reviews_nlp_copy[text_column] = reviews_nlp_copy[text_column].apply(remove_punctuation)

In [None]:
%%skip_if no_NLP

# Tokenize each text column into a matching "<column>_tokens" column
for text_column in ('review', 'summary'):
    reviews_nlp_copy[f'{text_column}_tokens'] = reviews_nlp_copy[text_column].apply(word_tokenize)


In [None]:
%%skip_if no_NLP

# Helper that draws a word cloud for a list of tokens
def generate_wordcloud(text):
    """Show a word cloud built from the tokens in *text*.

    The tokens are joined with single spaces and passed to WordCloud.generate;
    the result is displayed on a 10x5 figure with the axes hidden.
    """
    joined_text = ' '.join(text)
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(joined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Generate word clouds
generate_wordcloud(reviews_nlp_copy['review_tokens'].sum())
generate_wordcloud(reviews_nlp_copy['summary_tokens'].sum())

In [None]:
%%skip_if no_NLP

# Frequency distribution for reviews.
# Tokens are streamed into FreqDist row by row instead of first building one
# big list with Series.sum(), which concatenates the lists pairwise.
freq_dist_review = FreqDist(
    token for row_tokens in reviews_nlp_copy['review_tokens'] for token in row_tokens
)

# Plot the 30 most frequent review tokens (plain counts, not cumulative)
freq_dist_review.plot(30, cumulative=False)
plt.show()


In [None]:
%%skip_if no_NLP

# Frequency distribution for summaries.
# Tokens are streamed into FreqDist row by row instead of first building one
# big list with Series.sum(), which concatenates the lists pairwise.
freq_dist_summary = FreqDist(
    token for row_tokens in reviews_nlp_copy['summary_tokens'] for token in row_tokens
)

# Plot the 30 most frequent summary tokens (plain counts, not cumulative).
# This must plot freq_dist_summary, not the review distribution.
freq_dist_summary.plot(30, cumulative=False)
plt.show()

In [None]:
%%skip_if no_NLP

# English stopword list, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only tokens that are purely alphabetic (str.isalpha) and are not
# stopwords; tokens containing digits or other symbols are dropped.
filtered_review_tokens = reviews_nlp_copy['review_tokens'].apply(
    lambda tokens: [token for token in tokens if token.isalpha() and token not in stop_words]
)
filtered_summary_tokens = reviews_nlp_copy['summary_tokens'].apply(
    lambda tokens: [token for token in tokens if token.isalpha() and token not in stop_words]
)

# Store the cleaned token lists back on the working frame
reviews_nlp_copy['review_tokens'] = filtered_review_tokens
reviews_nlp_copy['summary_tokens'] = filtered_summary_tokens

In [None]:
%%skip_if no_NLP

# Generate word clouds without stopwords.
# The per-row token lists are flattened in a single pass; Series.sum() would
# concatenate the lists pairwise and re-copy the growing result for every row.
generate_wordcloud(
    [token for row_tokens in filtered_review_tokens for token in row_tokens]
)
generate_wordcloud(
    [token for row_tokens in filtered_summary_tokens for token in row_tokens]
)

In [None]:
%%skip_if no_NLP

# Pool the per-row token lists into one flat list per text column
flat_review_tokens = [
    token for row_tokens in reviews_nlp_copy['review_tokens'] for token in row_tokens
]
flat_summary_tokens = [
    token for row_tokens in reviews_nlp_copy['summary_tokens'] for token in row_tokens
]

# Count occurrences over the whole column and keep the 15 most frequent tokens
freq_dist_review = FreqDist(flat_review_tokens)
most_common_review_words = freq_dist_review.most_common(15)

freq_dist_summary = FreqDist(flat_summary_tokens)
most_common_summary_words = freq_dist_summary.most_common(15)

print("Most common words in reviews:", most_common_review_words)
print("Most common words in summaries:", most_common_summary_words)

In [None]:
%%skip_if no_NLP

# Cache file for the text columns plus every per-model polarity column
csv_path = "reviews_nlp_with_sentiment.csv"

if no_sentiment_calculation and os.path.exists(csv_path):
    # Reuse the scores written by an earlier run instead of re-scoring.
    # NOTE(review): read_csv parses empty cells as NaN by default, so empty
    # 'review'/'summary' strings saved below come back as NaN here - confirm
    # that downstream cells tolerate this.
    print("Loading precomputed sentiment data...")
    reviews_nlp = pd.read_csv(csv_path)
else:
    print("Computing sentiment features...")

    # TextBlob and VADER: one score per text, applied element by element
    for _scorer_name, _scorer in (('textblob', textblob_polarity), ('vader', vader_polarity)):
        for _text_col in ('review', 'summary'):
            reviews_nlp[f'{_scorer_name}_{_text_col}_polarity'] = reviews_nlp[_text_col].apply(_scorer)

    # Afinn: raw score first, then replaced by scale_afinn_by_length(text, raw score)
    for _text_col in ('review', 'summary'):
        _afinn_col = f'afinn_{_text_col}_polarity'
        reviews_nlp[_afinn_col] = reviews_nlp[_text_col].apply(afinn_polarity)
        reviews_nlp[_afinn_col] = reviews_nlp.apply(
            lambda row: scale_afinn_by_length(row[_text_col], row[_afinn_col]),
            axis=1
        )

    # RoBERTa and BERT: the helpers receive the whole column as a list
    for _scorer_name, _batch_scorer in (('roberta', roberta_scaled_scores), ('bert', bert_scaled_scores)):
        for _text_col in ('review', 'summary'):
            reviews_nlp[f'{_scorer_name}_{_text_col}_polarity'] = _batch_scorer(reviews_nlp[_text_col].tolist())

    # Persist the scores so later runs can take the loading branch above
    reviews_nlp.to_csv(csv_path, index=False)
    print(f"Sentiment scores saved to {csv_path}")



In [None]:
%%skip_if no_NLP

# One histogram per sentiment method on a 2x3 grid; the sixth panel stays empty
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()  # 1-D array so panels can be addressed by position

# (column to plot, panel title, bar colour) for each review-level score
review_panels = [
    ('textblob_review_polarity', 'TextBlob Review Polarity', 'blue'),
    ('vader_review_polarity', 'VADER Review Polarity', 'green'),
    ('afinn_review_polarity', 'Afinn Review Polarity', 'red'),
    ('roberta_review_polarity', 'RoBERTa Review Polarity', 'purple'),
    ('bert_review_polarity', 'BERT Review Polarity', 'orange'),
]

for panel_ax, (panel_column, panel_title, panel_colour) in zip(axes, review_panels):
    sns.histplot(reviews_nlp[panel_column], bins=25, kde=True, stat="count", ax=panel_ax, color=panel_colour)
    panel_ax.set_title(panel_title)

# Blank out the unused sixth panel
axes[5].axis('off')

plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_NLP

# One histogram per sentiment method on a 2x3 grid; the sixth panel stays empty
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()  # 1-D array so panels can be addressed by position

# (column to plot, panel title, bar colour) for each summary-level score
summary_panels = [
    ('textblob_summary_polarity', 'TextBlob Summary Polarity', 'blue'),
    ('vader_summary_polarity', 'VADER Summary Polarity', 'green'),
    ('afinn_summary_polarity', 'Afinn Summary Polarity', 'red'),
    ('roberta_summary_polarity', 'RoBERTa Summary Polarity', 'purple'),
    ('bert_summary_polarity', 'BERT Summary Polarity', 'orange'),
]

for panel_ax, (panel_column, panel_title, panel_colour) in zip(axes, summary_panels):
    sns.histplot(reviews_nlp[panel_column], bins=25, kde=True, stat="count", ax=panel_ax, color=panel_colour)
    panel_ax.set_title(panel_title)

# Blank out the unused sixth panel
axes[5].axis('off')

plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_NLP

# Review-level polarity columns, one per sentiment method
review_fields = [
    f'{method}_review_polarity'
    for method in ('textblob', 'vader', 'afinn', 'roberta', 'bert')
]

# Collapse the five scores of each row into one value via the
# combined_sentiment_zero_aware helper
reviews_nlp['combined_review_polarity'] = reviews_nlp.apply(
    lambda row: combined_sentiment_zero_aware(row, review_fields),
    axis='columns'
)

In [None]:
%%skip_if no_NLP

# Summary-level polarity columns, one per sentiment method
summary_fields = [
    f'{method}_summary_polarity'
    for method in ('textblob', 'vader', 'afinn', 'roberta', 'bert')
]

# Collapse the five scores of each row into one value via the
# combined_sentiment_zero_aware helper
reviews_nlp['combined_summary_polarity'] = reviews_nlp.apply(
    lambda row: combined_sentiment_zero_aware(row, summary_fields),
    axis='columns'
)

In [None]:
# Wherever the combined summary score equals exactly 0.9928, fall back to the
# combined review score of the same row (values are aligned on the index).
# NOTE(review): 0.9928 looks like a sentinel/artefact value - confirm where it
# comes from; the exact float comparison only matches if the score is
# reproduced bit-for-bit.
placeholder_mask = reviews_nlp['combined_summary_polarity'] == 0.9928
reviews_nlp.loc[placeholder_mask, 'combined_summary_polarity'] = reviews_nlp['combined_review_polarity']

In [None]:
# Two panels next to each other: review scores on the left, summary scores on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
review_ax, summary_ax = axes

# Left panel: distribution of the combined review polarity
sns.histplot(reviews_nlp['combined_review_polarity'], bins=30, kde=True, ax=review_ax, color='#89b4fa')
review_ax.set(title="Combined Review Polarity", xlabel="Polarity Score", ylabel="Count")

# Right panel: distribution of the combined summary polarity
sns.histplot(reviews_nlp['combined_summary_polarity'], bins=30, kde=True, ax=summary_ax, color='#f38ba8')
summary_ax.set(title="Combined Summary Polarity", xlabel="Polarity Score", ylabel="Count")

# Tidy layout
plt.tight_layout()
plt.show()

In [None]:
%%skip_if no_NLP

print("Top 20 Positive Reviews by Combined Review Polarity:")

# Highest combined review scores first; keep each text next to its score
top_20_reviews = (
    reviews_nlp
    .sort_values(by='combined_review_polarity', ascending=False)
    .loc[:, ['review', 'combined_review_polarity']]
    .head(20)
)

# Last expression of the cell, so the notebook renders the table
top_20_reviews


In [None]:
%%skip_if no_NLP

# Heading now matches the table: this cell lists the most NEGATIVE reviews
print("Top 20 Negative Reviews by Combined Review Polarity:")

# Ascending sort puts the lowest combined review scores first
top_20_reviews_negative = reviews_nlp[['review', 'combined_review_polarity']].sort_values(
    by='combined_review_polarity', ascending=True
).head(20)

# Last expression of the cell, so the notebook renders the table
top_20_reviews_negative

In [None]:
%%skip_if no_NLP

# Heading now matches the table: this cell ranks SUMMARIES by summary polarity
print("Top 20 Positive Summaries by Combined Summary Polarity:")

# Descending sort puts the highest combined summary scores first
top_20_summary = reviews_nlp[['summary', 'combined_summary_polarity']].sort_values(
    by='combined_summary_polarity', ascending=False
).head(20)

# Last expression of the cell, so the notebook renders the table
top_20_summary


In [None]:
%%skip_if no_NLP

# Heading now matches the table: this cell lists the most NEGATIVE summaries
print("Top 20 Negative Summaries by Combined Summary Polarity:")

# Ascending sort puts the lowest combined summary scores first
top_20_summary_negative = reviews_nlp[['summary', 'combined_summary_polarity']].sort_values(
    by='combined_summary_polarity', ascending=True
).head(20)

# Last expression of the cell, so the notebook renders the table
top_20_summary_negative


In [None]:

# Treat each polarity column as a single row vector of shape (1, n_rows);
# missing scores are counted as 0
vec1 = reviews_nlp['combined_review_polarity'].fillna(0).to_numpy().reshape(1, -1)
vec2 = reviews_nlp['combined_summary_polarity'].fillna(0).to_numpy().reshape(1, -1)

# Two single-row inputs give a 1x1 similarity matrix; pull out its only entry
cos_sim = cosine_similarity(vec1, vec2)[0, 0]
print(f"Cosine Similarity: {cos_sim:.4f}")


In [None]:
# Average absolute gap between the combined review and summary scores
mae = (
    reviews_nlp['combined_review_polarity'] - reviews_nlp['combined_summary_polarity']
).abs().mean()
print(f"Mean Absolute Difference: {mae:.4f}")


In [None]:
# Absolute gap between the combined review and summary scores, per row
reviews_nlp['polarity_diff'] = (
    reviews_nlp['combined_review_polarity'] - reviews_nlp['combined_summary_polarity']
).abs()

# Rows with an exact zero gap are left out of the histogram
nonzero_diff = reviews_nlp[reviews_nlp['polarity_diff'].ne(0)]

# Histogram of the remaining gaps with a dashed reference line at 0.
# NOTE(review): the title reads "(Review - Summary)" but the plotted values
# are absolute differences, so nothing falls left of the reference line.
plt.figure(figsize=(8, 6))
sns.histplot(nonzero_diff['polarity_diff'], bins=40, kde=True, color="#f38ba8")
plt.axvline(0, linestyle='--', color='gray')
plt.title('Distribution of Polarity Difference (Review - Summary)\n(Excluding Zero Differences)')
plt.xlabel('Polarity Difference')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Rank every row from the largest to the smallest review/summary gap
top_diff_reviews = reviews_nlp.sort_values(by='polarity_diff', ascending=False)

# Show the 100 largest gaps with both texts and both scores
top_diff_reviews.loc[
    :,
    ['review', 'summary', 'combined_review_polarity', 'combined_summary_polarity', 'polarity_diff']
].head(100)


In [None]:
# Number of rows whose review and summary scores differ by more than 0.5
count = reviews_nlp['polarity_diff'].gt(0.5).sum()
print(f"Number of reviews where polarity difference exceeds 0.5: {count}")

In [None]:
# Rows whose review and summary scores differ by more than 0.5, largest gap first
rows_to_tag = (
    reviews_nlp
    .loc[reviews_nlp['polarity_diff'] > 0.5]
    .copy()
    .sort_values(by='polarity_diff', ascending=False)
)
# Export is disabled; uncomment to write the rows out to a CSV file
# rows_to_tag.to_csv('tagged_progress.csv', index=False)


In [None]:
# Read the tag file from disk
tagged_df = pd.read_csv("tagged_progress.csv")

# Left-join onto the main frame: the row label of reviews_nlp is matched
# against the 'index' column of the tag file, and rows without a match are kept.
# NOTE(review): if the tag file also carries columns that already exist in
# reviews_nlp, pandas suffixes both copies with _x/_y - confirm it only adds
# new columns (presumably 'manual_sentiment_tag').
# NOTE(review): duplicate 'index' values in the tag file would duplicate rows
# of reviews_nlp - confirm they are unique.
reviews_nlp = pd.merge(reviews_nlp, tagged_df, how='left', left_index=True, right_on='index')



In [None]:
# Integer class code for each manual tag value
label_map = {'review': 0, 'between': 1, 'summary': 2}

# Training set: only rows carrying one of the known tags, with the tag
# translated to its integer code in a new 'target' column
train_df = (
    reviews_nlp
    .loc[reviews_nlp['manual_sentiment_tag'].isin(list(label_map))]
    .assign(target=lambda tagged: tagged['manual_sentiment_tag'].map(label_map))
)

In [None]:
# Model inputs: the five review-level scores followed by the five summary-level scores.
# NOTE: this rebinds `features`, which the k-means plotting cell earlier in the
# notebook uses as a DataFrame - re-run that section's setup before its plots.
features = [
    f'{method}_{text_kind}_polarity'
    for text_kind in ('review', 'summary')
    for method in ('textblob', 'vader', 'afinn', 'roberta', 'bert')
]

# Fit on the manually tagged rows only
X_train = train_df[features]
y_train = train_df['target']
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict a class code for every row, tagged or not
X_all = reviews_nlp[features]
reviews_nlp['rf_predicted_label'] = rf_model.predict(X_all)

# Translate the predicted codes back to their tag names
inv_label_map = {code: tag for tag, code in label_map.items()}
reviews_nlp['rf_predicted_tag'] = reviews_nlp['rf_predicted_label'].map(inv_label_map)


In [None]:
# Derive one polarity value per row with the get_best_rf_polarity helper
reviews_nlp['best_rf_polarity'] = reviews_nlp.apply(get_best_rf_polarity, axis='columns')

# Copy the result across as a bare array, i.e. by position rather than by index.
# NOTE(review): this assumes reviews_nlp still has exactly one row per
# reviews_cleaned row, in the same order - confirm after the tag merge.
reviews_cleaned['best_rf_polarity'] = reviews_nlp['best_rf_polarity'].to_numpy()