<a href="https://colab.research.google.com/github/uginm102/MCSC-MUK/blob/main/MCS7227%20Data%20Analytics%20and%20Visualization/MCS_7227_Data_Analytics_and_Visualization_Amazon_Metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Analyzing Product Popularity and Customer Preferences in Amazon Metadata

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# File path to the cleaned dataset
input_file = "/content/drive/MyDrive/Personal/Masters/Msc Comp Sc - MUK/Year 1/Sem 2/MCS7227 Data Analytics and Visualization/amazon_sample.csv"

In [4]:
# Step 1: Load the cleaned dataset
df = pd.read_csv(input_file)

# Step 2: Compute dataset statistics
# 2.1: Category distribution (percentage of "Books")
category_counts = df['category'].value_counts(normalize=True) * 100  # Percentage
books_percentage = category_counts.get('Books', 0)  # Default to 0 if 'Books' not found
print(f"Percentage of 'Books': {books_percentage:.1f}%")

# 2.2: Average rating (mean of non-null avg_rating)
avg_rating = df['avg_rating'].mean()
print(f"Average rating: {avg_rating:.1f}/5")

# 2.3: Median sales rank (median of non-null salesrank, excluding -1)
valid_salesrank = df[df['salesrank'] != -1]['salesrank']  # Filter out invalid ranks
median_salesrank = valid_salesrank.median()
print(f"Median sales rank: {median_salesrank:,.0f}")

# Optional: Additional summary stats for verification
print("\nAdditional Statistics:")
print(f"Total products: {len(df)}")
print(f"Products with reviews: {len(df[df['review_count'] > 0])}")
print(f"Products with valid sales rank: {len(valid_salesrank)}")

# Step 3: Save statistics to a text file (optional)
with open("dataset_stats.txt", "w") as f:
    f.write(f"Percentage of 'Books': {books_percentage:.1f}%\n")
    f.write(f"Average rating: {avg_rating:.1f}/5\n")
    f.write(f"Median sales rank: {median_salesrank:,.0f}\n")
print(f"Statistics saved to dataset_stats.txt")

Percentage of 'Books': 0.0%
Average rating: 3.2/5
Median sales rank: 300,963

Additional Statistics:
Total products: 548552
Products with reviews: 402734
Products with valid sales rank: 542225
Statistics saved to dataset_stats.txt


In [7]:
# Step 1: Load the cleaned dataset
df = pd.read_csv(input_file)

# Step 2: Prepare features and target
# Filter out invalid sales ranks (-1) if any remain
df = df[df['salesrank'] != -1]

# Use log_salesrank as target (assumes OpenRefine created this column)
# If not, compute it here (uncomment if needed)
df['log_salesrank'] = np.log(df['salesrank'].replace(0, 1))  # Avoid log(0)

# Features: review_count, avg_rating, and one-hot encoded category
X_numeric = df[['review_count', 'avg_rating']].fillna({'avg_rating': 0})  # Impute NaN avg_rating with 0
X_categorical = pd.get_dummies(df['category'], prefix='cat')  # One-hot encode category
X = pd.concat([X_numeric, X_categorical], axis=1)

# Target: log_salesrank
y = df['log_salesrank']

# Step 3: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Standardize numeric features (review_count, avg_rating)
scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[['review_count', 'avg_rating']])
X_test_numeric = scaler.transform(X_test[['review_count', 'avg_rating']])

# Recombine with categorical features (not scaled)
X_train_scaled = np.hstack([X_train_numeric, X_train[X_categorical.columns].values])
X_test_scaled = np.hstack([X_test_numeric, X_test[X_categorical.columns].values])

# Step 5: Train the linear regression model
reg = LinearRegression()
reg.fit(X_train_scaled, y_train)

# Step 6: Make predictions and evaluate
y_pred = reg.predict(X_test_scaled)

# Compute metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:,.0f}")
print(f"R^2 Score: {r2:.2f}")

# Step 7: Analyze feature importance (coefficients)
feature_names = ['review_count', 'avg_rating'] + list(X_categorical.columns)
coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': reg.coef_
})
print("\nFeature Coefficients:")
print(coefficients.sort_values(by='Coefficient', ascending=False))

# Step 8: Save results (optional)
results = {
    'MSE': mse,
    'R2': r2,
    'Coefficients': coefficients.to_dict()
}
with open("regression_results.txt", "w") as f:
    f.write(f"MSE: {mse:,.0f}\n")
    f.write(f"R^2: {r2:.2f}\n")
    f.write("\nFeature Coefficients:\n")
    f.write(coefficients.to_string(index=False))
print("Results saved to regression_results.txt")

Mean Squared Error (MSE): 1
R^2 Score: 0.46

Feature Coefficients:
             Feature   Coefficient
3           cat_Book  3.428515e+00
6          cat_Music  2.154786e+00
4             cat_CE  1.651962e+00
10         cat_Video  6.753550e-01
5            cat_DVD  6.044932e-01
2   cat_Baby Product  1.332268e-15
0       review_count -1.983719e-01
1         avg_rating -4.099338e-01
8         cat_Sports -7.693609e-01
9            cat_Toy -1.300085e+00
7       cat_Software -2.736994e+00
11   cat_Video Games -3.708672e+00
Results saved to regression_results.txt


In [9]:
# Step 1: Load the cleaned dataset
df = pd.read_csv(input_file)

# Step 2: Prepare features for clustering
# Use review_count and avg_rating as clustering features
X = df[['review_count', 'avg_rating']].fillna({'avg_rating': 0})  # Impute NaN avg_rating with 0

# Step 3: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Determine optimal number of clusters using the elbow method
inertia = []
k_range = range(2, 11)  # Test k from 2 to 10
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)


Elbow curve saved to 'elbow_curve.png'


KeyboardInterrupt: 

In [None]:
# Plot elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Within-Cluster Sum of Squares)')
plt.title('Elbow Method for Optimal k')
# plt.savefig('elbow_curve.png')
plt.show()
# print("Elbow curve saved to 'elbow_curve.png'")

In [None]:


# Step 5: Perform k-means clustering with k=5 (chosen from elbow method)
k_optimal = 5
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to DataFrame
df['cluster'] = cluster_labels

In [None]:
# Step 6: Evaluate clustering with silhouette score
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score for k={k_optimal}: {silhouette_avg:.2f}")

In [None]:
# Step 7: Analyze cluster characteristics
cluster_summary = df.groupby('cluster').agg({
    'review_count': ['mean', 'count'],
    'avg_rating': 'mean',
    'category': lambda x: x.value_counts().index[0]  # Most common category
}).rename(columns={'<lambda>': 'top_category'})

print("\nCluster Summary:")
print(cluster_summary)

In [None]:
# Step 8: Infer cluster descriptions (manual interpretation)
cluster_descriptions = {
    0: "Low reviews, moderate ratings",
    1: "High-rated books",
    2: "Niche DVDs",
    3: "High review count, variable ratings",
    4: "Low-rated, few reviews"
}  # Adjust based on actual summary output

print("\nCluster Descriptions (Tentative):")
for i, desc in cluster_descriptions.items():
    print(f"Cluster {i}: {desc}")

In [None]:
# Step 9: Save results
df[['ASIN', 'review_count', 'avg_rating', 'category', 'cluster']].to_csv("clustered_data.csv", index=False)
with open("clustering_results.txt", "w") as f:
    f.write(f"Silhouette Score for k={k_optimal}: {silhouette_avg:.2f}\n\n")
    f.write("Cluster Summary:\n")
    f.write(cluster_summary.to_string())
    f.write("\n\nCluster Descriptions (Tentative):\n")
    for i, desc in cluster_descriptions.items():
        f.write(f"Cluster {i}: {desc}\n")
print("Results saved to 'clustered_data.csv' and 'clustering_results.txt'")