In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/amazon.csv")
df.head()


In [None]:
df.shape


In [None]:
df.columns


In [None]:
df.columns = df.columns.str.strip().str.replace(" ", "_")
df.columns


In [None]:
df = df.loc[:, ~df.columns.duplicated()]
df.columns


In [None]:
df.isnull().sum()


In [None]:
df['Product_Search_Method'] = df['Product_Search_Method'].fillna('Unknown')

In [None]:
rating_cols = [
    'Customer_Reviews_Importance',
    'Shopping_Satisfaction',
    'Rating_Accuracy'
]

for col in rating_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Lower-case and trim first so the lookup below matches regardless of input casing.
gender_labels = {
    'male': 'Male',
    'female': 'Female',
    'prefer not to say': 'Other'
}
df['Gender'] = df['Gender'].str.lower().str.strip().replace(gender_labels)


In [None]:
df['Purchase_Frequency'] = df['Purchase_Frequency'].str.lower().str.strip()
df.columns = df.columns.str.strip().str.replace(" ", "_")


In [None]:
df['Improvement_Areas'] = df['Improvement_Areas'].str.lower().str.strip()


In [None]:
garbage_values = ['.', 'nill', 'nil', 'none', 'na', 'n/a', '']

df['Improvement_Areas'] = df['Improvement_Areas'].replace(garbage_values, 'Unknown')


In [None]:
df['Improvement_Areas'] = df['Improvement_Areas'].replace({
    'ui': 'User Interface',
    'user interface': 'User Interface',
    'user interfacee': 'User Interface',
    'user interface of app': 'User Interface',
    'app ui': 'User Interface'
})


In [None]:
df['Improvement_Areas'].value_counts()


In [None]:
import os
# Show the kernel's working directory; the relative "../" paths in this notebook resolve against it.
os.getcwd()

In [None]:
# Create the output folder if it is missing (no error when it already exists).
os.makedirs("../outputs", exist_ok=True)

In [None]:
# First checkpoint of the cleaned frame, written without the index column.
df.to_csv("../outputs/cleaned_data.csv", index=False)

In [None]:
df['age'].describe()

In [None]:
df['age'].isnull().sum()


In [None]:
bins = [0, 24, 34, 44, 54, 100]
labels = ['18-24', '25-34', '35-44', '45-54', '55+']

df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)


In [None]:
df['age_group'].isnull().sum()


In [None]:
df[['age', 'age_group']].head(10)

In [None]:
df['Gender'] = df['Gender'].str.lower().str.strip()


In [None]:
df['Gender'] = df['Gender'].replace({'others': 'other'})
df['Gender'] = df['Gender'].str.title()


In [None]:
df['Gender'] = df['Gender'].str.lower().str.strip()
df['Gender'] = df['Gender'].replace({'others': 'other'})
df['Gender'] = df['Gender'].str.title()


In [None]:
df.to_csv("../outputs/cleaned_data_v2.csv", index=False)

In [None]:
# Task 2 :  Descriptive Behavior Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# The Task 2 figures are saved under ../outputs/charts. savefig does not create
# missing directories, so make sure the folder exists before any chart cell runs.
os.makedirs("../outputs/charts", exist_ok=True)

# Continue from the cleaned checkpoint on disk. read_csv re-infers every dtype,
# so columns built with pd.cut (e.g. age_group) come back as plain text.
df = pd.read_csv("../outputs/cleaned_data_v2.csv")


In [None]:
# info() prints its report as a side effect, while head() is only rendered when it
# is the cell's final expression, so head() goes last to make both visible.
df.info()
df.head()


In [None]:
df['age_group'].value_counts()


In [None]:
plt.figure(figsize=(6,4))
sns.countplot(
    x='age_group',
    data=df,
    order=df['age_group'].value_counts().index
)
plt.title("Customer Distribution by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Number of Customers")
plt.tight_layout()
plt.savefig("../outputs/charts/age_group_distribution.png")
plt.show()


In [None]:
df['Gender'].value_counts()


In [None]:
age_purchase = pd.crosstab(df['age_group'], df['Purchase_Frequency'])
age_purchase


In [None]:
age_purchase.plot(kind='bar', figsize=(8,5))
plt.title("Purchase Frequency by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Number of Customers")
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig("../outputs/charts/age_vs_purchase_frequency.png")
plt.show()


In [None]:
from collections import Counter

categories = df['Purchase_Categories'].dropna().str.split(',')
flat_categories = [c.strip() for sub in categories for c in sub]

category_counts = Counter(flat_categories)

top_categories = pd.DataFrame(
    category_counts.most_common(10),
    columns=['Product_Category', 'Count']
)

top_categories


In [None]:
plt.figure(figsize=(7,4))
sns.barplot(
    x='Count',
    y='Product_Category',
    data=top_categories
)
plt.title("Most Popular Product Categories")
plt.tight_layout()
plt.savefig("../outputs/charts/most_popular_categories.png")
plt.show()


In [None]:
browsing_methods = df['Product_Search_Method'].value_counts()
browsing_methods


In [None]:
plt.figure(figsize=(6,4))
sns.countplot(
    y='Product_Search_Method',
    data=df,
    order=browsing_methods.index
)
plt.title("Top Product Browsing Methods")
plt.tight_layout()
plt.savefig("../outputs/charts/top_browsing_methods.png")
plt.show()


In [None]:
cart_abandonment = df['Cart_Abandonment_Factors'].value_counts()
cart_abandonment


In [None]:
plt.figure(figsize=(7,4))
sns.countplot(
    y='Cart_Abandonment_Factors',
    data=df,
    order=cart_abandonment.index
)
plt.title("Most Common Cart Abandonment Factors")
plt.tight_layout()
plt.savefig("../outputs/charts/cart_abandonment_factors.png")
plt.show()


In [None]:
# Frame fed to the ratings boxplot. Seaborn's wide-form boxplot only draws numeric
# columns, and Recommendation_Helpfulness is mapped from 'No' / 'Sometimes' / 'Yes'
# later in this notebook, i.e. it holds text and would be left out of the chart.
# Convert it to the same 1-3 ordinal score here so all three metrics are plotted.
# NOTE(review): the helpfulness score spans 1-3; confirm that sharing an axis with
# the other two ratings is acceptable for this figure.
metrics = df[
    ['Shopping_Satisfaction',
     'Recommendation_Helpfulness',
     'Rating_Accuracy']
].copy()

# Only map when the column is not already numeric, so numeric data passes through untouched.
if not pd.api.types.is_numeric_dtype(metrics['Recommendation_Helpfulness']):
    metrics['Recommendation_Helpfulness'] = metrics['Recommendation_Helpfulness'].map({
        'No': 1,
        'Sometimes': 2,
        'Yes': 3
    })



In [None]:
# One box per column of `metrics`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=metrics, ax=ax)
ax.set_title("Distribution of Key Behavioral Ratings")
fig.tight_layout()
fig.savefig("../outputs/charts/behavioral_metrics_distribution.png")
plt.show()


In [None]:
# Key insights and recommendations (Task 2)
# - Electronics and Fashion emerge as the most frequently purchased categories, indicating strong cross-selling potential.
# - Search-based browsing dominates, reflecting high-intent shopping behavior.
# - Pricing and delivery-related issues are the leading causes of cart abandonment.
# - Average customer satisfaction and rating accuracy remain high, reinforcing trust in the platform.
# - Recommendation helpfulness shows moderate variance, suggesting scope for personalization improvements.

In [None]:
#TASK 3: CUSTOMER SEGMENTATION & PROFILING
# Satisfaction level
df['Satisfaction_Level'] = pd.cut(
    df['Shopping_Satisfaction'],
    bins=[0, 2, 3, 5],
    labels=['Low', 'Medium', 'High']
)

In [None]:
df['Satisfaction_Level'].value_counts()

In [None]:
df['Frequency_Level'] = df['Purchase_Frequency'].map({
    'daily': 'High',
    'few times a week': 'High',
    'once a week': 'Medium',
    'once a month': 'Medium',
    'rarely': 'Low'
})

In [None]:
def assign_segment(row):
    """Assign one respondent to a customer segment.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Frequency_Level', 'Satisfaction_Level' and
        'Cart_Abandonment_Factors'.

    Returns
    -------
    str
        'Frequent Buyers' when both levels are 'High'; otherwise
        'At-Risk Customers' when satisfaction is 'Low' or a cart-abandonment
        factor is reported; otherwise 'Occasional Shoppers'.
    """
    if row['Frequency_Level'] == 'High' and row['Satisfaction_Level'] == 'High':
        return 'Frequent Buyers'

    factor = row['Cart_Abandonment_Factors']
    # A missing value means no factor was recorded. This matters because the frame
    # is reloaded through read_csv, which may parse the text 'None' as NaN, and a
    # plain `!= 'None'` comparison would then flag those respondents as at risk.
    no_factor_reported = pd.isna(factor) or str(factor).strip().lower() == 'none'

    if row['Satisfaction_Level'] == 'Low' or not no_factor_reported:
        return 'At-Risk Customers'
    return 'Occasional Shoppers'

df['Customer_Segment'] = df.apply(assign_segment, axis=1)


In [None]:
df['Customer_Segment'].value_counts()


In [None]:
pd.crosstab(df['Customer_Segment'], df['age_group'])

In [None]:
pd.crosstab(df['Customer_Segment'], df['Gender'])

In [None]:
df[['Shopping_Satisfaction', 'Rating_Accuracy', 'Recommendation_Helpfulness']].dtypes

In [None]:
df['Recommendation_Helpfulness_Num'] = df['Recommendation_Helpfulness'].map({
    'No': 1,
    'Sometimes': 2,
    'Yes': 3
})

In [None]:
df['Shopping_Satisfaction'] = pd.to_numeric(df['Shopping_Satisfaction'], errors='coerce')
df['Rating_Accuracy'] = pd.to_numeric(df['Rating_Accuracy'], errors='coerce')
df['Recommendation_Helpfulness_Num'] = pd.to_numeric(
    df['Recommendation_Helpfulness_Num'], errors='coerce'
)

In [None]:
df.groupby('Customer_Segment')[[
    'Shopping_Satisfaction',
    'Rating_Accuracy',
    'Recommendation_Helpfulness_Num'
]].mean()

In [None]:
df.rename(columns={
    'Recommendation_Helpfulness_Num': 'Recommendation_Helpfulness_Score'
}, inplace=True)


In [None]:
df.columns

In [None]:
df['Recommendation_Helpfulness_Num'] = df['Recommendation_Helpfulness'].map({
    'No': 1,
    'Sometimes': 2,
    'Yes': 3
})

In [None]:
df[['Recommendation_Helpfulness', 'Recommendation_Helpfulness_Num']].head()

In [None]:
df['Recommendation_Helpfulness_Num'].isnull().sum()

In [None]:
# Clustering inputs: the three behavioural metrics, keeping only rows where all are present.
cluster_input_cols = [
    'Shopping_Satisfaction',
    'Rating_Accuracy',
    'Recommendation_Helpfulness_Num'
]
cluster_features = df.loc[:, cluster_input_cols].dropna(axis=0, how='any')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Scale only the three feature columns. This cell adds a 'Cluster' column to
# cluster_features below, so scaling the whole frame would feed the previous
# labels back in as a fourth feature whenever the cell is re-run.
feature_cols = [
    'Shopping_Satisfaction',
    'Rating_Accuracy',
    'Recommendation_Helpfulness_Num'
]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(cluster_features[feature_cols])

# n_init is set explicitly so the number of initialisations does not depend on
# the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_features['Cluster'] = kmeans.fit_predict(scaled_features)


In [None]:
df.loc[cluster_features.index, 'Behavior_Cluster'] = cluster_features['Cluster']

In [None]:
df.groupby('Behavior_Cluster')[[
    'Shopping_Satisfaction',
    'Rating_Accuracy',
    'Recommendation_Helpfulness_Num'
]].mean()


In [None]:
df.to_csv("../outputs/task3.csv", index=False)


In [None]:
#TASK 4: RECOMMENDATION & REVIEW INSIGHTS
df[['Recommendation_Helpfulness_Num', 'Shopping_Satisfaction']].corr()

In [None]:
import os

os.makedirs("../outputs/charts_task_4", exist_ok=True)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6,4))
sns.scatterplot(
    x='Recommendation_Helpfulness_Num',
    y='Shopping_Satisfaction',
    data=df
)
plt.title("Recommendation Helpfulness vs Shopping Satisfaction")
plt.tight_layout()
plt.savefig("../outputs/charts_task_4/recommendation_helpfulness_vs_satisfaction.png")
plt.show()

In [None]:
df['Review_Reliability'].value_counts()

In [None]:
review_rating = df.groupby('Review_Reliability')['Rating_Accuracy'].mean()
review_rating

In [None]:
import os
os.makedirs("../outputs/charts_task_4", exist_ok=True)

plt.figure(figsize=(6,4))
sns.barplot(
    x='Review_Reliability',
    y='Rating_Accuracy',
    data=df,
    estimator='mean'
)
plt.title("Review Reliability vs Rating Accuracy")
plt.tight_layout()
plt.savefig("../outputs/charts_task_4/review_reliability_vs_rating_accuracy.png")
plt.show()

In [None]:
df['Personalized_Recommendation_Frequency'].value_counts()

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(
    x='Personalized_Recommendation_Frequency',
    data=df,
    order=df['Personalized_Recommendation_Frequency'].value_counts().index
)
plt.title("Engagement with Personalized Recommendations")
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig("../outputs/charts_task_4/recommendation_engagement_frequency.png")
plt.show()

In [None]:
df.groupby('Personalized_Recommendation_Frequency')['Shopping_Satisfaction'].mean()

In [None]:
df.groupby('Customer_Segment')['Recommendation_Helpfulness_Num'].mean()

In [None]:
df.to_csv("../outputs/task_4.csv", index=False)

In [None]:
# Recommendations (Task 4)
# - Improve recommendation relevance for At-Risk customers to reduce churn.
# - Optimize recommendation frequency to avoid user fatigue.
# - Prioritize verified and reliable reviews in recommendation ranking.
# - Use behavioral clusters to personalize recommendations.
# - Enhance UI/UX to make recommendations clearer and less cluttered.

In [None]:
#task 5 : Visualization and Reporting
import os
os.makedirs("../outputs/charts_task_5", exist_ok=True)

In [None]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

categories = df['Purchase_Categories'].dropna().str.split(',')
flat_categories = [c.strip() for sub in categories for c in sub]

category_counts = Counter(flat_categories)
top_categories = dict(category_counts.most_common(8))

plt.figure(figsize=(7,4))
sns.barplot(x=list(top_categories.values()), y=list(top_categories.keys()))
plt.title("Top Purchased Product Categories")
plt.xlabel("Number of Customers")
plt.ylabel("Category")
plt.tight_layout()
plt.savefig("../outputs/charts_task_5/purchase_categories.png")
plt.show()


In [None]:
df['Product_Search_Method'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

os.makedirs("../outputs/charts_task_5", exist_ok=True)

plt.figure(figsize=(7,4))
sns.countplot(
    y='Product_Search_Method',
    data=df,
    order=df['Product_Search_Method'].value_counts().index
)
plt.title("Browsing Frequency Distribution (Product Search Methods)")
plt.xlabel("Number of Customers")
plt.ylabel("Browsing Method")
plt.tight_layout()
plt.savefig("../outputs/charts_task_5/browsing_frequency_distribution.png")
plt.show()


In [None]:
import os
os.listdir("../outputs/charts_task_5")

In [None]:
df['Shopping_Satisfaction'].value_counts().sort_index()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

os.makedirs("../outputs/charts_task_5", exist_ok=True)

plt.figure(figsize=(6,4))
sns.histplot(
    df['Shopping_Satisfaction'],
    bins=5,
    kde=True
)
plt.title("Distribution of Shopping Satisfaction")
plt.xlabel("Satisfaction Rating")
plt.ylabel("Number of Customers")
plt.tight_layout()
plt.savefig("../outputs/charts_task_5/satisfaction_levels.png")
plt.show()

In [None]:
# Insights
# - Most customers report medium to high satisfaction levels.
# - This indicates an overall positive shopping experience, with scope for improvement among low-satisfaction users.

In [None]:
os.listdir("../outputs/charts_task_5")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

os.makedirs("../outputs/charts_task_5", exist_ok=True)

plt.figure(figsize=(6,4))
sns.scatterplot(
    x='Recommendation_Helpfulness_Num',
    y='Shopping_Satisfaction',
    data=df
)
plt.title("Recommendation Helpfulness vs Shopping Satisfaction")
plt.xlabel("Recommendation Helpfulness (Score)")
plt.ylabel("Shopping Satisfaction")
plt.tight_layout()
plt.savefig("../outputs/charts_task_5/recommendation_vs_satisfaction.png")
plt.show()


In [None]:
os.listdir("../outputs/charts_task_5")