# E-commerce Customer Analysis

Brazilian E-commerce Public Dataset

Author: Christos Papakostas

## 1. Load and Prepare Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob

# Read the Olist CSV exports (paths are relative to the working directory).
# Timestamp columns are parsed while loading so later cells can subtract them directly.
order_date_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
review_date_columns = ['review_creation_date', 'review_answer_timestamp']

orders = pd.read_csv('olist_orders_dataset.csv', parse_dates=order_date_columns)
order_items = pd.read_csv('olist_order_items_dataset.csv')
payments = pd.read_csv('olist_order_payments_dataset.csv')
customers = pd.read_csv('olist_customers_dataset.csv')
products = pd.read_csv('olist_products_dataset.csv')
reviews = pd.read_csv('olist_order_reviews_dataset.csv', parse_dates=review_date_columns)

## 2. Feature Engineering (Delivery & Delay)

In [None]:
# Delivery timing features, computed only for orders with status 'delivered'.
# The copy gives an independent frame, so the new columns are not written
# onto a slice of `orders`.
is_delivered = orders['order_status'] == 'delivered'
delivered_orders = orders.loc[is_delivered].copy()

purchased_at = delivered_orders['order_purchase_timestamp']
actual_span = delivered_orders['order_delivered_customer_date'] - purchased_at
promised_span = delivered_orders['order_estimated_delivery_date'] - purchased_at

# Whole days from purchase to the actual and to the estimated delivery.
delivered_orders['delivery_days'] = actual_span.dt.days
delivered_orders['estimated_days'] = promised_span.dt.days

# Positive delay: delivered later than estimated; negative: delivered earlier.
delivered_orders['delay'] = (
    delivered_orders['delivery_days'] - delivered_orders['estimated_days']
)

## 3. RFM Segmentation

In [None]:
# RFM (recency / frequency / monetary) table with one row per customer_unique_id.

# Reference point for recency: the most recent purchase among delivered orders.
latest_date = delivered_orders['order_purchase_timestamp'].max()

# The grouping key customer_unique_id is not produced by the orders/payments
# merge, so attach it from `customers` (joined on customer_id) unless
# delivered_orders already carries it.
rfm_orders = delivered_orders
if 'customer_unique_id' not in rfm_orders.columns:
    rfm_orders = rfm_orders.merge(
        customers[['customer_id', 'customer_unique_id']], on='customer_id'
    )

# Inner join: one output row per payment record of each delivered order.
# NOTE(review): payments presumably holds several rows for an order that was
# paid in more than one part - confirm against the dataset.
rfm_data = rfm_orders.merge(payments, on='order_id')

by_customer = rfm_data.groupby('customer_unique_id')
rfm = pd.DataFrame({
    # Days between the customer's last purchase and the reference date.
    'recency': (latest_date - by_customer['order_purchase_timestamp'].max()).dt.days,
    # Distinct orders; a plain row count would tally payment rows instead.
    'frequency': by_customer['order_id'].nunique(),
    # Total paid across all payment rows of the customer.
    'monetary': by_customer['payment_value'].sum(),
}).reset_index()

# Quintile scores, 5 = best. Recency labels are reversed because a smaller
# recency (more recent purchase) is better.
rfm['r_score'] = pd.qcut(rfm['recency'], 5, labels=[5, 4, 3, 2, 1])
# Frequency is heavily tied, so it is ranked first to keep the bin edges unique.
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])
rfm['m_score'] = pd.qcut(rfm['monetary'], 5, labels=[1, 2, 3, 4, 5])

# Three-character segment code: R, F and M scores concatenated, e.g. '545'.
rfm['rfm_score'] = (
    rfm['r_score'].astype(str)
    + rfm['f_score'].astype(str)
    + rfm['m_score'].astype(str)
)

## 4. Payment Type Analysis

In [None]:
# Mean payment value per payment type, highest average first.
payment_avg = (
    payments
    .groupby('payment_type')['payment_value']
    .mean()
    .sort_values(ascending=False)
    .reset_index()  # back to a two-column frame: payment_type, payment_value
)
print(payment_avg)

## 5. Review Sentiment

In [None]:
# Flag reviews that include a written comment and score the sentiment of
# those comments. Reviews without a comment get NaN polarity, because the
# assignment aligns on the index.
# NOTE(review): TextBlob's default analyzer is English-oriented, while these
# comments are presumably Portuguese - confirm the scores are meaningful.
comment_text = reviews['review_comment_message']
reviews['has_comment'] = comment_text.notna()


def _comment_polarity(message):
    """Return the TextBlob sentiment polarity of one comment."""
    return TextBlob(message).sentiment.polarity


reviews['polarity'] = comment_text.dropna().apply(_comment_polarity)