In [4]:
import pandas as pd
import numpy as np

# Read the Superstore export into a DataFrame.
df = pd.read_csv('datasets/Superstore.csv')

# Normalise every header to snake_case: lowercase it, then turn spaces and
# dashes into underscores so columns can be addressed with plain identifiers.
df.columns = [
    header.lower().replace(' ', '_').replace('-', '_')
    for header in df.columns
]

# Force 'sales' to a numeric dtype; values that cannot be parsed become NaN.
df['sales'] = pd.to_numeric(df['sales'], errors='coerce')

# Project the transaction columns once; the three previews below share them.
transaction_columns = ['order_id', 'order_date', 'sales', 'quantity', 'discount', 'profit']
transactions = df[transaction_columns]

# 1. First 4 transactions
first_4 = transactions.head(n=4)
print('First 4 transactions:', first_4, sep='\n')

# 2. Last 4 transactions
last_4 = transactions.tail(n=4)
print('Last 4 transactions:', last_4, sep='\n')

# 3. 4 random transactions. The seed makes the draw reproducible;
#    123456 is a placeholder -- replace it with your registration number.
random_4 = transactions.sample(n=4, random_state=123456)
print('4 random transactions:', random_4, sep='\n')

# 4. Code for column standardization (already shown above)

# 5. Total samples, columns, memory usage
# df.shape is (rows, columns); deep=True also measures the contents of
# object columns instead of only their pointers.
total_samples, total_columns = df.shape
memory_usage = df.memory_usage(deep=True).sum()
print('Total samples:', total_samples)
print('Total columns:', total_columns)
print('Memory usage (bytes):', memory_usage)

# 6. Number of float, int64, object columns
# The column count of each dtype-filtered frame is its second shape entry.
float_cols = df.select_dtypes(include='float').shape[1]
int64_cols = df.select_dtypes(include='int64').shape[1]
object_cols = df.select_dtypes(include='object').shape[1]
print('Number of float columns:', float_cols)
print('Number of int64 columns:', int64_cols)
print('Number of object columns:', object_cols)

# 7. Summary of all categorical variables
# describe() restricted to object columns.
categorical_summary = df.describe(include='object')
print('Categorical summary:', categorical_summary, sep='\n')

# 8. Count, unique, top, freq for selected columns
selected_cols = ['order_date', 'ship_date', 'country', 'city', 'state', 'product_name']
selected_summary = df.loc[:, selected_cols].describe()
print('Selected columns summary:', selected_summary, sep='\n')

# 9. List segments and their counts (rows per segment, most frequent first)
segment_counts = df.loc[:, 'segment'].value_counts()
print('Segment counts:', segment_counts, sep='\n')

# 10. List countries and their counts
country_counts = df.loc[:, 'country'].value_counts()
print('Country counts:', country_counts, sep='\n')

# 11. List regions and their counts
region_counts = df.loc[:, 'region'].value_counts()
print('Region counts:', region_counts, sep='\n')

# 12. Numeric column stats
# Keep only float/int columns, then summarise them.
numeric_frame = df.select_dtypes(include=['float', 'int'])
numeric_stats = numeric_frame.describe()
print('Numeric column stats:', numeric_stats, sep='\n')

# 13. Total sale by region
sales_by_region = df['sales'].groupby(df['region']).sum()
print('Total sales by region:', sales_by_region, sep='\n')

# 14. Total profit by category
profit_by_category = df['profit'].groupby(df['category']).sum()
print('Total profit by category:', profit_by_category, sep='\n')

# 15. Average sale by discount level (one group per distinct discount value)
avg_sale_by_discount = df['sales'].groupby(df['discount']).mean()
print('Average sale by discount level:', avg_sale_by_discount, sep='\n')

# 16. Average profit by ShipMode
avg_profit_by_shipmode = df['profit'].groupby(df['ship_mode']).mean()
print('Average profit by ShipMode:', avg_profit_by_shipmode, sep='\n')

# 17. Number of orders by segment
# Each row is a line item and one order_id can span several rows, so a plain
# row count per segment overstates the number of orders. Count distinct
# order IDs per segment instead, largest first.
orders_by_segment = (
    df.groupby('segment')['order_id']
    .nunique()
    .sort_values(ascending=False)
)
print('Number of orders by segment:')
print(orders_by_segment)

# 18. Correlation check (single line)
correlation = df.loc[:, ['sales', 'profit', 'quantity', 'discount']].corr()
print('Correlation matrix:', correlation, sep='\n')

# 19. Skewness of sales, profit, quantity, discount
# Each column is measured on its own.
sales_skew = df.loc[:, 'sales'].skew()
profit_skew = df.loc[:, 'profit'].skew()
quantity_skew = df.loc[:, 'quantity'].skew()
discount_skew = df.loc[:, 'discount'].skew()
print('Sales skewness:', sales_skew)
print('Profit skewness:', profit_skew)
print('Quantity skewness:', quantity_skew)
print('Discount skewness:', discount_skew)

# 20. Number of orders for each product where discount < 0.2
# Filter to the low-discount rows first, then count the non-null order IDs
# recorded against each product name.
low_discount_rows = df.loc[df['discount'].lt(0.2)]
orders_per_product_discount_lt_20 = (
    low_discount_rows.groupby('product_name')['order_id'].count()
)
print('Orders per product with discount < 0.2:', orders_per_product_discount_lt_20, sep='\n')

# 21. Save total sales by state to CSV, and get sales for Alabama, Arizona, California
sales_by_state = df.groupby('state')['sales'].sum()
sales_by_state.to_csv('total_sales_by_state.csv')
# .get() falls back to 0 when a state has no rows in the data.
alabama_sales, arizona_sales, california_sales = (
    sales_by_state.get(state_name, 0)
    for state_name in ('Alabama', 'Arizona', 'California')
)
print('Sales for Alabama:', alabama_sales)
print('Sales for Arizona:', arizona_sales)
print('Sales for California:', california_sales)

# 22. Most common discount level and its count
# mode() can return several values on a tie; take the first one.
most_common_discount = df['discount'].mode().iloc[0]
most_common_discount_count = df['discount'].eq(most_common_discount).sum()
print('Most common discount level:', most_common_discount)
print('Count of most common discount level:', most_common_discount_count)

# 23. Region with highest total profit and amount
# Profit must be aggregated separately: the per-region sales totals cannot
# tell us which region earns the most profit.
profit_by_region = df.groupby('region')['profit'].sum()
region_highest_profit = profit_by_region.idxmax()
highest_profit_amount = profit_by_region.max()
print('Region with highest total profit:', region_highest_profit)
print('Highest profit amount:', highest_profit_amount)

# 24. Does region with highest profit also have highest sales?
region_highest_sales = sales_by_region.idxmax()
highest_sales_amount = sales_by_region.max()
# Rank regions on each measure independently; rank 1.0 is the largest total.
profit_ranking = profit_by_region.rank(ascending=False)
sales_ranking = sales_by_region.rank(ascending=False)
# Profit rank minus sales rank for the top-selling region: 0 means that
# region also leads on profit; a positive value is how many places lower
# it sits when regions are ordered by profit.
difference_in_ranking = profit_ranking[region_highest_sales] - sales_ranking[region_highest_sales]
print('Region with highest sales:', region_highest_sales)
print('Highest sales amount:', highest_sales_amount)
print('Difference in ranking:', difference_in_ranking)

# 25. Distribution of discount levels beyond top few, and unusual discounts
# Tally how often each discount value occurs, ordered from most to least common.
discount_frequency = df['discount'].value_counts()
discount_counts = discount_frequency.sort_values(ascending=False)
# "Unusual" means a discount level that appears on fewer than 5 rows.
is_rare = discount_counts.lt(5)
unusual_discounts = discount_counts.loc[is_rare]
print('Discount counts:', discount_counts, sep='\n')
print('Unusual discounts (less than 5 occurrences):', unusual_discounts, sep='\n')

First 4 transactions:
         order_id  order_date     sales  quantity  discount    profit
0  CA-2016-152156   11/8/2016  261.9600       2.0      0.00   41.9136
1  CA-2016-152156   11/8/2016  731.9400       3.0      0.00  219.5820
2  CA-2016-138688   6/12/2016   14.6200       2.0      0.00    6.8714
3  US-2015-108966  10/11/2015  957.5775       5.0      0.45 -383.0310
Last 4 transactions:
            order_id order_date    sales  quantity  discount   profit
9990  CA-2017-121258  2/26/2017   91.960       2.0       0.0  15.6332
9991  CA-2017-121258  2/26/2017  258.576       2.0       0.2  19.3932
9992  CA-2017-121258  2/26/2017   29.600       4.0       0.0  13.3200
9993  CA-2017-119914   5/4/2017  243.160       2.0       0.0  72.9480
4 random transactions:
            order_id  order_date    sales  quantity  discount    profit
3365  CA-2014-115161   1/31/2014  290.666       2.0      0.15    3.4196
2537  CA-2015-168746   1/27/2015  431.976       4.0      0.40 -100.7944
4078  CA-2015-1006