In [1]:
# pandas reads your CSV and turns it into a "DataFrame" (a table)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings # Warnings - control annoying messages
warnings.filterwarnings("ignore")
# Display settings: show every column of a DataFrame (None = no limit) and up to 200 rows.
# The option names are written out in full. pandas resolves abbreviated names by
# regex matching, which can turn ambiguous (and raise) once a new option with a
# similar name is added in a later release.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
# matplotlib: default chart size is 12 wide x 6 tall (inches) for every figure
plt.rcParams.update({'figure.figsize': (12, 6)})
# seaborn: white background with grid lines ...
sns.set_style("whitegrid")
# ... and the 'husl' colour palette for plotted series
sns.set_palette('husl')

# Record the library versions used for this run: pandas on the first line, NumPy on the second.
print(pd.__version__, np.__version__, sep="\n")


2.2.3
2.2.2


In [None]:
# pd.read_csv() reads the CSV file and builds a DataFrame.
# '../' means "go one folder UP" from notebooks/ to reach data/.
# The path is relative and uses forward slashes: the previous hard-coded
# absolute Windows path only worked on one machine, and its backslashes sat in a
# normal (non-raw) string, where sequences like "\p" and "\d" are invalid escapes.
# NOTE(review): assumes the kernel's working directory is the notebooks/ folder — confirm.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column, which looks like
# an index written out by an earlier to_csv() call — consider dropping it later.
df = pd.read_csv("../data/E-commerce_Customer_Behavior.csv")
print(df.shape)  # (number of rows, number of columns)


(348, 12)


# First Peek at the Data

In [9]:
df.head(n=5)  # first 5 rows (5 is also what head() uses when n is omitted)

Unnamed: 0.1,Unnamed: 0,customer_id,gender,age,city,membership_type,total_spend,items_purchased,average_rating,discount_applied,days_since_last_purchase,satisfaction_level
0,0,101,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
1,1,102,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
2,2,103,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
3,3,104,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
4,4,105,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied


In [11]:
df.tail(n=5)  # last 5 rows (5 is also what tail() uses when n is omitted)

Unnamed: 0.1,Unnamed: 0,customer_id,gender,age,city,membership_type,total_spend,items_purchased,average_rating,discount_applied,days_since_last_purchase,satisfaction_level
343,345,446,Male,32,Miami,Silver,660.3,10,3.8,True,42,Unsatisfied
344,346,447,Female,36,Houston,Bronze,470.5,8,3.0,False,27,Neutral
345,347,448,Female,30,New York,Gold,1190.8,16,4.5,True,28,Satisfied
346,348,449,Male,34,Los Angeles,Silver,780.2,11,4.2,False,21,Neutral
347,349,450,Female,43,Chicago,Bronze,515.75,10,3.3,True,49,Unsatisfied


In [None]:
# head() and tail() only cover the two ends of the table.
# Drawing 10 rows at random also shows what the MIDDLE of the data looks like,
# like opening a book at random pages instead of reading only the first and last.
#
# random_state=42 fixes the random seed, so the SAME 10 rows are picked on
# every run and the analysis stays reproducible.
df.sample(n=10, random_state=42)

Unnamed: 0.1,Unnamed: 0,customer_id,gender,age,city,membership_type,total_spend,items_purchased,average_rating,discount_applied,days_since_last_purchase,satisfaction_level
255,257,358,Female,31,New York,Gold,1160.6,15,4.5,True,29,Satisfied
114,115,216,Male,35,Los Angeles,Silver,810.9,12,4.3,False,13,Neutral
314,316,417,Female,37,Houston,Bronze,450.8,7,3.4,False,30,Neutral
268,270,371,Male,35,Los Angeles,Silver,800.9,12,4.1,False,17,Neutral
167,169,270,Male,34,Los Angeles,Silver,800.2,11,4.1,False,17,Neutral
124,125,226,Female,38,Houston,Bronze,440.9,8,3.2,False,24,Neutral
203,205,306,Female,42,Chicago,Bronze,495.25,10,3.5,True,35,Unsatisfied
224,226,327,Female,37,Houston,Bronze,430.8,7,3.4,False,23,Neutral
284,286,387,Female,36,Houston,Bronze,440.5,8,3.1,False,23,Neutral
153,155,256,Female,37,Houston,Bronze,430.8,7,3.4,False,23,Neutral


# UNDERSTAND THE COLUMNS DEEPLY

In [None]:
# .dtypes shows the DATA TYPE of each column
# WHY IS DATA TYPE IMPORTANT?
# You can't do math on text. You can't sort numbers stored as text.
# The data type tells pandas HOW to treat each column.

print(df.dtypes)                        # WHAT THE TYPES MEAN (examples are columns of this dataset):
                                        # int64   → Whole numbers (1, 2, 100, 500)
                                        #            Example: customer_id, age, items_purchased
                                        #
                                        # float64 → Decimal numbers (1.5, 99.99, 3.14)
                                        #            Example: total_spend, average_rating
                                        #
                                        # object  → Text / Mixed (stored as Python strings)
                                        #            Example: gender, city, membership_type, satisfaction_level
                                        #
                                        # bool    → True or False
                                        #            Example: discount_applied
                                        #
                                        # datetime64 → Dates and times
                                        #            (no column in this dataset is stored as a date)

Unnamed: 0                    int64
customer_id                   int64
gender                       object
age                           int64
city                         object
membership_type              object
total_spend                 float64
items_purchased               int64
average_rating              float64
discount_applied               bool
days_since_last_purchase      int64
satisfaction_level           object
dtype: object


 # THE FULL PICTURE IN ONE TABLE

In [16]:
# Build a one-row-per-column overview of df: dtype, completeness, cardinality
# and one example value per column.
n_rows = len(df)

# Example value = first NON-missing value of each column (None when the column
# is entirely missing or df has no rows). Taking df[col].iloc[0] directly would
# raise IndexError on an empty frame and would show NaN whenever the first row
# happens to be missing.
sample_values = []
for col in df.columns:
    present = df[col].dropna()
    sample_values.append(present.iloc[0] if len(present) else None)

summary = pd.DataFrame({
    'Column Name'    : df.columns,
    'Data Type'      : df.dtypes.values,
    'Total Values'   : n_rows,                      # scalar, repeated on every row
    'Non-Null Count' : df.count().values,           # How many have actual values
    'Null Count'     : df.isnull().sum().values,    # How many are MISSING
    'Unique Values'  : df.nunique().values,         # How many distinct (non-null) values
    'Sample Value'   : sample_values                # First actual (non-null) value
})

# Add a "Null %" column so we can easily see problem columns
summary['Null %'] = (summary['Null Count'] / n_rows * 100).round(1)

# Sort by most nulls first — so problems jump out immediately.
# kind='stable' keeps columns with the same null count in their original order,
# so the table layout does not depend on the sort algorithm's tie handling.
summary = summary.sort_values('Null Count', ascending=False, kind='stable')
print("COMPLETE COLUMN SUMMARY")
print(summary.to_string(index=False))

COMPLETE COLUMN SUMMARY
             Column Name Data Type  Total Values  Non-Null Count  Null Count  Unique Values Sample Value  Null %
              Unnamed: 0     int64           348             348           0            348            0     0.0
             customer_id     int64           348             348           0            348          101     0.0
                  gender    object           348             348           0              2       Female     0.0
                     age     int64           348             348           0             16           29     0.0
                    city    object           348             348           0              6     New York     0.0
         membership_type    object           348             348           0              3         Gold     0.0
             total_spend   float64           348             348           0             76       1120.2     0.0
         items_purchased     int64           348             348        

# Descriptive Statistics (Making Numbers Speak)

In [19]:
# describe() summarises each numeric column (count, mean, std, min, quartiles, max).
# transpose() is the long form of .T: it flips rows and columns, so every ROW of
# the result is one column of df, which is much easier to read.
stats = df.describe().transpose()
print(stats)

                          count        mean         std    min     25%    50%  \
Unnamed: 0                348.0  174.887931  101.304611    0.0   87.75  175.5   
customer_id               348.0  275.887931  101.304611  101.0  188.75  276.5   
age                       348.0   33.577586    4.878024   26.0   30.00   32.0   
total_spend               348.0  847.793103  361.692375  410.8  505.75  780.2   
items_purchased           348.0   12.632184    4.146079    7.0    9.00   12.0   
average_rating            348.0    4.023563    0.579145    3.0    3.50    4.1   
days_since_last_purchase  348.0   26.614943   13.474750    9.0   15.00   23.0   

                              75%     max  
Unnamed: 0                 262.25   349.0  
customer_id                363.25   450.0  
age                         37.00    43.0  
total_spend               1160.60  1520.1  
items_purchased             15.00    21.0  
average_rating               4.50     4.9  
days_since_last_purchase    38.00    63.0  