In [6]:
import pandas as pd

# Ganti path dengan salah satu dari dua pendekatan di atas
file_path = 'C:/Users/DELL/Downloads/Data/shopping_trends_updated.csv'

# Membaca file CSV ke dalam DataFrame
df = pd.read_csv(file_path)

# Tampilkan beberapa baris pertama dataset
print(df.head())


   Customer ID  Age Gender Item Purchased  Category  Purchase Amount (USD)  \
0            1   55   Male         Blouse  Clothing                     53   
1            2   19   Male        Sweater  Clothing                     64   
2            3   50   Male          Jeans  Clothing                     73   
3            4   21   Male        Sandals  Footwear                     90   
4            5   45   Male         Blouse  Clothing                     49   

        Location Size      Color  Season  Review Rating Subscription Status  \
0       Kentucky    L       Gray  Winter            3.1                 Yes   
1          Maine    L     Maroon  Winter            3.1                 Yes   
2  Massachusetts    S     Maroon  Spring            3.1                 Yes   
3   Rhode Island    M     Maroon  Spring            3.5                 Yes   
4         Oregon    M  Turquoise  Spring            2.7                 Yes   

   Shipping Type Discount Applied Promo Code Used  Previ

In [8]:
df.shape

(3900, 18)

In [9]:
df.info

<bound method DataFrame.info of       Customer ID  Age  Gender Item Purchased     Category  \
0               1   55    Male         Blouse     Clothing   
1               2   19    Male        Sweater     Clothing   
2               3   50    Male          Jeans     Clothing   
3               4   21    Male        Sandals     Footwear   
4               5   45    Male         Blouse     Clothing   
...           ...  ...     ...            ...          ...   
3895         3896   40  Female         Hoodie     Clothing   
3896         3897   52  Female       Backpack  Accessories   
3897         3898   46  Female           Belt  Accessories   
3898         3899   44  Female          Shoes     Footwear   
3899         3900   52  Female        Handbag  Accessories   

      Purchase Amount (USD)       Location Size      Color  Season  \
0                        53       Kentucky    L       Gray  Winter   
1                        64          Maine    L     Maroon  Winter   
2            

In [11]:
df.isnull().sum()

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [12]:
df.duplicated().sum()

0

In [13]:
#Data Preprocessing and Cleaning

In [27]:
# Mapping state names to state codes
state_codes = {
    'ALABAMA': 'AL', 'ALASKA': 'AK', 'ARIZONA': 'AZ', 'ARKANSAS': 'AR', 'CALIFORNIA': 'CA',
    'COLORADO': 'CO', 'CONNECTICUT': 'CT', 'DELAWARE': 'DE', 'FLORIDA': 'FL', 'GEORGIA': 'GA',
    'HAWAII': 'HI', 'IDAHO': 'ID', 'ILLINOIS': 'IL', 'INDIANA': 'IN', 'IOWA': 'IA',
    'KANSAS': 'KS', 'KENTUCKY': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD',
    'MASSACHUSETTS': 'MA', 'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS',
    'MISSOURI': 'MO', 'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW HAMPSHIRE': 'NH',
    'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC',
    'NORTH DAKOTA': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA',
    'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN',
    'TEXAS': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA',
    'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY'
}

df['State Code'] = df['Location'].str.upper().map(state_codes)

# Checking the transformation
transformed_data_check = df[['Location', 'State Code']].drop_duplicates().sort_values(by='Location')

print(transformed_data_check)

           Location State Code
16          Alabama         AL
41           Alaska         AK
39          Arizona         AZ
10         Arkansas         AR
22       California         CA
33         Colorado         CO
67      Connecticut         CT
12         Delaware         DE
24          Florida         FL
72          Georgia         GA
11           Hawaii         HI
60            Idaho         ID
36         Illinois         IL
37          Indiana         IN
79             Iowa         IA
32           Kansas         KS
0          Kentucky         KY
7         Louisiana         LA
1             Maine         ME
50         Maryland         MD
2     Massachusetts         MA
265        Michigan         MI
106       Minnesota         MN
17      Mississippi         MS
9          Missouri         MO
6           Montana         MT
78         Nebraska         NE
26           Nevada         NV
13    New Hampshire         NH
49       New Jersey         NJ
54       New Mexico         NM
14      

In [17]:
#Categorizing clothing items for future analysis

In [18]:
def categorize_item(item):
    category_mapping = {
        'Tops': ['Blouse', 'Coat', 'Dress', 'Gloves', 'Hoodie', 'Jacket', 'Shirt', 'Sweater', 'T-shirt'],
        'Bottoms': ['Jeans', 'Pants', 'Shorts', 'Skirt'],
        'Footwear': ['Boots', 'Sandals', 'Shoes', 'Sneakers'],
        'Accessories': ['Belt', 'Handbag', 'Hat', 'Jewelry', 'Sunglasses']
    }

    for category, items in category_mapping.items():
        if item in items:
            return category

    return 'Other'

# Applying the categorization to the dataset
df['Item Category'] = df['Item Purchased'].apply(categorize_item)


In [36]:
import plotly.express as px

# Membuat pemetaan dari State Code ke Location
state_location_mapping = df.groupby('State Code')['Location'].first().reset_index()

# Visualisasi 1: Segmentasi Pelanggan Geografis
customer_distribution = df['State Code'].value_counts().reset_index()
customer_distribution.columns = ['State Code', 'Number of Customers']

# Menggabungkan dengan pemetaan untuk mendapatkan Location
customer_distribution = pd.merge(customer_distribution, state_location_mapping, on='State Code')

fig1 = px.choropleth(
    customer_distribution,
    locations='State Code',
    locationmode="USA-states",
    color='Number of Customers',
    hover_data=['Location', 'Number of Customers'],  # Menambahkan 'Location' ke dalam hover_data
    scope="usa",
    title='Distribution of Customers Across US States',
    color_continuous_scale='RdBu'
)
fig1.show()