In [2]:
import pandas
import numpy
from matplotlib import pyplot as plt
import seaborn as sns
import streamlit as st

In [3]:
data = pandas.read_csv('customer_shopping_behavior.csv')
df = pandas.DataFrame(data)
df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3863 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [5]:
df.describe(include='all')

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
count,3900.0,3900.0,3900,3900,3900,3900.0,3900,3900,3900,3900,3863.0,3900,3900,3900,3900,3900.0,3900,3900
unique,,,2,25,4,,50,4,25,4,,2,6,2,2,,6,7
top,,,Male,Blouse,Clothing,,Montana,M,Olive,Spring,,No,Free Shipping,No,No,,PayPal,Every 3 Months
freq,,,2652,171,1737,,96,1755,177,999,,2847,675,2223,2223,,677,584
mean,1950.5,44.068462,,,,59.764359,,,,,3.750065,,,,,25.351538,,
std,1125.977353,15.207589,,,,23.685392,,,,,0.716983,,,,,14.447125,,
min,1.0,18.0,,,,20.0,,,,,2.5,,,,,1.0,,
25%,975.75,31.0,,,,39.0,,,,,3.1,,,,,13.0,,
50%,1950.5,44.0,,,,60.0,,,,,3.8,,,,,25.0,,
75%,2925.25,57.0,,,,81.0,,,,,4.4,,,,,38.0,,


In [6]:
df.isna().sum()

Customer ID                0
Age                        0
Gender                     0
Item Purchased             0
Category                   0
Purchase Amount (USD)      0
Location                   0
Size                       0
Color                      0
Season                     0
Review Rating             37
Subscription Status        0
Shipping Type              0
Discount Applied           0
Promo Code Used            0
Previous Purchases         0
Payment Method             0
Frequency of Purchases     0
dtype: int64

In [7]:
df.dtypes

Customer ID                 int64
Age                         int64
Gender                     object
Item Purchased             object
Category                   object
Purchase Amount (USD)       int64
Location                   object
Size                       object
Color                      object
Season                     object
Review Rating             float64
Subscription Status        object
Shipping Type              object
Discount Applied           object
Promo Code Used            object
Previous Purchases          int64
Payment Method             object
Frequency of Purchases     object
dtype: object

In [8]:
df['Review Rating'] = df['Review Rating'].fillna(df['Review Rating'].mean())


In [9]:
df.isna().sum()

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [10]:
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ','_')
df = df.rename(columns={'purchase_amount_(usd)': 'purchase_amount'})
df.columns

Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',
       'purchase_amount', 'location', 'size', 'color', 'season',
       'review_rating', 'subscription_status', 'shipping_type',
       'discount_applied', 'promo_code_used', 'previous_purchases',
       'payment_method', 'frequency_of_purchases'],
      dtype='object')

In [11]:
# create a coloummn age_group
labels = ['young Adult','Adult','Middle-aged','Senior']
df['age_group'] = pandas.qcut(df['age'],q=4,labels= labels)

In [12]:
df[['age','age_group']].head(9)

Unnamed: 0,age,age_group
0,55,Middle-aged
1,19,young Adult
2,50,Middle-aged
3,21,young Adult
4,45,Middle-aged
5,46,Middle-aged
6,63,Senior
7,27,young Adult
8,26,young Adult


In [13]:
#  create purchase_frequency_days

frequency_mapping = {
    'Fortnightly': 14,
    'Weekly' : 7,
    'Montly' : 30,
    'Quarterly': 90,
    'Bi-Weekly': 14,
    'Annually': 365,
    'Every 3 month':90
}

df['purchase_frequency_days'] = df['frequency_of_purchases'].map(frequency_mapping)

In [14]:
df[['purchase_frequency_days','frequency_of_purchases']]

Unnamed: 0,purchase_frequency_days,frequency_of_purchases
0,14.0,Fortnightly
1,14.0,Fortnightly
2,7.0,Weekly
3,7.0,Weekly
4,365.0,Annually
...,...,...
3895,7.0,Weekly
3896,14.0,Bi-Weekly
3897,90.0,Quarterly
3898,7.0,Weekly


In [15]:
df[['discount_applied','promo_code_used']].head(10)

Unnamed: 0,discount_applied,promo_code_used
0,Yes,Yes
1,Yes,Yes
2,Yes,Yes
3,Yes,Yes
4,Yes,Yes
5,Yes,Yes
6,Yes,Yes
7,Yes,Yes
8,Yes,Yes
9,Yes,Yes


In [16]:
(df['discount_applied'] == df['promo_code_used']).all()

np.True_

In [17]:
df = df.drop('promo_code_used',axis=1)
df.dtypes

customer_id                   int64
age                           int64
gender                       object
item_purchased               object
category                     object
purchase_amount               int64
location                     object
size                         object
color                        object
season                       object
review_rating               float64
subscription_status          object
shipping_type                object
discount_applied             object
previous_purchases            int64
payment_method               object
frequency_of_purchases       object
age_group                  category
purchase_frequency_days     float64
dtype: object

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   customer_id              3900 non-null   int64   
 1   age                      3900 non-null   int64   
 2   gender                   3900 non-null   object  
 3   item_purchased           3900 non-null   object  
 4   category                 3900 non-null   object  
 5   purchase_amount          3900 non-null   int64   
 6   location                 3900 non-null   object  
 7   size                     3900 non-null   object  
 8   color                    3900 non-null   object  
 9   season                   3900 non-null   object  
 10  review_rating            3900 non-null   float64 
 11  subscription_status      3900 non-null   object  
 12  shipping_type            3900 non-null   object  
 13  discount_applied         3900 non-null   object  
 14  previous

In [20]:
df=df.drop(columns=['customer_id','purchase_frequency_days'],axis=1)


In [21]:
df['subscription_status'] = df['subscription_status'].map({"Yes":1, "No":0})
df['age_group'] = df['age_group'].map({"young Adult":1,"Adult":2,"Middle-aged":0,"Senior":3})


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


for i in list(df):
    if df[i].dtype == object:
        df[i] = le.fit_transform(df[i])


df.head(10)

Unnamed: 0,age,gender,item_purchased,category,purchase_amount,location,size,color,season,review_rating,subscription_status,shipping_type,discount_applied,previous_purchases,payment_method,frequency_of_purchases,age_group
0,55,1,2,1,53,16,0,7,3,3.1,1,1,1,14,5,3,0
1,19,1,23,1,64,18,0,12,3,3.1,1,1,1,2,1,3,1
2,50,1,11,1,73,20,2,12,1,3.1,1,2,1,23,2,6,0
3,21,1,14,2,90,38,1,12,1,3.5,1,3,1,49,4,6,1
4,45,1,2,1,49,36,1,21,1,2.7,1,2,1,31,4,0,0
5,46,1,20,2,20,49,1,23,2,2.9,1,4,1,14,5,6,0
6,63,1,16,1,85,25,1,7,0,3.2,1,2,1,49,1,5,3
7,27,1,18,1,34,17,0,4,3,3.2,1,2,1,19,2,6,1
8,26,1,4,3,97,47,0,19,2,2.6,1,1,1,8,5,0,1
9,57,1,7,0,31,24,1,16,1,4.8,1,0,1,4,1,5,0


In [23]:
y = df['subscription_status']

In [24]:
x = df.drop(columns=['subscription_status'])
x

Unnamed: 0,age,gender,item_purchased,category,purchase_amount,location,size,color,season,review_rating,shipping_type,discount_applied,previous_purchases,payment_method,frequency_of_purchases,age_group
0,55,1,2,1,53,16,0,7,3,3.1,1,1,14,5,3,0
1,19,1,23,1,64,18,0,12,3,3.1,1,1,2,1,3,1
2,50,1,11,1,73,20,2,12,1,3.1,2,1,23,2,6,0
3,21,1,14,2,90,38,1,12,1,3.5,3,1,49,4,6,1
4,45,1,2,1,49,36,1,21,1,2.7,2,1,31,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,40,0,9,1,28,45,0,21,2,4.2,0,0,32,5,6,2
3896,52,0,0,0,49,14,0,23,1,4.5,5,0,41,0,1,0
3897,46,0,1,0,33,29,0,8,1,2.9,4,0,24,5,5,0
3898,44,0,17,2,77,22,2,3,2,3.8,1,0,24,5,6,2


In [43]:
df.to_csv("new_df.csv",index=False)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   age                     3900 non-null   int64   
 1   gender                  3900 non-null   int64   
 2   item_purchased          3900 non-null   int64   
 3   category                3900 non-null   int64   
 4   purchase_amount         3900 non-null   int64   
 5   location                3900 non-null   int64   
 6   size                    3900 non-null   int64   
 7   color                   3900 non-null   int64   
 8   season                  3900 non-null   int64   
 9   review_rating           3900 non-null   float64 
 10  subscription_status     3900 non-null   int64   
 11  shipping_type           3900 non-null   int64   
 12  discount_applied        3900 non-null   int64   
 13  previous_purchases      3900 non-null   int64   
 14  payment_method          

In [26]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [27]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
y_pred

array([1, 0, 1, ..., 1, 0, 0], shape=(1170,))

In [28]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.8273504273504273


In [29]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[694 150]
 [ 52 274]]
