In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
sns.set()
# Added to hide the seaborn distplot deprecation warning.
# NOTE: with no category/module filter this suppresses EVERY warning for the
# rest of the session, not only the distplot one.
warnings.filterwarnings("ignore")

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
# Only the column limit is changed here; row truncation keeps the pandas default.
pd.set_option('display.max_columns', None)

In [None]:
# Build the dataset path from individual components so os.path.join inserts
# the platform separator everywhere (no hard-coded "/" inside a component).
path = os.path.join(os.path.expanduser("~"), "project", "data", "shopping_behavior_updated.csv")

In [None]:
df = pd.read_csv(path)

In [None]:
label = {}

In [None]:
def remove_outliers(df, column_name):
    """Add a copy of ``column_name`` in which IQR outliers are replaced by NaN.

    The original column is left untouched. The cleaned values are written,
    in place, to a new column named ``f'{column_name}c*'`` on ``df``.
    Values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` become NaN; values
    that fail both comparisons (e.g. existing NaN) also end up as NaN.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    column_name : name of the column to clean.

    Returns
    -------
    None
    """
    series = df[column_name]

    # Quartiles and the 1.5 * IQR whisker length
    first_quartile, third_quartile = (series.quantile(p) for p in (0.25, 0.75))
    whisker = 1.5 * (third_quartile - first_quartile)

    # Inclusive fence around the "normal" range
    low_fence = first_quartile - whisker
    high_fence = third_quartile + whisker
    inside_fence = (series >= low_fence) & (series <= high_fence)

    # Keep in-range values, blank out everything else
    df[f'{column_name}c*'] = np.where(inside_fence, series, np.nan)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include="object")

# **<font size=21px color="red">INDEX</font>**

### [--Customer-ID](#Customer-ID)
### [--Age](#Age)
### [--Gender](#Gender)
### [--Item-Purchased](#Item-Purchased)
### [--Category](#Category)
### [--Purchase-Amount-(USD)](#Purchase-Amount-(USD))
### [--Location](#Location)
### [--Size](#Size)
### [--Color](#Color)
### [--Season](#Season)
### [--Review-Rating](#Review-Rating)
### [--Subscription-Status](#Subscription-Status)
### [--Shipping-Type](#Shipping-Type)
### [--Discount-Applied](#Discount-Applied)
### [--Promo-Code-Used](#Promo-Code-Used)
### [--Previous-Purchases](#Previous-Purchases)
### [--Payment-Method](#Payment-Method)
### [--Frequency-of-Purchases](#Frequency-of-Purchases)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Customer ID</font>**

In [None]:
print(f"{df['Customer ID'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Customer ID'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Customer ID'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Customer ID] == {}'.format(df['Customer ID'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Customer ID] == {df['Customer ID'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Customer ID'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Customer ID'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Customer ID'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Customer ID'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Customer ID'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Customer ID'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Customer ID = dict(zip(key,values)) # label data in 'key': value pair
# label['Customer ID'] = Customer ID # store data label dict in main dict actually it's updating/

In [None]:
# df['Customer ID'].replace(Customer ID, inplace=True)
# print('Modified DataFrame:')
# print(df['Customer ID'].head(10))

In [None]:
df['Customer ID'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Customer ID'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Customer ID'], inplace=True)

In [None]:
# mean = df['Customer ID'].mean()
# print(mean)

# median = df['Customer ID'].median()
# print(median)

# mode = df['Customer ID'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Customer ID'].fillna(df['Customer ID'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Customer ID'].fillna(df['Customer ID'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Customer ID'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Customer ID'].fillna(df['Customer ID'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Customer ID')

In [None]:
# print('null value in Customer IDc* == {}'.format(df['Customer IDc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Customer IDc*'].fillna(df['Customer IDc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Customer IDc*'].fillna(df['Customer IDc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Customer IDc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Customer IDc*'].fillna(df['Customer IDc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Customer ID] == {}'.format(df['Customer ID'].isnull().sum()))
# print('\n---\n')
# print('null value in Customer IDc* == {}'.format(df['Customer IDc*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Customer ID] == {df['Customer ID'].dtype}")
# print('\n---\n')
# print(f"The dtype of Customer IDc* is {df['Customer IDc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Customer IDc*'])
# plt.title('Distribution Plot for Customer IDc*')  ## Title for the plot
# plt.xlabel('Customer IDc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Customer IDc*'],orient='h')
# plt.title('Box Plot for Customer IDc*')  ## Title for the plot
# plt.xlabel('Customer IDc*')  ## X-axis label
# plt.ylabel('Customer IDc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Age</font>**

In [None]:
print(f"{df['Age'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Age'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Age'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Age] == {}'.format(df['Age'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Age] == {df['Age'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Age'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Age'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Age'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Age'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Age'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Age'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Age = dict(zip(key,values)) # label data in 'key': value pair
# label['Age'] = Age # store data label dict in main dict actually it's updating/

In [None]:
# df['Age'].replace(Age, inplace=True)
# print('Modified DataFrame:')
# print(df['Age'].head(10))

In [None]:
df['Age'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Age'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Age'], inplace=True)

In [None]:
# mean = df['Age'].mean()
# print(mean)

# median = df['Age'].median()
# print(median)

# mode = df['Age'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Age'].fillna(df['Age'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Age'].fillna(df['Age'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Age'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Age'].fillna(df['Age'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Age')

In [None]:
# print('null value in Agec* == {}'.format(df['Agec*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Agec*'].fillna(df['Agec*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Agec*'].fillna(df['Agec*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Agec*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Agec*'].fillna(df['Agec*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Age] == {}'.format(df['Age'].isnull().sum()))
# print('\n---\n')
# print('null value in Agec* == {}'.format(df['Agec*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Age] == {df['Age'].dtype}")
# print('\n---\n')
# print(f"The dtype of Agec* is {df['Agec*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Agec*'])
# plt.title('Distribution Plot for Agec*')  ## Title for the plot
# plt.xlabel('Agec* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Agec*'],orient='h')
# plt.title('Box Plot for Agec*')  ## Title for the plot
# plt.xlabel('Agec*')  ## X-axis label
# plt.ylabel('Agec* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Gender</font>**

In [None]:
print(f"{df['Gender'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Gender'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Gender'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Gender] == {}'.format(df['Gender'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Gender] == {df['Gender'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Gender'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Gender'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Gender'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Gender'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Gender'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Gender'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Gender = dict(zip(key,values)) # label data in 'key': value pair
# label['Gender'] = Gender # store data label dict in main dict actually it's updating/

In [None]:
# df['Gender'].replace(Gender, inplace=True)
# print('Modified DataFrame:')
# print(df['Gender'].head(10))

In [None]:
df['Gender'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Gender'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Gender'], inplace=True)

In [None]:
# mean = df['Gender'].mean()
# print(mean)

# median = df['Gender'].median()
# print(median)

# mode = df['Gender'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Gender'].fillna(df['Gender'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Gender'].fillna(df['Gender'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Gender'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Gender')

In [None]:
# print('null value in Genderc* == {}'.format(df['Genderc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Genderc*'].fillna(df['Genderc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Genderc*'].fillna(df['Genderc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Genderc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Genderc*'].fillna(df['Genderc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Gender] == {}'.format(df['Gender'].isnull().sum()))
# print('\n---\n')
# print('null value in Genderc* == {}'.format(df['Genderc*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Gender] == {df['Gender'].dtype}")
# print('\n---\n')
# print(f"The dtype of Genderc* is {df['Genderc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Genderc*'])
# plt.title('Distribution Plot for Genderc*')  ## Title for the plot
# plt.xlabel('Genderc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Genderc*'],orient='h')
# plt.title('Box Plot for Genderc*')  ## Title for the plot
# plt.xlabel('Genderc*')  ## X-axis label
# plt.ylabel('Genderc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Item Purchased</font>**

In [None]:
print(f"{df['Item Purchased'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Item Purchased'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Item Purchased'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Item Purchased] == {}'.format(df['Item Purchased'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Item Purchased] == {df['Item Purchased'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Item Purchased'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Item Purchased'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Item Purchased'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Item Purchased'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Item Purchased'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Item Purchased'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Item Purchased = dict(zip(key,values)) # label data in 'key': value pair
# label['Item Purchased'] = Item Purchased # store data label dict in main dict actually it's updating/

In [None]:
# df['Item Purchased'].replace(Item Purchased, inplace=True)
# print('Modified DataFrame:')
# print(df['Item Purchased'].head(10))

In [None]:
df['Item Purchased'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Item Purchased'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Item Purchased'], inplace=True)

In [None]:
# mean = df['Item Purchased'].mean()
# print(mean)

# median = df['Item Purchased'].median()
# print(median)

# mode = df['Item Purchased'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Item Purchased'].fillna(df['Item Purchased'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Item Purchased'].fillna(df['Item Purchased'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Item Purchased'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Item Purchased'].fillna(df['Item Purchased'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Item Purchased')

In [None]:
# print('null value in Item Purchasedc* == {}'.format(df['Item Purchasedc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Item Purchasedc*'].fillna(df['Item Purchasedc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Item Purchasedc*'].fillna(df['Item Purchasedc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Item Purchasedc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Item Purchasedc*'].fillna(df['Item Purchasedc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Item Purchased] == {}'.format(df['Item Purchased'].isnull().sum()))
# print('\n---\n')
# print('null value in Item Purchasedc* == {}'.format(df['Item Purchasedc*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Item Purchased] == {df['Item Purchased'].dtype}")
# print('\n---\n')
# print(f"The dtype of Item Purchasedc* is {df['Item Purchasedc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Item Purchasedc*'])
# plt.title('Distribution Plot for Item Purchasedc*')  ## Title for the plot
# plt.xlabel('Item Purchasedc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Item Purchasedc*'],orient='h')
# plt.title('Box Plot for Item Purchasedc*')  ## Title for the plot
# plt.xlabel('Item Purchasedc*')  ## X-axis label
# plt.ylabel('Item Purchasedc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Category</font>**

In [None]:
print(f"{df['Category'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Category'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Category'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Category] == {}'.format(df['Category'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Category] == {df['Category'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Category'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Category'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Category'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Category'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Category'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Category'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Category = dict(zip(key,values)) # label data in 'key': value pair
# label['Category'] = Category # store data label dict in main dict actually it's updating/

In [None]:
# df['Category'].replace(Category, inplace=True)
# print('Modified DataFrame:')
# print(df['Category'].head(10))

In [None]:
df['Category'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Category'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Category'], inplace=True)

In [None]:
# mean = df['Category'].mean()
# print(mean)

# median = df['Category'].median()
# print(median)

# mode = df['Category'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Category'].fillna(df['Category'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Category'].fillna(df['Category'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Category'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Category'].fillna(df['Category'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Category')

In [None]:
# print('null value in Categoryc* == {}'.format(df['Categoryc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Categoryc*'].fillna(df['Categoryc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Categoryc*'].fillna(df['Categoryc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Categoryc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Categoryc*'].fillna(df['Categoryc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Category] == {}'.format(df['Category'].isnull().sum()))
# print('\n---\n')
# print('null value in Categoryc* == {}'.format(df['Categoryc*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Category] == {df['Category'].dtype}")
# print('\n---\n')
# print(f"The dtype of Categoryc* is {df['Categoryc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Categoryc*'])
# plt.title('Distribution Plot for Categoryc*')  ## Title for the plot
# plt.xlabel('Categoryc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Categoryc*'],orient='h')
# plt.title('Box Plot for Categoryc*')  ## Title for the plot
# plt.xlabel('Categoryc*')  ## X-axis label
# plt.ylabel('Categoryc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Purchase Amount (USD)</font>**

In [None]:
print(f"{df['Purchase Amount (USD)'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Purchase Amount (USD)'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Purchase Amount (USD)'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Purchase Amount (USD)] == {}'.format(df['Purchase Amount (USD)'].isnull().sum()))

In [None]:
# Outer double quotes keep this f-string valid before Python 3.12
# (reusing the outer quote type inside {} needs PEP 701).
print(f"unique values in [Purchase Amount (USD)] == {df['Purchase Amount (USD)'].nunique()}")

In [None]:
# Double-quoted f-string (valid before Python 3.12); head()/tail() select the
# first/last five counts explicitly by position.
print(f"top 5 unique values:\nvalue - count : {df['Purchase Amount (USD)'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Purchase Amount (USD)'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Purchase Amount (USD)'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Purchase Amount (USD)'], errors='coerce')

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Purchase Amount (USD)'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if dtype is numeric, skip these 4 cells


In [None]:
# key = df['Purchase Amount (USD)'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Purchase Amount (USD) = dict(zip(key,values)) # label data in 'key': value pair
# label['Purchase Amount (USD)'] = Purchase Amount (USD) # store data label dict in main dict actually it's updating/

In [None]:
# df['Purchase Amount (USD)'].replace(Purchase Amount (USD), inplace=True)
# print('Modified DataFrame:')
# print(df['Purchase Amount (USD)'].head(10))

In [None]:
df['Purchase Amount (USD)'].unique()

In [None]:
# Pick plots that suit the column's dtype. sns.distplot is deprecated, and
# neither it nor a box plot is meaningful for non-numeric data, so numeric
# columns get a density histogram (histplot + KDE) and a box plot, while
# non-numeric columns get a frequency count plot instead.
column = 'Purchase Amount (USD)'

if pd.api.types.is_numeric_dtype(df[column]):
    # First plot - Distribution plot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set(title=f'Distribution Plot for {column}', xlabel=f'{column} Values', ylabel='Density')

    # Second plot - Boxplot (horizontal, so the values run along the x-axis)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=df[column], ax=ax)
    ax.set(title=f'Box Plot for {column}', xlabel=f'{column} values')
else:
    # Categorical column - one bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=df[column], order=df[column].value_counts().index, ax=ax)
    ax.set(title=f'Count Plot for {column}', xlabel='Count', ylabel=column)

plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Purchase Amount (USD)'], inplace=True)

In [None]:
# mean = df['Purchase Amount (USD)'].mean()
# print(mean)

# median = df['Purchase Amount (USD)'].median()
# print(median)

# mode = df['Purchase Amount (USD)'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Purchase Amount (USD)'].fillna(df['Purchase Amount (USD)'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Purchase Amount (USD)'].fillna(df['Purchase Amount (USD)'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Purchase Amount (USD)'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Purchase Amount (USD)'].fillna(df['Purchase Amount (USD)'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Purchase Amount (USD)')

In [None]:
# print('null value in Purchase Amount (USD)c* == {}'.format(df['Purchase Amount (USD)c*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Purchase Amount (USD)c*'].fillna(df['Purchase Amount (USD)c*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Purchase Amount (USD)c*'].fillna(df['Purchase Amount (USD)c*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Purchase Amount (USD)c*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Purchase Amount (USD)c*'].fillna(df['Purchase Amount (USD)c*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Purchase Amount (USD)] == {}'.format(df['Purchase Amount (USD)'].isnull().sum()))
# print('\n---\n')
# print('null value in Purchase Amount (USD)c* == {}'.format(df['Purchase Amount (USD)c*'].isnull().sum()))

In [None]:
# Double-quoted f-strings so the nested ['...'] lookups parse before Python 3.12.
print(f"The dtype of [Purchase Amount (USD)] == {df['Purchase Amount (USD)'].dtype}")
# print('\n---\n')
# print(f"The dtype of Purchase Amount (USD)c* is {df['Purchase Amount (USD)c*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Purchase Amount (USD)c*'])
# plt.title('Distribution Plot for Purchase Amount (USD)c*')  ## Title for the plot
# plt.xlabel('Purchase Amount (USD)c* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Purchase Amount (USD)c*'],orient='h')
# plt.title('Box Plot for Purchase Amount (USD)c*')  ## Title for the plot
# plt.xlabel('Purchase Amount (USD)c*')  ## X-axis label
# plt.ylabel('Purchase Amount (USD)c* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Location</font>**

In [None]:
print(f"{df['Location'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Location'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Location'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Location] == {}'.format(df['Location'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Location] == {df['Location'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
location_counts = df['Location'].value_counts()
print(f"top 5 unique values:\nvalue - count : {location_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {location_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Location'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Location'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Location'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Location'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Location = dict(zip(key,values)) # label data in 'key': value pair
# label['Location'] = Location # store data label dict in main dict actually it's updating/

In [None]:
# df['Location'].replace(Location, inplace=True)
# print('Modified DataFrame:')
# print(df['Location'].head(10))

In [None]:
df['Location'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
location_col = df['Location']
if pd.api.types.is_numeric_dtype(location_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(location_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Location')  # Title for the plot
    ax.set_xlabel('Location Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=location_col, ax=ax)
    ax.set_title('Box Plot for Location')  # Title for the plot
    ax.set_xlabel('Location')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=location_col, order=location_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Location')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Location')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Location'], inplace=True)

In [None]:
# mean = df['Location'].mean()
# print(mean)

# median = df['Location'].median()
# print(median)

# mode = df['Location'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Location'].fillna(df['Location'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Location'].fillna(df['Location'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Location'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Location'].fillna(df['Location'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Location')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Locationc*'
# print('null value in Locationc* == {}'.format(df['Locationc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Locationc*'].fillna(df['Locationc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Locationc*'].fillna(df['Locationc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Locationc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Locationc*'].fillna(df['Locationc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Location] == {}'.format(df['Location'].isnull().sum()))
# print('\n---\n')
# print('null value in Locationc* == {}'.format(df['Locationc*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Location] == {df['Location'].dtype}")
# print('\n---\n')
# print(f"The dtype of Locationc* is {df['Locationc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Locationc*'])
# plt.title('Distribution Plot for Locationc*')  ## Title for the plot
# plt.xlabel('Locationc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Locationc*'],orient='h')
# plt.title('Box Plot for Locationc*')  ## Title for the plot
# plt.xlabel('Locationc*')  ## X-axis label
# plt.ylabel('Locationc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Size</font>**

In [None]:
print(f"{df['Size'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Size'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Size'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Size] == {}'.format(df['Size'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Size] == {df['Size'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
size_counts = df['Size'].value_counts()
print(f"top 5 unique values:\nvalue - count : {size_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {size_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Size'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Size'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Size'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Size'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Size = dict(zip(key,values)) # label data in 'key': value pair
# label['Size'] = Size # store data label dict in main dict actually it's updating/

In [None]:
# df['Size'].replace(Size, inplace=True)
# print('Modified DataFrame:')
# print(df['Size'].head(10))

In [None]:
df['Size'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
size_col = df['Size']
if pd.api.types.is_numeric_dtype(size_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(size_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Size')  # Title for the plot
    ax.set_xlabel('Size Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=size_col, ax=ax)
    ax.set_title('Box Plot for Size')  # Title for the plot
    ax.set_xlabel('Size')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=size_col, order=size_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Size')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Size')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Size'], inplace=True)

In [None]:
# mean = df['Size'].mean()
# print(mean)

# median = df['Size'].median()
# print(median)

# mode = df['Size'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Size'].fillna(df['Size'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Size'].fillna(df['Size'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Size'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Size'].fillna(df['Size'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Size')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Sizec*'
# print('null value in Sizec* == {}'.format(df['Sizec*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Sizec*'].fillna(df['Sizec*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Sizec*'].fillna(df['Sizec*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Sizec*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Sizec*'].fillna(df['Sizec*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Size] == {}'.format(df['Size'].isnull().sum()))
# print('\n---\n')
# print('null value in Sizec* == {}'.format(df['Sizec*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Size] == {df['Size'].dtype}")
# print('\n---\n')
# print(f"The dtype of Sizec* is {df['Sizec*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Sizec*'])
# plt.title('Distribution Plot for Sizec*')  ## Title for the plot
# plt.xlabel('Sizec* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Sizec*'],orient='h')
# plt.title('Box Plot for Sizec*')  ## Title for the plot
# plt.xlabel('Sizec*')  ## X-axis label
# plt.ylabel('Sizec* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Color</font>**

In [None]:
print(f"{df['Color'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Color'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Color'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Color] == {}'.format(df['Color'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Color] == {df['Color'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
color_counts = df['Color'].value_counts()
print(f"top 5 unique values:\nvalue - count : {color_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {color_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Color'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Color'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Color'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Color'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Color = dict(zip(key,values)) # label data in 'key': value pair
# label['Color'] = Color # store data label dict in main dict actually it's updating/

In [None]:
# df['Color'].replace(Color, inplace=True)
# print('Modified DataFrame:')
# print(df['Color'].head(10))

In [None]:
df['Color'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
color_col = df['Color']
if pd.api.types.is_numeric_dtype(color_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(color_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Color')  # Title for the plot
    ax.set_xlabel('Color Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=color_col, ax=ax)
    ax.set_title('Box Plot for Color')  # Title for the plot
    ax.set_xlabel('Color')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=color_col, order=color_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Color')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Color')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Color'], inplace=True)

In [None]:
# mean = df['Color'].mean()
# print(mean)

# median = df['Color'].median()
# print(median)

# mode = df['Color'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Color'].fillna(df['Color'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Color'].fillna(df['Color'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Color'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Color'].fillna(df['Color'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Color')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Colorc*'
# print('null value in Colorc* == {}'.format(df['Colorc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Colorc*'].fillna(df['Colorc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Colorc*'].fillna(df['Colorc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Colorc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Colorc*'].fillna(df['Colorc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Color] == {}'.format(df['Color'].isnull().sum()))
# print('\n---\n')
# print('null value in Colorc* == {}'.format(df['Colorc*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Color] == {df['Color'].dtype}")
# print('\n---\n')
# print(f"The dtype of Colorc* is {df['Colorc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Colorc*'])
# plt.title('Distribution Plot for Colorc*')  ## Title for the plot
# plt.xlabel('Colorc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Colorc*'],orient='h')
# plt.title('Box Plot for Colorc*')  ## Title for the plot
# plt.xlabel('Colorc*')  ## X-axis label
# plt.ylabel('Colorc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Season</font>**

In [None]:
print(f"{df['Season'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Season'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Season'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Season] == {}'.format(df['Season'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Season] == {df['Season'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
season_counts = df['Season'].value_counts()
print(f"top 5 unique values:\nvalue - count : {season_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {season_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Season'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Season'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Season'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Season'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
# Season = dict(zip(key,values)) # label data in 'key': value pair
# label['Season'] = Season # store data label dict in main dict actually it's updating/

In [None]:
# df['Season'].replace(Season, inplace=True)
# print('Modified DataFrame:')
# print(df['Season'].head(10))

In [None]:
df['Season'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
season_col = df['Season']
if pd.api.types.is_numeric_dtype(season_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(season_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Season')  # Title for the plot
    ax.set_xlabel('Season Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=season_col, ax=ax)
    ax.set_title('Box Plot for Season')  # Title for the plot
    ax.set_xlabel('Season')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=season_col, order=season_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Season')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Season')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Season'], inplace=True)

In [None]:
# mean = df['Season'].mean()
# print(mean)

# median = df['Season'].median()
# print(median)

# mode = df['Season'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Season'].fillna(df['Season'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Season'].fillna(df['Season'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Season'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Season'].fillna(df['Season'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Season')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Seasonc*'
# print('null value in Seasonc* == {}'.format(df['Seasonc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Seasonc*'].fillna(df['Seasonc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Seasonc*'].fillna(df['Seasonc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Seasonc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Seasonc*'].fillna(df['Seasonc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Season] == {}'.format(df['Season'].isnull().sum()))
# print('\n---\n')
# print('null value in Seasonc* == {}'.format(df['Seasonc*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Season] == {df['Season'].dtype}")
# print('\n---\n')
# print(f"The dtype of Seasonc* is {df['Seasonc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Seasonc*'])
# plt.title('Distribution Plot for Seasonc*')  ## Title for the plot
# plt.xlabel('Seasonc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Seasonc*'],orient='h')
# plt.title('Box Plot for Seasonc*')  ## Title for the plot
# plt.xlabel('Seasonc*')  ## X-axis label
# plt.ylabel('Seasonc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Review Rating</font>**

In [None]:
print(f"{df['Review Rating'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Review Rating'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Review Rating'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Review Rating] == {}'.format(df['Review Rating'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Review Rating] == {df['Review Rating'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional, whereas an integer slice such as [:5] can
# be read as a label slice when the index holds numbers (assumes ratings are
# numeric - TODO confirm). Outer double quotes keep the f-string valid on
# Python versions before 3.12.
review_rating_counts = df['Review Rating'].value_counts()
print(f"top 5 unique values:\nvalue - count : {review_rating_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {review_rating_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Review Rating'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Review Rating'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Review Rating'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Review Rating'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## a Python variable name cannot contain a space, so the mapping is kept in review_rating_map
# review_rating_map = dict(zip(key,values)) # label data in 'key': value pair
# label['Review Rating'] = review_rating_map # store data label dict in main dict actually it's updating/

In [None]:
## the mapping is read back from `label`; a bare `Review Rating` is not a valid Python name
# df['Review Rating'] = df['Review Rating'].replace(label['Review Rating'])
# print('Modified DataFrame:')
# print(df['Review Rating'].head(10))

In [None]:
df['Review Rating'].unique()

In [None]:
# First plot - Distribution plot
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot
fig, ax = plt.subplots(figsize=(10, 7))
sns.histplot(df['Review Rating'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution Plot for Review Rating')  # Title for the plot
ax.set_xlabel('Review Rating Values')  # X-axis label
ax.set_ylabel('Density')  # Y-axis label

# Second plot - Boxplot
# x= places the values on the horizontal axis explicitly
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(x=df['Review Rating'], ax=ax)
ax.set_title('Box Plot for Review Rating')  # Title for the plot
ax.set_xlabel('Review Rating')  # X-axis label

# Render both figures without echoing the repr of the last call
plt.show()

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Review Rating'], inplace=True)

In [None]:
# mean = df['Review Rating'].mean()
# print(mean)

# median = df['Review Rating'].median()
# print(median)

# mode = df['Review Rating'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Review Rating'].fillna(df['Review Rating'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Review Rating'].fillna(df['Review Rating'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Review Rating'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Review Rating'].fillna(df['Review Rating'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Review Rating')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Review Ratingc*'
# print('null value in Review Ratingc* == {}'.format(df['Review Ratingc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Review Ratingc*'].fillna(df['Review Ratingc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Review Ratingc*'].fillna(df['Review Ratingc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Review Ratingc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Review Ratingc*'].fillna(df['Review Ratingc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Review Rating] == {}'.format(df['Review Rating'].isnull().sum()))
# print('\n---\n')
# print('null value in Review Ratingc* == {}'.format(df['Review Ratingc*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Review Rating] == {df['Review Rating'].dtype}")
# print('\n---\n')
# print(f"The dtype of Review Ratingc* is {df['Review Ratingc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Review Ratingc*'])
# plt.title('Distribution Plot for Review Ratingc*')  ## Title for the plot
# plt.xlabel('Review Ratingc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Review Ratingc*'],orient='h')
# plt.title('Box Plot for Review Ratingc*')  ## Title for the plot
# plt.xlabel('Review Ratingc*')  ## X-axis label
# plt.ylabel('Review Ratingc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Subscription Status</font>**

In [None]:
print(f"{df['Subscription Status'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Subscription Status'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Subscription Status'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Subscription Status] == {}'.format(df['Subscription Status'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Subscription Status] == {df['Subscription Status'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
subscription_counts = df['Subscription Status'].value_counts()
print(f"top 5 unique values:\nvalue - count : {subscription_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {subscription_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Subscription Status'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Subscription Status'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Subscription Status'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Subscription Status'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## a Python variable name cannot contain a space, so the mapping is kept in subscription_status_map
# subscription_status_map = dict(zip(key,values)) # label data in 'key': value pair
# label['Subscription Status'] = subscription_status_map # store data label dict in main dict actually it's updating/

In [None]:
## the mapping is read back from `label`; a bare `Subscription Status` is not a valid Python name
# df['Subscription Status'] = df['Subscription Status'].replace(label['Subscription Status'])
# print('Modified DataFrame:')
# print(df['Subscription Status'].head(10))

In [None]:
df['Subscription Status'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
subscription_col = df['Subscription Status']
if pd.api.types.is_numeric_dtype(subscription_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(subscription_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Subscription Status')  # Title for the plot
    ax.set_xlabel('Subscription Status Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=subscription_col, ax=ax)
    ax.set_title('Box Plot for Subscription Status')  # Title for the plot
    ax.set_xlabel('Subscription Status')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=subscription_col, order=subscription_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Subscription Status')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Subscription Status')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Subscription Status'], inplace=True)

In [None]:
# mean = df['Subscription Status'].mean()
# print(mean)

# median = df['Subscription Status'].median()
# print(median)

# mode = df['Subscription Status'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Subscription Status'].fillna(df['Subscription Status'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Subscription Status'].fillna(df['Subscription Status'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Subscription Status'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Subscription Status'].fillna(df['Subscription Status'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Subscription Status')

In [None]:
## remove_outliers() stores the cleaned data in the column 'Subscription Statusc*'
# print('null value in Subscription Statusc* == {}'.format(df['Subscription Statusc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Subscription Statusc*'].fillna(df['Subscription Statusc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Subscription Statusc*'].fillna(df['Subscription Statusc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Subscription Statusc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Subscription Statusc*'].fillna(df['Subscription Statusc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Subscription Status] == {}'.format(df['Subscription Status'].isnull().sum()))
# print('\n---\n')
# print('null value in Subscription Statusc* == {}'.format(df['Subscription Statusc*'].isnull().sum()))

In [None]:
# Report the dtype of the column after the cleaning steps above.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"The dtype of [Subscription Status] == {df['Subscription Status'].dtype}")
# print('\n---\n')
# print(f"The dtype of Subscription Statusc* is {df['Subscription Statusc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Subscription Statusc*'])
# plt.title('Distribution Plot for Subscription Statusc*')  ## Title for the plot
# plt.xlabel('Subscription Statusc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Subscription Statusc*'],orient='h')
# plt.title('Box Plot for Subscription Statusc*')  ## Title for the plot
# plt.xlabel('Subscription Statusc*')  ## X-axis label
# plt.ylabel('Subscription Statusc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Shipping Type</font>**

In [None]:
print(f"{df['Shipping Type'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Shipping Type'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Shipping Type'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Shipping Type] == {}'.format(df['Shipping Type'].isnull().sum()))

In [None]:
# Number of distinct values in the column.
# Outer double quotes keep the f-string valid on Python versions before 3.12.
print(f"unique values in [Shipping Type] == {df['Shipping Type'].nunique()}")

In [None]:
# Count every value once, then show the five most and five least frequent.
# head()/tail() are always positional; outer double quotes keep the f-string
# valid on Python versions before 3.12.
shipping_counts = df['Shipping Type'].value_counts()
print(f"top 5 unique values:\nvalue - count : {shipping_counts.head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {shipping_counts.tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Shipping Type'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Identify non-numeric values using boolean indexing
# non_numeric_value = df['Shipping Type'][numeric_column.isna()]
# print(non_numeric_value)

## Convert the column to numeric, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Shipping Type'], errors='coerce')



In [None]:
# format opreation
# if dtype is numeric skip this 4 cell


In [None]:
# key = df['Shipping Type'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## a Python variable name cannot contain a space, so the mapping is kept in shipping_type_map
# shipping_type_map = dict(zip(key,values)) # label data in 'key': value pair
# label['Shipping Type'] = shipping_type_map # store data label dict in main dict actually it's updating/

In [None]:
## the mapping is read back from `label`; a bare `Shipping Type` is not a valid Python name
# df['Shipping Type'] = df['Shipping Type'].replace(label['Shipping Type'])
# print('Modified DataFrame:')
# print(df['Shipping Type'].head(10))

In [None]:
df['Shipping Type'].unique()

In [None]:
# Choose the plots from the column dtype: sns.distplot is deprecated and, like a
# box plot, only works on numeric data, so a text column gets a count plot instead.
shipping_col = df['Shipping Type']
if pd.api.types.is_numeric_dtype(shipping_col):
    # First plot - Distribution plot (histogram + KDE, the distplot replacement)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.histplot(shipping_col, kde=True, stat='density', ax=ax)
    ax.set_title('Distribution Plot for Shipping Type')  # Title for the plot
    ax.set_xlabel('Shipping Type Values')  # X-axis label
    ax.set_ylabel('Density')  # Y-axis label

    # Second plot - Boxplot
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.boxplot(x=shipping_col, ax=ax)
    ax.set_title('Box Plot for Shipping Type')  # Title for the plot
    ax.set_xlabel('Shipping Type')  # X-axis label
else:
    # Categorical column: one horizontal bar per value, most frequent first
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.countplot(y=shipping_col, order=shipping_col.value_counts().index, ax=ax)
    ax.set_title('Count Plot for Shipping Type')  # Title for the plot
    ax.set_xlabel('Count')  # X-axis label
    ax.set_ylabel('Shipping Type')  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Shipping Type'], inplace=True)

In [None]:
# mean = df['Shipping Type'].mean()
# print(mean)

# median = df['Shipping Type'].median()
# print(median)

# mode = df['Shipping Type'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Shipping Type'].fillna(df['Shipping Type'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Shipping Type'].fillna(df['Shipping Type'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Shipping Type'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Shipping Type'].fillna(df['Shipping Type'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Shipping Type')

In [None]:
## 'Shipping Typec*' is the cleaned column that remove_outliers adds to df
# print('null value in Shipping Typec* == {}'.format(df['Shipping Typec*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Shipping Typec*'].fillna(df['Shipping Typec*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Shipping Typec*'].fillna(df['Shipping Typec*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Shipping Typec*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Shipping Typec*'].fillna(df['Shipping Typec*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Shipping Type] == {}'.format(df['Shipping Type'].isnull().sum()))
# print('\n---\n')
# print('null value in Shipping Typec* == {}'.format(df['Shipping Typec*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Shipping Type] == {df['Shipping Type'].dtype}")
# print('\n---\n')
# print(f"The dtype of Shipping Typec* is {df['Shipping Typec*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Shipping Typec*'])
# plt.title('Distribution Plot for Shipping Typec*')  ## Title for the plot
# plt.xlabel('Shipping Typec* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Shipping Typec*'],orient='h')
# plt.title('Box Plot for Shipping Typec*')  ## Title for the plot
# plt.xlabel('Shipping Typec*')  ## X-axis label
# plt.ylabel('Shipping Typec* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Discount Applied</font>**

In [None]:
print(f"{df['Discount Applied'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Discount Applied'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Discount Applied'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Discount Applied] == {}'.format(df['Discount Applied'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"unique values in [Discount Applied] == {df['Discount Applied'].nunique()}")

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12;
# head/tail select the most and least frequent values by position.
print(f"top 5 unique values:\nvalue - count : {df['Discount Applied'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Discount Applied'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Discount Applied'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric first, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Discount Applied'], errors='coerce')

## Then identify the non-numeric values using boolean indexing
## (numeric_column must already exist, so this step comes second)
# non_numeric_value = df['Discount Applied'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if the dtype is already numeric, skip these 4 cells


In [None]:
# key = df['Discount Applied'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## `Discount Applied` is not a valid Python identifier, so the mapping gets a snake_case name
# discount_applied_labels = dict(zip(key, values))  # label data as 'key': value pairs
# label['Discount Applied'] = discount_applied_labels  # store (or update) this column's mapping in the main label dict

In [None]:
## Apply the mapping stored in label['Discount Applied'] and assign the result back
## (avoids replace(..., inplace=True) on a column selection)
# df['Discount Applied'] = df['Discount Applied'].replace(label['Discount Applied'])
# print('Modified DataFrame:')
# print(df['Discount Applied'].head(10))

In [None]:
df['Discount Applied'].unique()

In [None]:
# Pick the chart from the column's dtype: the label-encoding cells above are
# optional, so the column may still hold strings, which a density plot or a
# box plot cannot represent.
plot_col = 'Discount Applied'
is_numeric = (pd.api.types.is_numeric_dtype(df[plot_col])
              and not pd.api.types.is_bool_dtype(df[plot_col]))

plt.figure(figsize=(10, 7))
if is_numeric:
    # First plot - Distribution plot (histplot replaces the deprecated distplot)
    sns.histplot(df[plot_col], kde=True, stat='density')
    plt.title(f'Distribution Plot for {plot_col}')  # Title for the plot
    plt.xlabel(f'{plot_col} Values')  # X-axis label
    plt.ylabel('Density')  # Y-axis label

    # Second plot - Boxplot (passing x= draws it horizontally)
    plt.figure(figsize=(10, 7))
    sns.boxplot(x=df[plot_col])
    plt.title(f'Box Plot for {plot_col}')  # Title for the plot
    plt.xlabel(plot_col)  # X-axis label
else:
    # Categorical column: show how often each category occurs, most common first
    sns.countplot(y=df[plot_col], order=df[plot_col].value_counts().index)
    plt.title(f'Count Plot for {plot_col}')  # Title for the plot
    plt.xlabel('Count')  # X-axis label
    plt.ylabel(plot_col)  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Discount Applied'], inplace=True)

In [None]:
# mean = df['Discount Applied'].mean()
# print(mean)

# median = df['Discount Applied'].median()
# print(median)

# mode = df['Discount Applied'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Discount Applied'].fillna(df['Discount Applied'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Discount Applied'].fillna(df['Discount Applied'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Discount Applied'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Discount Applied'].fillna(df['Discount Applied'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Discount Applied')

In [None]:
## 'Discount Appliedc*' is the cleaned column that remove_outliers adds to df
# print('null value in Discount Appliedc* == {}'.format(df['Discount Appliedc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Discount Appliedc*'].fillna(df['Discount Appliedc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Discount Appliedc*'].fillna(df['Discount Appliedc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Discount Appliedc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Discount Appliedc*'].fillna(df['Discount Appliedc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Discount Applied] == {}'.format(df['Discount Applied'].isnull().sum()))
# print('\n---\n')
# print('null value in Discount Appliedc* == {}'.format(df['Discount Appliedc*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Discount Applied] == {df['Discount Applied'].dtype}")
# print('\n---\n')
# print(f"The dtype of Discount Appliedc* is {df['Discount Appliedc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Discount Appliedc*'])
# plt.title('Distribution Plot for Discount Appliedc*')  ## Title for the plot
# plt.xlabel('Discount Appliedc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Discount Appliedc*'],orient='h')
# plt.title('Box Plot for Discount Appliedc*')  ## Title for the plot
# plt.xlabel('Discount Appliedc*')  ## X-axis label
# plt.ylabel('Discount Appliedc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Promo Code Used</font>**

In [None]:
print(f"{df['Promo Code Used'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Promo Code Used'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Promo Code Used'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Promo Code Used] == {}'.format(df['Promo Code Used'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"unique values in [Promo Code Used] == {df['Promo Code Used'].nunique()}")

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12;
# head/tail select the most and least frequent values by position.
print(f"top 5 unique values:\nvalue - count : {df['Promo Code Used'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Promo Code Used'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Promo Code Used'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric first, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Promo Code Used'], errors='coerce')

## Then identify the non-numeric values using boolean indexing
## (numeric_column must already exist, so this step comes second)
# non_numeric_value = df['Promo Code Used'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if the dtype is already numeric, skip these 4 cells


In [None]:
# key = df['Promo Code Used'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## `Promo Code Used` is not a valid Python identifier, so the mapping gets a snake_case name
# promo_code_used_labels = dict(zip(key, values))  # label data as 'key': value pairs
# label['Promo Code Used'] = promo_code_used_labels  # store (or update) this column's mapping in the main label dict

In [None]:
## Apply the mapping stored in label['Promo Code Used'] and assign the result back
## (avoids replace(..., inplace=True) on a column selection)
# df['Promo Code Used'] = df['Promo Code Used'].replace(label['Promo Code Used'])
# print('Modified DataFrame:')
# print(df['Promo Code Used'].head(10))

In [None]:
df['Promo Code Used'].unique()

In [None]:
# Pick the chart from the column's dtype: the label-encoding cells above are
# optional, so the column may still hold strings, which a density plot or a
# box plot cannot represent.
plot_col = 'Promo Code Used'
is_numeric = (pd.api.types.is_numeric_dtype(df[plot_col])
              and not pd.api.types.is_bool_dtype(df[plot_col]))

plt.figure(figsize=(10, 7))
if is_numeric:
    # First plot - Distribution plot (histplot replaces the deprecated distplot)
    sns.histplot(df[plot_col], kde=True, stat='density')
    plt.title(f'Distribution Plot for {plot_col}')  # Title for the plot
    plt.xlabel(f'{plot_col} Values')  # X-axis label
    plt.ylabel('Density')  # Y-axis label

    # Second plot - Boxplot (passing x= draws it horizontally)
    plt.figure(figsize=(10, 7))
    sns.boxplot(x=df[plot_col])
    plt.title(f'Box Plot for {plot_col}')  # Title for the plot
    plt.xlabel(plot_col)  # X-axis label
else:
    # Categorical column: show how often each category occurs, most common first
    sns.countplot(y=df[plot_col], order=df[plot_col].value_counts().index)
    plt.title(f'Count Plot for {plot_col}')  # Title for the plot
    plt.xlabel('Count')  # X-axis label
    plt.ylabel(plot_col)  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Promo Code Used'], inplace=True)

In [None]:
# mean = df['Promo Code Used'].mean()
# print(mean)

# median = df['Promo Code Used'].median()
# print(median)

# mode = df['Promo Code Used'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Promo Code Used'].fillna(df['Promo Code Used'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Promo Code Used'].fillna(df['Promo Code Used'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Promo Code Used'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Promo Code Used'].fillna(df['Promo Code Used'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Promo Code Used')

In [None]:
## 'Promo Code Usedc*' is the cleaned column that remove_outliers adds to df
# print('null value in Promo Code Usedc* == {}'.format(df['Promo Code Usedc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Promo Code Usedc*'].fillna(df['Promo Code Usedc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Promo Code Usedc*'].fillna(df['Promo Code Usedc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Promo Code Usedc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Promo Code Usedc*'].fillna(df['Promo Code Usedc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Promo Code Used] == {}'.format(df['Promo Code Used'].isnull().sum()))
# print('\n---\n')
# print('null value in Promo Code Usedc* == {}'.format(df['Promo Code Usedc*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Promo Code Used] == {df['Promo Code Used'].dtype}")
# print('\n---\n')
# print(f"The dtype of Promo Code Usedc* is {df['Promo Code Usedc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Promo Code Usedc*'])
# plt.title('Distribution Plot for Promo Code Usedc*')  ## Title for the plot
# plt.xlabel('Promo Code Usedc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Promo Code Usedc*'],orient='h')
# plt.title('Box Plot for Promo Code Usedc*')  ## Title for the plot
# plt.xlabel('Promo Code Usedc*')  ## X-axis label
# plt.ylabel('Promo Code Usedc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Previous Purchases</font>**

In [None]:
print(f"{df['Previous Purchases'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Previous Purchases'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Previous Purchases'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Previous Purchases] == {}'.format(df['Previous Purchases'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"unique values in [Previous Purchases] == {df['Previous Purchases'].nunique()}")

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12;
# head/tail select the most and least frequent values by position.
print(f"top 5 unique values:\nvalue - count : {df['Previous Purchases'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Previous Purchases'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Previous Purchases'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric first, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Previous Purchases'], errors='coerce')

## Then identify the non-numeric values using boolean indexing
## (numeric_column must already exist, so this step comes second)
# non_numeric_value = df['Previous Purchases'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if the dtype is already numeric, skip these 4 cells


In [None]:
# key = df['Previous Purchases'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## `Previous Purchases` is not a valid Python identifier, so the mapping gets a snake_case name
# previous_purchases_labels = dict(zip(key, values))  # label data as 'key': value pairs
# label['Previous Purchases'] = previous_purchases_labels  # store (or update) this column's mapping in the main label dict

In [None]:
## Apply the mapping stored in label['Previous Purchases'] and assign the result back
## (avoids replace(..., inplace=True) on a column selection)
# df['Previous Purchases'] = df['Previous Purchases'].replace(label['Previous Purchases'])
# print('Modified DataFrame:')
# print(df['Previous Purchases'].head(10))

In [None]:
df['Previous Purchases'].unique()

In [None]:
# Pick the chart from the column's dtype so the cell keeps working whether or
# not the column is numeric: strings cannot be drawn as a density plot or a
# box plot.
plot_col = 'Previous Purchases'
is_numeric = (pd.api.types.is_numeric_dtype(df[plot_col])
              and not pd.api.types.is_bool_dtype(df[plot_col]))

plt.figure(figsize=(10, 7))
if is_numeric:
    # First plot - Distribution plot (histplot replaces the deprecated distplot)
    sns.histplot(df[plot_col], kde=True, stat='density')
    plt.title(f'Distribution Plot for {plot_col}')  # Title for the plot
    plt.xlabel(f'{plot_col} Values')  # X-axis label
    plt.ylabel('Density')  # Y-axis label

    # Second plot - Boxplot (passing x= draws it horizontally)
    plt.figure(figsize=(10, 7))
    sns.boxplot(x=df[plot_col])
    plt.title(f'Box Plot for {plot_col}')  # Title for the plot
    plt.xlabel(plot_col)  # X-axis label
else:
    # Categorical column: show how often each category occurs, most common first
    sns.countplot(y=df[plot_col], order=df[plot_col].value_counts().index)
    plt.title(f'Count Plot for {plot_col}')  # Title for the plot
    plt.xlabel('Count')  # X-axis label
    plt.ylabel(plot_col)  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Previous Purchases'], inplace=True)

In [None]:
# mean = df['Previous Purchases'].mean()
# print(mean)

# median = df['Previous Purchases'].median()
# print(median)

# mode = df['Previous Purchases'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Previous Purchases'].fillna(df['Previous Purchases'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Previous Purchases'].fillna(df['Previous Purchases'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Previous Purchases'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Previous Purchases'].fillna(df['Previous Purchases'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Previous Purchases')

In [None]:
## 'Previous Purchasesc*' is the cleaned column that remove_outliers adds to df
# print('null value in Previous Purchasesc* == {}'.format(df['Previous Purchasesc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Previous Purchasesc*'].fillna(df['Previous Purchasesc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Previous Purchasesc*'].fillna(df['Previous Purchasesc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Previous Purchasesc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Previous Purchasesc*'].fillna(df['Previous Purchasesc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Previous Purchases] == {}'.format(df['Previous Purchases'].isnull().sum()))
# print('\n---\n')
# print('null value in Previous Purchasesc* == {}'.format(df['Previous Purchasesc*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Previous Purchases] == {df['Previous Purchases'].dtype}")
# print('\n---\n')
# print(f"The dtype of Previous Purchasesc* is {df['Previous Purchasesc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Previous Purchasesc*'])
# plt.title('Distribution Plot for Previous Purchasesc*')  ## Title for the plot
# plt.xlabel('Previous Purchasesc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Previous Purchasesc*'],orient='h')
# plt.title('Box Plot for Previous Purchasesc*')  ## Title for the plot
# plt.xlabel('Previous Purchasesc*')  ## X-axis label
# plt.ylabel('Previous Purchasesc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Payment Method</font>**

In [None]:
print(f"{df['Payment Method'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Payment Method'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Payment Method'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Payment Method] == {}'.format(df['Payment Method'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"unique values in [Payment Method] == {df['Payment Method'].nunique()}")

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12;
# head/tail select the most and least frequent values by position.
print(f"top 5 unique values:\nvalue - count : {df['Payment Method'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Payment Method'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Payment Method'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric first, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Payment Method'], errors='coerce')

## Then identify the non-numeric values using boolean indexing
## (numeric_column must already exist, so this step comes second)
# non_numeric_value = df['Payment Method'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if the dtype is already numeric, skip these 4 cells


In [None]:
# key = df['Payment Method'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## `Payment Method` is not a valid Python identifier, so the mapping gets a snake_case name
# payment_method_labels = dict(zip(key, values))  # label data as 'key': value pairs
# label['Payment Method'] = payment_method_labels  # store (or update) this column's mapping in the main label dict

In [None]:
## Apply the mapping stored in label['Payment Method'] and assign the result back
## (avoids replace(..., inplace=True) on a column selection)
# df['Payment Method'] = df['Payment Method'].replace(label['Payment Method'])
# print('Modified DataFrame:')
# print(df['Payment Method'].head(10))

In [None]:
df['Payment Method'].unique()

In [None]:
# Pick the chart from the column's dtype: the label-encoding cells above are
# optional, so the column may still hold strings, which a density plot or a
# box plot cannot represent.
plot_col = 'Payment Method'
is_numeric = (pd.api.types.is_numeric_dtype(df[plot_col])
              and not pd.api.types.is_bool_dtype(df[plot_col]))

plt.figure(figsize=(10, 7))
if is_numeric:
    # First plot - Distribution plot (histplot replaces the deprecated distplot)
    sns.histplot(df[plot_col], kde=True, stat='density')
    plt.title(f'Distribution Plot for {plot_col}')  # Title for the plot
    plt.xlabel(f'{plot_col} Values')  # X-axis label
    plt.ylabel('Density')  # Y-axis label

    # Second plot - Boxplot (passing x= draws it horizontally)
    plt.figure(figsize=(10, 7))
    sns.boxplot(x=df[plot_col])
    plt.title(f'Box Plot for {plot_col}')  # Title for the plot
    plt.xlabel(plot_col)  # X-axis label
else:
    # Categorical column: show how often each category occurs, most common first
    sns.countplot(y=df[plot_col], order=df[plot_col].value_counts().index)
    plt.title(f'Count Plot for {plot_col}')  # Title for the plot
    plt.xlabel('Count')  # X-axis label
    plt.ylabel(plot_col)  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Payment Method'], inplace=True)

In [None]:
# mean = df['Payment Method'].mean()
# print(mean)

# median = df['Payment Method'].median()
# print(median)

# mode = df['Payment Method'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Payment Method'].fillna(df['Payment Method'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Payment Method'].fillna(df['Payment Method'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Payment Method'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Payment Method'].fillna(df['Payment Method'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Payment Method')

In [None]:
## 'Payment Methodc*' is the cleaned column that remove_outliers adds to df
# print('null value in Payment Methodc* == {}'.format(df['Payment Methodc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Payment Methodc*'].fillna(df['Payment Methodc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Payment Methodc*'].fillna(df['Payment Methodc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Payment Methodc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Payment Methodc*'].fillna(df['Payment Methodc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Payment Method] == {}'.format(df['Payment Method'].isnull().sum()))
# print('\n---\n')
# print('null value in Payment Methodc* == {}'.format(df['Payment Methodc*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Payment Method] == {df['Payment Method'].dtype}")
# print('\n---\n')
# print(f"The dtype of Payment Methodc* is {df['Payment Methodc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Payment Methodc*'])
# plt.title('Distribution Plot for Payment Methodc*')  ## Title for the plot
# plt.xlabel('Payment Methodc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Payment Methodc*'],orient='h')
# plt.title('Box Plot for Payment Methodc*')  ## Title for the plot
# plt.xlabel('Payment Methodc*')  ## X-axis label
# plt.ylabel('Payment Methodc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)

<hr style="border: 2px solid lime;">

# **<font size=17px color="lightgreen">Frequency of Purchases</font>**

In [None]:
print(f"{df['Frequency of Purchases'].head(5)}\n\n{'*'*21}\n\ntop 5 sorted values\n{df['Frequency of Purchases'].sort_values().head(5)}\n\n{'*'*21}\n\nbottom 5 sorted values\n{df['Frequency of Purchases'].sort_values().tail(5)}")

### <font size=5px color='blue'>data operation</font>

In [None]:
print('null value in [Frequency of Purchases] == {}'.format(df['Frequency of Purchases'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"unique values in [Frequency of Purchases] == {df['Frequency of Purchases'].nunique()}")

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12;
# head/tail select the most and least frequent values by position.
print(f"top 5 unique values:\nvalue - count : {df['Frequency of Purchases'].value_counts().head(5)}\n\n{'*'*21}\n\nbottom 5 unique value:\nvalue - count :  {df['Frequency of Purchases'].value_counts().tail(5)}")

#### <font size= 3.5px color='blue'>dtype</font>

In [None]:
df['Frequency of Purchases'].dtype
# format if dtype is wrong

In [None]:
## see non numeric value in column

## Convert the column to numeric first, coercing non-numeric values to NaN
# numeric_column = pd.to_numeric(df['Frequency of Purchases'], errors='coerce')

## Then identify the non-numeric values using boolean indexing
## (numeric_column must already exist, so this step comes second)
# non_numeric_value = df['Frequency of Purchases'][numeric_column.isna()]
# print(non_numeric_value)



In [None]:
# format operation
# if the dtype is already numeric, skip these 4 cells


In [None]:
# key = df['Frequency of Purchases'].unique().tolist()
# values = []  ## assign your values here
# print(key)
# print(values)  ## len of key and values should be same

In [None]:
## `Frequency of Purchases` is not a valid Python identifier, so the mapping gets a snake_case name
# frequency_of_purchases_labels = dict(zip(key, values))  # label data as 'key': value pairs
# label['Frequency of Purchases'] = frequency_of_purchases_labels  # store (or update) this column's mapping in the main label dict

In [None]:
## Apply the mapping stored in label['Frequency of Purchases'] and assign the result back
## (avoids replace(..., inplace=True) on a column selection)
# df['Frequency of Purchases'] = df['Frequency of Purchases'].replace(label['Frequency of Purchases'])
# print('Modified DataFrame:')
# print(df['Frequency of Purchases'].head(10))

In [None]:
df['Frequency of Purchases'].unique()

In [None]:
# Pick the chart from the column's dtype: the label-encoding cells above are
# optional, so the column may still hold strings, which a density plot or a
# box plot cannot represent.
plot_col = 'Frequency of Purchases'
is_numeric = (pd.api.types.is_numeric_dtype(df[plot_col])
              and not pd.api.types.is_bool_dtype(df[plot_col]))

plt.figure(figsize=(10, 7))
if is_numeric:
    # First plot - Distribution plot (histplot replaces the deprecated distplot)
    sns.histplot(df[plot_col], kde=True, stat='density')
    plt.title(f'Distribution Plot for {plot_col}')  # Title for the plot
    plt.xlabel(f'{plot_col} Values')  # X-axis label
    plt.ylabel('Density')  # Y-axis label

    # Second plot - Boxplot (passing x= draws it horizontally)
    plt.figure(figsize=(10, 7))
    sns.boxplot(x=df[plot_col])
    plt.title(f'Box Plot for {plot_col}')  # Title for the plot
    plt.xlabel(plot_col)  # X-axis label
else:
    # Categorical column: show how often each category occurs, most common first
    sns.countplot(y=df[plot_col], order=df[plot_col].value_counts().index)
    plt.title(f'Count Plot for {plot_col}')  # Title for the plot
    plt.xlabel('Count')  # X-axis label
    plt.ylabel(plot_col)  # Y-axis label

<font size=3.5px color='blue'> # type some insight here </font>

#### <font size=3.5px color = 'blue'>clean data here</font>

In [None]:
## for removing null value
# df.dropna(subset=['Frequency of Purchases'], inplace=True)

In [None]:
# mean = df['Frequency of Purchases'].mean()
# print(mean)

# median = df['Frequency of Purchases'].median()
# print(median)

# mode = df['Frequency of Purchases'].mode()
# print(mode)

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Frequency of Purchases'].fillna(df['Frequency of Purchases'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Frequency of Purchases'].fillna(df['Frequency of Purchases'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Frequency of Purchases'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Frequency of Purchases'].fillna(df['Frequency of Purchases'].mode()[0], inplace=True)

#### <font size=3.5px color = 'blue'>outliers</font>

In [None]:
# remove_outliers(df,'Frequency of Purchases')

In [None]:
## 'Frequency of Purchasesc*' is the cleaned column that remove_outliers adds to df
# print('null value in Frequency of Purchasesc* == {}'.format(df['Frequency of Purchasesc*'].isnull().sum()))

In [None]:
## Example 1: Replace missing values in a numerical column with the mean
# df['Frequency of Purchasesc*'].fillna(df['Frequency of Purchasesc*'].mean(), inplace=True)

## Example 2: Replace missing values in a numerical column with the median
# df['Frequency of Purchasesc*'].fillna(df['Frequency of Purchasesc*'].median(), inplace=True)

## Example 3: if **non-numeric**

# df['Frequency of Purchasesc*'].value_counts()

 ## Replace missing values in a categorical column with the mode
# df['Frequency of Purchasesc*'].fillna(df['Frequency of Purchasesc*'].mode()[0], inplace=True)

### **<font size = 5px color = 'lime'>verify the process</font>**

In [None]:
df.head()

In [None]:
print('null value in [Frequency of Purchases] == {}'.format(df['Frequency of Purchases'].isnull().sum()))
# print('\n---\n')
# print('null value in Frequency of Purchasesc* == {}'.format(df['Frequency of Purchasesc*'].isnull().sum()))

In [None]:
# Outer double quotes keep the single-quoted column key valid on Python < 3.12
print(f"The dtype of [Frequency of Purchases] == {df['Frequency of Purchases'].dtype}")
# print('\n---\n')
# print(f"The dtype of Frequency of Purchasesc* is {df['Frequency of Purchasesc*'].dtype}")

In [None]:
## First plot - Distribution plot
# plt.figure(figsize=(10, 7))
# sns.distplot(df['Frequency of Purchasesc*'])
# plt.title('Distribution Plot for Frequency of Purchasesc*')  ## Title for the plot
# plt.xlabel('Frequency of Purchasesc* Values')  ## X-axis label
# plt.ylabel('Density')  ## Y-axis label

## Second plot - Boxplot
# plt.figure(figsize=(10, 7))
# sns.boxplot(df['Frequency of Purchasesc*'],orient='h')
# plt.title('Box Plot for Frequency of Purchasesc*')  ## Title for the plot
# plt.xlabel('Frequency of Purchasesc*')  ## X-axis label
# plt.ylabel('Frequency of Purchasesc* values')

In [None]:
# more plot ?


<font size=5px color='blue'> # type some insight here </font>

<hr style="border: 2px solid magenta;">

## [***](#INDEX)