# Importing utilities function from utils.py
<br>Loading the dataset.<br>
<br>Initial Review: Utilize head() to view the dataset's first few rows or summary statistics. 
<br>This step helps in identifying any apparent issues with data types or missing values.

In [5]:
import utils
# Load the dataset
file_path = './data/adjusted_retail_sales_data_v2.csv'
sales_data = utils.load_data(file_path)

if sales_data is not None:
    display(sales_data.head())

Unnamed: 0,SalesDate,ProductCategory,SalesAmount,CustomerAge,CustomerGender,CustomerLocation,ProductRatings
0,2023-10-15,Home Appliances,609,22,Male,USA,4
1,2023-09-16,Clothing,1367,22,Female,Australia,5
2,2022-09-06,Electronics,1736,22,Male,UK,2
3,2023-02-24,Female,1838,35,Clothing,India,2
4,2022-09-24,Home Appliances,1829,35,Male,UK,5


# <h5>Initial Exploration : Analyzing the raw dataset.</h5>
<p>It is often beneficial to perform initial exploratory data analysis, such as using <i><b>describe()</b></i> and <i><b>checking for missing values</b></i>
<br>Before making any transformations or filtering the dataset.
<br>This approach allows you to understand the dataset in its raw form and make informed decisions about how to clean and process it.</p>

In [6]:
sales_data = utils.convert_data_types(sales_data,
                                 date_cols=['SalesDate'],
                                 numeric_cols=['SalesAmount','CustomerAge' , 'ProductRatings'] ,
                                 categorical_cols=['ProductCategory' , 'CustomerGender','CustomerLocation'])

if sales_data is not None:
    utils.describe_statistics(sales_data) # Perform initial descriptive statistics analysis

     # Additional Categorical Analysis
    print("\nCategorical Columns Analysis:")

    utils.analyze_categorical_columns(sales_data, 
                                ['ProductCategory', 'CustomerGender', 'CustomerLocation'])  # Analyze categorical columns

    # Check and print missing values
    columns_with_missing_values  = utils.check_and_print_missing_values(sales_data)
    
    # Conclusions from initial exploration
    print("\n--- Initial Exploration Conclusions ---\n")
    if columns_with_missing_values:
        print(f"The dataset contains missing values in the following columns: {', '.join(columns_with_missing_values)}")
    else:
        print("There are no missing values in the dataset.")

    # Analyze potential outliers in numeric columns
    utils.analyze_outliers(sales_data, ['SalesAmount', 'CustomerAge', 'ProductRatings'])


Initial Descriptive Statistics:
                         SalesDate ProductCategory   SalesAmount  CustomerAge  \
count                          997            1000  9.940000e+02  1000.000000   
unique                         NaN               4           NaN          NaN   
top                            NaN     Electronics           NaN          NaN   
freq                           NaN             357           NaN          NaN   
mean    2023-01-11 20:46:27.562688             NaN  2.867501e+03    29.177000   
min            2022-01-02 00:00:00             NaN  2.200000e+01     2.000000   
25%            2022-07-03 00:00:00             NaN  5.200000e+02    22.000000   
50%            2023-01-26 00:00:00             NaN  9.870000e+02    22.000000   
75%            2023-07-16 00:00:00             NaN  1.428000e+03    35.000000   
max            2023-12-31 00:00:00             NaN  1.875000e+06   200.000000   
std                            NaN             NaN  5.944267e+04    11.680844

# <h2>Data Cleaning and Transformation Steps</h2>

<p><strong>1. Handling Missing Values:</strong></p>
<ul>
  <li><em>SalesAmount</em>: Impute missing values with the median or remove rows if missing values are not randomly distributed.</li>
  <li><em>ProductRatings</em>: Impute missing values with the median or remove rows.</li>
</ul>

<p><strong>2. Addressing Outliers:</strong></p>
<ul>
  <li><em>SalesAmount</em>: Identify and handle outliers using methods like IQR. Options include capping, replacing, or removing these values.</li>
  <li><em>ProductRatings</em>: Values outside the range of 1 to 5 should be corrected or removed.</li>
</ul>

<p><strong>3. Correcting Data Types:</strong></p>
<ul>
  <li>Convert <em>SalesDate</em> to datetime format.</li>
  <li>Ensure <em>SalesAmount</em> and <em>ProductRatings</em> are numeric.</li>
</ul>

<p><strong>4. Ensuring Data Consistency:</strong></p>
<ul>
  <li>Standardize categories in <em>ProductCategory</em>, <em>CustomerGender</em>, and <em>CustomerLocation</em>.</li>
</ul>

<p><strong>5. Filtering Inconsistent Data:</strong></p>
<ul>
  <li>Remove rows with invalid categories in <em>ProductCategory</em>, <em>CustomerGender</em>, or <em>CustomerLocation</em>.</li>
</ul>

<p><strong>6. Feature Engineering (if applicable):</strong></p>
<ul>
  <li>Create new features like month or year from <em>SalesDate</em>.</li>
</ul>


In [7]:
# ... [Code for data cleaning and transformation]
if sales_data is not None:
        # Assuming sales_data is already loaded and initial exploration is done
    
    # Handling Missing Values
    sales_data['SalesAmount'].fillna(sales_data['SalesAmount'].median(), inplace=True)
    sales_data['ProductRatings'].fillna(sales_data['ProductRatings'].median(), inplace=True)
    
    # Remove outliers from SalesAmount using the handle_outliers function
    sales_data = utils.handle_outliers_IQR(sales_data, 'SalesAmount')

    
    # Correcting Data Types and Ensuring Data Consistency
    sales_data = utils.convert_data_types(sales_data, date_cols=['SalesDate'], numeric_cols=['SalesAmount', 'ProductRatings'])
    
    # Feature Engineering
    sales_data['Month'] = sales_data['SalesDate'].dt.month
    sales_data['Year'] = sales_data['SalesDate'].dt.year
    
    # Filtering Inconsistent Data
    sales_data = utils.filter_data(sales_data, 'ProductCategory', ['Clothing', 'Electronics', 'Home Appliances'])
    sales_data = utils.filter_data(sales_data, 'CustomerGender', ['Male', 'Female', 'Non-binary'])
    sales_data = utils.filter_data(sales_data, 'CustomerLocation', ['Japan', 'Australia', 'India', 'USA', 'UK', 'Canada'])
    sales_data = utils.filter_data(sales_data, 'ProductRatings', [1, 2, 3, 4, 5])
    
    # Exploration After Cleaning
    # Final check for missing values and outliers
    print("\nMissing Values After Cleaning:")
    missing_values_post_cleaning = utils.check_missing_values(sales_data)
    print(missing_values_post_cleaning)
    
    print("\nDescriptive Statistics After Cleaning:")
    print(utils.describe_data(sales_data, ['SalesAmount', 'ProductRatings']))


    # Example Conclusions from Post-Cleaning Exploration
    print("\n--- Post-Cleaning Exploration Conclusions ---")
    if missing_values_post_cleaning.sum() == 0:
        print("Missing values have been successfully addressed.")
    else:
        print("There are still missing values that need further attention.")
    
 # Analyze potential outliers in numeric columns
    utils.analyze_outliers(sales_data, ['SalesAmount', 'CustomerAge', 'ProductRatings'])


Missing Values After Cleaning:
SalesDate           3
ProductCategory     0
SalesAmount         0
CustomerAge         0
CustomerGender      0
CustomerLocation    0
ProductRatings      0
Month               3
Year                3
dtype: int64

Descriptive Statistics After Cleaning:
       SalesAmount  ProductRatings
count   971.000000      971.000000
mean    982.764161        2.941298
std     535.368536        1.387594
min      22.000000        1.000000
25%     520.000000        2.000000
50%     987.000000        3.000000
75%    1428.000000        4.000000
max    1994.000000        5.000000

--- Post-Cleaning Exploration Conclusions ---
There are still missing values that need further attention.

--- Potential Outliers Analysis ---
The 'SalesAmount' column may contain outliers as indicated by a high max/standard deviation ratio.
The 'CustomerAge' column may contain outliers as indicated by a high max/standard deviation ratio.
The 'ProductRatings' column may contain outliers as indicate

# <h5>To analyze the patterns like average sales per product category, age distribution of customers, and typical product ratings from your dataset, we can use Pandas to group and aggregate the data. Let's break down the analysis into three parts:</h5>
<ol>
<li ><b>Average Sales per Product Category:</b> This will show how sales vary across different product categories.</li>
<li><b>Age Distribution of Customers:</b> This will give insights into the demographic spread of the customers, which is vital for understanding your customer base.</li>
<li><b>Typical Product Ratings: </b>This will reveal how products are rated on average, which can be indicative of product performance and customer satisfaction.</li>
</ol>




In [4]:
average_sales_category = sales_data.groupby('ProductCategory')['SalesAmount'].mean()
print("Average Sales per Product Category:")
print(average_sales_category)

age_distribution = sales_data['CustomerAge'].value_counts().sort_index()
print("Age Distribution of Customers:")
print(age_distribution)

average_product_ratings = sales_data['ProductRatings'].mean()
print("Average Product Ratings:")
print(average_product_ratings)

Average Sales per Product Category:
ProductCategory
Clothing           976.212698
Electronics        986.174785
Female                    NaN
Home Appliances    985.609121
Name: SalesAmount, dtype: float64
Age Distribution of Customers:
CustomerAge
2        1
20       1
22     493
35     467
95       1
100      1
101      1
105      2
112      1
120      1
180      1
200      1
Name: count, dtype: int64
Average Product Ratings:
2.94129763130793


  average_sales_category = sales_data.groupby('ProductCategory')['SalesAmount'].mean()
