# Phase 1: Data Cleaning and Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

#Loading the dataset
#NOTE: absolute, machine-specific path; the file must exist at this location for the notebook to run.
dataset_file_path = "C:\\Users\\TazeenQ\\team27_project\\data\\raw\\retail_sales_dataset.csv"
retail_data = pd.read_csv(dataset_file_path)

#Displaying the first couple of rows to check the data.
#head() only returns a DataFrame, so it has to be printed explicitly to show up
#when it is not the last expression of the cell; info() prints its report itself.
print(retail_data.head())
retail_data.info()



#Renaming column names: trim surrounding whitespace, replace inner spaces with
#underscores and lower-case everything (e.g. "Total Amount" becomes "total_amount")
retail_data.rename(columns = lambda x: x.strip().replace(" ", "_").lower(), inplace = True)

#Handling missing values: turning blank cells into NaN before any type conversion.
#pd.read_csv already reads completely empty fields as NaN, so the regex also covers
#whitespace-only cells (" ", "\t", ...), which would otherwise be counted as valid values.
retail_data.replace(r"^\s*$", np.nan, regex = True, inplace = True)

#Standardizing data types
#Converting date to datetime; errors = 'coerce' turns unparseable dates into NaT instead of raising
retail_data['date'] = pd.to_datetime(retail_data['date'], errors = 'coerce')

#Checking for missing values per column (taken after the date conversion so that
#unparseable dates are included in the count)
missing_summary = retail_data.isnull().sum()

#Dropping rows with date missing
retail_data.dropna(subset = ['date'], inplace = True)

#Filtering data, the dataset has values from 2023 and only 2 rows with data from 2024. I am removing those 2 rows.
#reset_index(drop = True) gives the filtered frame a clean 0..n-1 index again.
retail_data = retail_data[retail_data['date'].dt.year != 2024].reset_index(drop = True)

#Summarizing dataset features (include = 'all' also covers the non-numeric columns)
summary_stats = retail_data.describe(include = 'all')

#Choosing numerical columns to normalize
numerical_cols = ['age', 'quantity', 'price_per_unit', 'total_amount']

#Normalizing them: MinMaxScaler with its default settings rescales every column to the [0, 1] range
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(retail_data[numerical_cols])

#Creating a new DataFrame for the normalized data (retail_data itself stays unscaled)
normalized_dataset = retail_data.copy()
#fit_transform returns a plain array without row labels. Re-attaching retail_data's own
#index keeps every scaled row matched to its source row; without it pandas would align
#on a fresh 0..n-1 index and silently write NaN whenever retail_data's index differs.
normalized_dataset[numerical_cols] = pd.DataFrame(
    normalized_data,
    columns = numerical_cols,
    index = retail_data.index,
)

#One-hot encoding my categorical variables
categorical_columns = ['gender', 'product_category']
#drop_first: True for logistic regression (drops the first level of each variable, e.g. the
#beauty category, to avoid multicollinearity), False for KNN and random forests
encoded_date = pd.get_dummies(normalized_dataset, columns = categorical_columns, drop_first = False)

#NOTE(review): encoded_date is only kept in memory. The file written below is
#normalized_dataset, i.e. it still holds the original categorical columns and not
#the one-hot columns - confirm this is what the later phases expect.

#Saving the normalized dataset to a new file (index = False leaves the row index out of the CSV)
cleaned_dataset_file_path = "C:\\Users\\TazeenQ\\team27_project\\data\\processed\\processed_retail_sales_dataset.csv"
normalized_dataset.to_csv(cleaned_dataset_file_path, index = False)

#Final check of the saved data: info() prints its own report and returns None, so it is
#called on its own line instead of inside a tuple that would display as "(None, ...)"
normalized_dataset.info()
print(normalized_dataset.head())