# Data Preprocessing, Exploration & Visualizations

In [None]:
# Load the raw supermarket sales records and print the structure of the
# resulting data frame (column names, types and a preview of the values).
sales_csv_path <- "../input/supermarket-sales/supermarket_sales - Sheet1.csv"
data <- read.csv(file = sales_csv_path)
str(object = data)

The invoice ID feature is of no use to us, so let us drop it. We also need to check whether there are any missing values in the data set. Additionally, let's take a look at the summary statistics of the data and convert the Date column to a standardized format.

In [None]:
library(lubridate)
# Drop the first column (the invoice identifier mentioned above); it is not
# used in any of the analysis that follows.
data_removed <- data[, -1]

# Parse the month/day/year text into Date values.
# The earlier "%m/%d/%y" format consumes only two year digits, so a four-digit
# year such as "2019" is read as 2020 -- presumably why the year then had to be
# overwritten with 2019 by hand. mdy() accepts both two- and four-digit years,
# so the parsed dates keep the year that is actually present in the data.
data_removed$Date <- mdy(as.character(data_removed$Date))

# Number of missing values in every column (vapply guarantees an integer
# vector regardless of the input), followed by per-column summary statistics.
vapply(data_removed, function(column) sum(is.na(column)), integer(1))
summary(data_removed)

Let's try to arrive at some notable insights. First, let's look at which branches customers rated high or low.

In [None]:
library(ggplot2)
library(gridExtra)

# Notched box plot of customer ratings per branch, with two dashed reference
# lines and the city name printed above each branch.

# One row per branch so that each city label is drawn exactly once. Mapping
# constants inside aes() on the full data set would instead draw one copy of
# the label (or reference line) for every row, stacking them on top of
# each other.
branch_cities <- data.frame(
  Branch = c("A", "B", "C"),
  Rating = 7.5,
  city = c("Yangon", "Mandalay", "Naypyitaw")
)

ggplot(data = data_removed, mapping = aes(x = Branch, y = Rating)) +
  geom_boxplot(mapping = aes(fill = Branch), notch = TRUE) +
  # Passing yintercept as a parameter draws a single line per value.
  geom_hline(yintercept = c(7.1, 6.7), linetype = "dashed") +
  # x and y are inherited from the plot mapping (Branch, Rating).
  geom_text(data = branch_cities, mapping = aes(label = city)) +
  scale_y_continuous(breaks = c(4, 5.3, 5.6, 6.7, 7.1, 8.2, 8.5, 10)) +
  theme_linedraw() +
  ggtitle("Box Plot Showing Relationship between Branch and Ratings") +
  xlab("Branch") +
  ylab("Rating")

# Stacked bar chart of customer counts per branch, coloured by customer type
# and split into one panel per gender.
p <- ggplot(data = data_removed, mapping = aes(x = Branch)) +
  geom_bar(mapping = aes(fill = Customer.type)) +
  theme_linedraw() +
  ggtitle("Distribution of Customers in Each Branch Based on Gender") +
  xlab("Branch") +
  ylab("Number of Customers")

# Facet on the Gender column with a formula. Handing facet_wrap() the raw
# vector data_removed$Gender is not a documented faceting specification: a
# character vector is interpreted as a set of variable names to facet by.
p +
  facet_wrap(~ Gender) +
  scale_y_continuous(breaks = c(0, 25, 50, 75, 100, 125, 150, 175)) +
  labs(caption = "Female(Members) - 80/85/96 \n Male(Members) - 87/80/73")

# Stacked bar chart of customer counts per branch, coloured by customer type
# and split into one panel per payment mode.
g <- ggplot(data = data_removed, mapping = aes(x = Branch)) +
  geom_bar(mapping = aes(fill = Customer.type)) +
  theme_linedraw() +
  ggtitle("Distribution of Customers in Each Branch Based on Payment Mode") +
  xlab("Branch") +
  ylab("Number of Customers")

# Facet on the Payment column with a formula. Handing facet_wrap() the raw
# vector data_removed$Payment is not a documented faceting specification: a
# character vector is interpreted as a set of variable names to facet by.
g +
  facet_wrap(~ Payment) +
  scale_y_continuous(breaks = c(0, 25, 50, 75, 100, 125, 150, 175)) +
  labs(caption = "Cash(Members) - 56/53/59 \n CreditCard(Members) - 49/63/60 \n Ewallet(Members) - 62/49/50")

In [None]:
library(dplyr)
# Sum `Total` for every distinct date. Converting the cross-tabulation to a
# data frame yields the columns `Date` (as a factor) and `Freq` (the sum).
daily_totals <- xtabs(formula = Total ~ Date, data = data_removed)
total_sales_per_day <- as.data.frame(daily_totals)

# Turn the factor labels back into real dates so the x axis is continuous.
total_sales_per_day[["Date"]] <- as.Date(total_sales_per_day[["Date"]])

# Line chart of the daily revenue across all branches.
ggplot(total_sales_per_day, aes(x = Date, y = Freq)) +
  geom_line() +
  theme_linedraw() +
  labs(
    title = "Time Series Relationship for the Total Sales per day",
    x = "Date",
    y = "Total Sales Per Day"
  )

In [None]:
library(gridExtra)
# Collapse a set of transactions into one row per date. The result has the
# columns `Date` (converted back from factor labels to Date) and `Freq`
# (the sum of `Total` on that date).
sum_total_by_day <- function(branch_sales) {
  per_day <- as.data.frame(xtabs(formula = Total ~ Date, data = branch_sales))
  per_day$Date <- as.Date(per_day$Date)
  per_day
}

# Transactions of each branch.
A <- filter(data_removed, Branch == "A")
B <- filter(data_removed, Branch == "B")
C <- filter(data_removed, Branch == "C")

# Daily revenue of each branch.
total_A <- sum_total_by_day(A)
total_B <- sum_total_by_day(B)
total_C <- sum_total_by_day(C)

# Line chart of daily revenue (`Freq` against `Date`) for a single branch;
# `branch_name` is appended to the common title prefix.
branch_trend_plot <- function(daily_totals, branch_name) {
  ggplot(daily_totals, aes(x = Date, y = Freq)) +
    geom_line() +
    theme_linedraw() +
    labs(
      title = paste0(
        "Time Series Relationship for the Total Sales per day in Branch ",
        branch_name
      ),
      x = "Date",
      y = "Total Sales Per Day"
    )
}

plot1 <- branch_trend_plot(total_A, "A")
plot2 <- branch_trend_plot(total_B, "B")
plot3 <- branch_trend_plot(total_C, "C")

# Draw the three branch charts together in one figure.
grid.arrange(plot1, plot2, plot3)

In [None]:
# Keep only the transactions whose date falls in the given month number; a
# `month` column holding that number is added to the result.
sales_in_month <- function(sales, month_number) {
  sales %>%
    mutate(month = month(Date)) %>%
    filter(month == month_number)
}

# Box plot of `Total` per product line for one month, with abbreviated
# x-axis labels. `show_legend` is handed to geom_boxplot(); NA is the
# ggplot2 default.
product_line_boxplot <- function(month_sales, month_name, show_legend = NA) {
  ggplot(month_sales, aes(x = Product.line, y = Total)) +
    geom_boxplot(
      aes(fill = Product.line),
      outlier.color = "red",
      show.legend = show_legend
    ) +
    scale_x_discrete(labels = c("Elc", "Fsh", "Fod", "Hel", "Hme", "Spr")) +
    theme_linedraw() +
    labs(
      title = paste(month_name, "Sales Based on Product Lines"),
      x = "Product Lines",
      y = "Total Sales"
    )
}

Jan_sales <- sales_in_month(data_removed, 1)
Feb_sales <- sales_in_month(data_removed, 2)
Mar_sales <- sales_in_month(data_removed, 3)

# Only the March panel keeps the default legend behaviour.
p1 <- product_line_boxplot(Jan_sales, "January", show_legend = FALSE)
p2 <- product_line_boxplot(Feb_sales, "February", show_legend = FALSE)
p3 <- product_line_boxplot(Mar_sales, "March")

grid.arrange(p1, p2, p3)

In [None]:
# Notched box plot of `Total` per branch for one month. `show_legend` is
# handed to geom_boxplot(); NA is the ggplot2 default.
branch_sales_boxplot <- function(month_sales, month_name, show_legend = NA) {
  ggplot(month_sales, aes(x = Branch, y = Total)) +
    geom_boxplot(
      aes(fill = Branch),
      notch = TRUE,
      outlier.color = "red",
      show.legend = show_legend
    ) +
    theme_linedraw() +
    labs(
      title = paste("Branch Wise", month_name, "Sales"),
      x = "Branches",
      y = "Total Sales"
    )
}

# Only the March panel keeps the default legend behaviour.
p1 <- branch_sales_boxplot(Jan_sales, "January", show_legend = FALSE)
p2 <- branch_sales_boxplot(Feb_sales, "February", show_legend = FALSE)
p3 <- branch_sales_boxplot(Mar_sales, "March")

grid.arrange(p1, p2, p3)