This is an exploratory data analysis of individual household electric power consumption, using a data set from the UC Irvine Machine Learning Repository.
- R language interpreter
- R base graphics devices
- raw data set: Individual household electric power consumption
- CODEBOOK.md: step-by-step on how to get from the raw data to the tidy data set.
- analysis.R: a script that processes and cleans the raw data into a tidy data set
- plotn.R: the scripts that produce the plots in the images folder
For the analysis.R script to work, you need to download the raw data set and extract it as household_power_consumption.txt inside the project root folder.
- Setting the search interval
# Bounds of the date window to analyse, parsed from day/month/year text.
initialDate <- as.Date(strptime("01/02/2007", format = "%d/%m/%Y", tz = "GMT"))
finalDate <- as.Date(strptime("02/02/2007", format = "%d/%m/%Y", tz = "GMT"))
- Loading the raw data into the hpc data frame
# Read the raw semicolon-separated file; "?" marks a missing value.
# The first two columns are kept as text, the remaining seven are numeric.
hpc <- read.table(
  "household_power_consumption.txt",
  header = TRUE,
  sep = ";",
  na.strings = "?",
  colClasses = c(rep("character", 2), rep("numeric", 7))
)
# Quick checks: memory footprint and how the Date column was read.
object.size(hpc)
str(hpc[["Date"]])
class(hpc[["Date"]])
- Removing NAs
# Keep only the rows that have no missing value in any column.
hpc <- hpc[which(complete.cases(hpc)), ]
- Interpreting DATE from STRING character
# Turn the day/month/year text in Date into Date values, then peek at the
# first few converted entries.
hpc[["Date"]] <- as.Date(hpc[["Date"]], format = "%d/%m/%Y")
head(hpc[["Date"]])
- Subsetting the raw data into date interval
# Keep the rows whose Date lies between initialDate and finalDate (both
# inclusive); which() discards rows where the comparison is NA.
shpc <- hpc[which(hpc$Date >= initialDate & hpc$Date <= finalDate), ]
- Combining Date and Time columns into a vector
# Build one timestamp per row by joining the Date and Time columns with a
# space and parsing the result as POSIXct (local time zone).
# No names are attached to the vector: setNames() would label individual
# elements, not name a column. The column name comes from the variable name
# when the vector is later bound to the data frame.
dateTime <- as.POSIXct(paste(shpc$Date, shpc$Time))
- Removing Date and Time from data frame and adding dateTime as substitute
# Replace the separate Date and Time columns with the combined dateTime
# vector, which becomes the first column of the data frame.
shpc <- shpc[, setdiff(names(shpc), c("Date", "Time"))]
shpc <- data.frame(dateTime, shpc, check.names = FALSE)
- Checking the new tidy data set
# Inspect the tidy data set. The Date column has been removed by this point
# and dateTime holds the timestamps, so the first and last timestamps are
# read from dateTime (shpc$Date would just be NULL).
head(shpc$dateTime)
tail(shpc$dateTime)
str(shpc)
- Removing the full raw data frame, which is no longer needed
# Drop the full raw data frame from the workspace.
rm(list = "hpc")
- Saving the tidy data set into a csv file
# Save the tidy data set in the layout the plot scripts read back:
# semicolon-separated, header row, no row-name column, dateTime first.
# (write.csv would produce comma-separated text plus an extra row-name
# column, which does not match a reader using sep = ";" and eight column
# classes.)
# dateTime is written as explicit "YYYY-MM-DD HH:MM:SS" text so that every
# row, including midnight, keeps its time of day in the file.
write.table(
  transform(shpc, dateTime = format(dateTime, "%Y-%m-%d %H:%M:%S")),
  file = "tidyHouseholdPowerConsumption.csv",
  sep = ";",
  row.names = FALSE
)
- Import csv from the root directory
# Load the tidy data set: semicolon-separated with a header row. The first
# column is converted to POSIXct and the other seven are read as numeric.
# NOTE(review): this expects a ';'-separated file with exactly eight columns
# (no row-name column) - confirm the file was written that way.
shpc <- read.table(
  "tidyHouseholdPowerConsumption.csv",
  header = TRUE,
  sep = ";",
  colClasses = c("POSIXct", rep("numeric", 7))
)
- Create histogram & see current device
# Single-panel layout, then a red histogram of Global_active_power.
par(mfrow = c(1, 1))
hist(
  shpc$Global_active_power,
  main = "Global Active Power",
  xlab = "Global Active Power (kilowatts)",  # spelling fixed (was "kilowats")
  col = "red"
)
# Report which graphics device is currently active.
dev.cur()
- Saving the plot and closing the device
# Make sure the output folder exists; png() cannot create missing folders.
dir.create("./images", showWarnings = FALSE)
# Copy the current plot to a 480x480 PNG device. The copy becomes the
# current device, so dev.off() closes it and finishes writing the file.
dev.copy(png, filename = "./images/plot1.png", width = 480, height = 480)
dev.off()
- Import csv from the root directory
# Load the tidy data set: semicolon-separated with a header row. The first
# column is converted to POSIXct and the other seven are read as numeric.
# NOTE(review): this expects a ';'-separated file with exactly eight columns
# (no row-name column) - confirm the file was written that way.
shpc <- read.table(
  "tidyHouseholdPowerConsumption.csv",
  header = TRUE,
  sep = ";",
  colClasses = c("POSIXct", rep("numeric", 7))
)
# Stand-alone copy of the timestamp column.
dateTime <- as.POSIXct(shpc[["dateTime"]])
- Generate plot
# Single-panel layout, then Global_active_power drawn as a line over time.
# Inside with(), both names in the formula resolve to columns of shpc.
par(mfrow = c(1, 1))
with(
  shpc,
  plot(
    Global_active_power ~ dateTime,
    type = "l",
    main = "Global Active Power",
    xlab = "",
    ylab = "Global Active Power (kilowatts)"  # spelling fixed (was "kilowats")
  )
)
- Saving the plot and closing the device
# Make sure the output folder exists; png() cannot create missing folders.
dir.create("./images", showWarnings = FALSE)
# Copy the current plot to a 480x480 PNG device. The copy becomes the
# current device, so dev.off() closes it and finishes writing the file.
dev.copy(png, filename = "./images/plot2.png", width = 480, height = 480)
dev.off()
- Import csv from the root directory
# Load the tidy data set: semicolon-separated with a header row. The first
# column is converted to POSIXct and the other seven are read as numeric.
# NOTE(review): this expects a ';'-separated file with exactly eight columns
# (no row-name column) - confirm the file was written that way.
shpc <- read.table(
  "tidyHouseholdPowerConsumption.csv",
  header = TRUE,
  sep = ";",
  colClasses = c("POSIXct", rep("numeric", 7))
)
- Setting main variables
# Pull the timestamps and the three sub-metering series out of shpc as
# stand-alone vectors for plotting.
dateTime <- as.POSIXct(shpc[["dateTime"]])
sub1 <- shpc[["Sub_metering_1"]]
sub2 <- shpc[["Sub_metering_2"]]
sub3 <- shpc[["Sub_metering_3"]]
- Plot
# Single panel: the three sub-metering series over time, one colour each.
par(mfrow = c(1, 1))
with(shpc, {
  # The y axis carries the sub-metering readings, so it is labelled as such
  # (it used to be labelled "Global Active Power (kW)", which is a different
  # variable).
  plot(sub1 ~ dateTime, type = "l", xlab = "", ylab = "Energy sub metering")
  lines(sub2 ~ dateTime, col = "red")
  lines(sub3 ~ dateTime, col = "blue")
})
# The legend text is passed by name; as a second positional argument it
# would land in `y` and only work through legend()'s fallback.
legend(
  "topright",
  legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"),
  col = c("black", "red", "blue"),
  lwd = 1
)
- Saving file
# Make sure the output folder exists; png() cannot create missing folders.
dir.create("./images", showWarnings = FALSE)
# Copy the current plot to a 480x480 PNG device. The argument is spelled
# `filename` in full instead of relying on partial matching of `file`.
# The copy becomes the current device, so dev.off() closes it and finishes
# writing the file.
dev.copy(png, filename = "./images/plot3.png", width = 480, height = 480)
dev.off()
- Import csv from the root directory
# Load the tidy data set: semicolon-separated with a header row. The first
# column is converted to POSIXct and the other seven are read as numeric.
# NOTE(review): this expects a ';'-separated file with exactly eight columns
# (no row-name column) - confirm the file was written that way.
shpc <- read.table(
  "tidyHouseholdPowerConsumption.csv",
  header = TRUE,
  sep = ";",
  colClasses = c("POSIXct", rep("numeric", 7))
)
- Setting variables
# Pull the timestamps and the three sub-metering series out of shpc as
# stand-alone vectors for plotting.
dateTime <- as.POSIXct(shpc[["dateTime"]])
sub1 <- shpc[["Sub_metering_1"]]
sub2 <- shpc[["Sub_metering_2"]]
sub3 <- shpc[["Sub_metering_3"]]
- Plotting
# 2x2 grid of panels (filled row by row) with tightened margins.
par(mar = c(4, 4, 2, 1), mfrow = c(2, 2))
with(shpc, {
  # Panel 1: global active power over time.
  plot(Global_active_power ~ dateTime,
       type = "l",
       xlab = "",
       ylab = "Global Active Power (kW)")

  # Panel 2: voltage over time.
  plot(Voltage ~ dateTime,
       type = "l",
       xlab = "",
       ylab = "Voltage (V)")

  # Panel 3: the three sub-metering series. The y axis is labelled for the
  # sub-metering readings (it used to repeat the "Global Active Power (kW)"
  # label of panel 1).
  plot(sub1 ~ dateTime,
       type = "l",
       xlab = "",
       ylab = "Energy sub metering")
  lines(sub2 ~ dateTime, col = "red")
  lines(sub3 ~ dateTime, col = "blue")
  # bty = "n" draws the legend without a box.
  legend("topright",
         legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"),
         col = c("black", "red", "blue"),
         lty = 1,
         lwd = 2,
         bty = "n")

  # Panel 4: global reactive power over time (label spelling fixed, was
  # "Rective").
  plot(Global_reactive_power ~ dateTime,
       type = "l",
       xlab = "",
       ylab = "Global Reactive Power (kW)")
})
- Saving file
# Make sure the output folder exists; png() cannot create missing folders.
dir.create("./images", showWarnings = FALSE)
# Copy the current plot to a 480x480 PNG device. The argument is spelled
# `filename` in full instead of relying on partial matching of `file`.
# The copy becomes the current device, so dev.off() closes it and finishes
# writing the file.
dev.copy(png, filename = "./images/plot4.png", width = 480, height = 480)
dev.off()