<a href="https://colab.research.google.com/github/vaanchhitbaranwal-ux/vaanchhit/blob/main/Fitness_Progress_tracker_smart_device_case_study_bellabeat_casestudy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This R environment comes with many helpful analytics packages installed
# loading library

library(tidyverse) # metapackage of all tidyverse packages
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the five Fitbit CSV exports from the Kaggle input directory.
# The paths are kept in one named vector so each file is listed exactly once.
fitbit_csv <- c(
  activity = "/kaggle/input/fitbitdata/dailyActivity_merged.csv",
  calories = "/kaggle/input/fitbitdata/hourlyCalories_merged.csv",
  intensities = "/kaggle/input/fitbitdata/hourlyIntensities_merged.csv",
  sleep = "/kaggle/input/fitbitdata/sleepDay_merged.csv",
  weight = "/kaggle/input/fitbitdata/weightLogInfo_merged.csv"
)
fitbit_tables <- lapply(fitbit_csv, read.csv)

# Expose every table under the name the rest of the notebook uses
activity <- fitbit_tables[["activity"]]
calories <- fitbit_tables[["calories"]]
intensities <- fitbit_tables[["intensities"]]
sleep <- fitbit_tables[["sleep"]]
weight <- fitbit_tables[["weight"]]

In [None]:
# Preview the first six rows of the daily activity table
head(activity, n = 6L)

In [None]:
# Print the structure (column names, types, leading values) of every table,
# in the order: activity, calories, intensities, sleep, weight.
# str() prints as a side effect, so the list returned by lapply() is discarded.
invisible(
  lapply(list(activity, calories, intensities, sleep, weight), str)
)

In [None]:

# Per-column summary statistics, one table at a time

# Daily activity
activity %>% summary()

# Hourly calories
calories %>% summary()

# Hourly intensities
intensities %>% summary()

# Daily sleep
sleep %>% summary()

# Weight log
weight %>% summary()

In [None]:

# Count NA entries per column, one table at a time

# Daily activity
activity %>% is.na() %>% colSums()

# Hourly calories
calories %>% is.na() %>% colSums()

# Hourly intensities
intensities %>% is.na() %>% colSums()

# Daily sleep
sleep %>% is.na() %>% colSums()

# Weight log
weight %>% is.na() %>% colSums()

In [None]:
# Count fully duplicated rows, one table at a time

# Daily activity
activity %>% duplicated() %>% sum()

# Hourly calories
calories %>% duplicated() %>% sum()

# Hourly intensities
intensities %>% duplicated() %>% sum()

# Daily sleep
sleep %>% duplicated() %>% sum()

# Weight log
weight %>% duplicated() %>% sum()

In [None]:

# Flag TotalSteps outliers with the 1.5 * IQR rule.
# Both quartiles come from a single quantile() call.
step_quartiles <- quantile(activity$TotalSteps, probs = c(0.25, 0.75))
Q1 <- step_quartiles[1]
Q3 <- step_quartiles[2]
IQR_steps <- Q3 - Q1

# Fences sit 1.5 interquartile ranges outside the quartiles
fence_width <- 1.5 * IQR_steps
lower_bound <- Q1 - fence_width
upper_bound <- Q3 + fence_width

# Keep only the rows whose step count lies inside the fences (inclusive)
activity_clean <- filter(
  activity,
  TotalSteps >= lower_bound,
  TotalSteps <= upper_bound
)

# Visual check: the boxplot of the retained step counts
boxplot(
  x = activity_clean$TotalSteps,
  col = "lightblue",
  main = "Boxplot for Total Steps (After Outlier Removal)"
)

In [None]:
# Drop fully duplicated rows from the sleep table
sleep_clean <- distinct(sleep)

# Sanity check: the duplicate count should now be 0
sleep_clean %>% duplicated() %>% sum()

In [None]:

# Total steps and total calories per calendar day, summed over all users.
#
# ActivityDate arrives from read.csv() as text. Calling as.Date() without a
# format only tries "%Y-%m-%d" and "%Y/%m/%d", so month/day/year strings are
# mis-parsed (wrong year) or become NA and are then silently dropped by
# na.omit(). The date is therefore parsed with explicit candidate formats,
# *before* grouping, so the groups are real dates.
# NOTE(review): assumes ActivityDate is month/day/year (e.g. "4/12/2016"), as
# in the public Fitbit export -- confirm against str(activity). ISO dates are
# accepted as a fallback; anything else raises an error instead of going NA.
activity_summary <- activity_clean %>%
  mutate(
    ActivityDate = as.Date(
      ActivityDate,
      tryFormats = c("%m/%d/%Y", "%Y-%m-%d", "%Y/%m/%d")
    )
  ) %>%
  group_by(ActivityDate) %>%
  summarize(
    total_steps = sum(TotalSteps, na.rm = TRUE),
    total_calories = sum(Calories, na.rm = TRUE)
  )

# Remove rows with missing values (e.g. dates that could not be parsed)
activity_summary <- activity_summary %>% na.omit()

# Visualize total steps over time
ggplot(activity_summary, aes(x = ActivityDate, y = total_steps)) +
  geom_line(color = "blue") +
  labs(title = "Total Steps Per Day", x = "Date", y = "Total Steps")

# Average steps per user-day record (after outlier removal)
average_steps <- mean(activity_clean$TotalSteps, na.rm = TRUE)
print(paste("Average steps per day: ", round(average_steps, 2)))

In [None]:
# Total minutes asleep per calendar day, summed over all users.
#
# SleepDay is read as text; plotting it as-is puts the x-axis in alphabetical
# order (e.g. "5/10/..." before "5/2/..."), so the line zig-zags through time.
# Parsing it to Date gives a chronological, continuous axis.
# NOTE(review): assumes SleepDay starts with a month/day/year date (a trailing
# time of day, if any, is ignored by the format) -- confirm against str(sleep).
sleep_summary <- sleep_clean %>%
  mutate(
    SleepDay = as.Date(
      SleepDay,
      tryFormats = c("%m/%d/%Y", "%Y-%m-%d", "%Y/%m/%d")
    )
  ) %>%
  group_by(SleepDay) %>%
  summarize(total_sleep_minutes = sum(TotalMinutesAsleep, na.rm = TRUE))

# Visualize total sleep duration over time
ggplot(sleep_summary, aes(x = SleepDay, y = total_sleep_minutes)) +
  geom_line(color = "orange") +
  labs(
    title = "Total Sleep Duration Per Day",
    x = "Date",
    y = "Total Minutes Asleep"
  )

# Average sleep time per user-day record; na.rm matches the activity summary
average_sleep <- mean(sleep_clean$TotalMinutesAsleep, na.rm = TRUE)
print(paste("Average sleep time per day (minutes): ", round(average_sleep, 2)))

In [None]:

# Calories and intensity by hour of day (0-23), summed over all users and days.
#
# ActivityHour is a full timestamp, so grouping on it directly yields one
# group per date-hour rather than the 24 hour-of-day buckets that the plot
# titles and axis labels describe. The hour is extracted first with lubridate.
# NOTE(review): assumes ActivityHour is "month/day/year hour:min:sec AM/PM",
# as in the public Fitbit export -- confirm against str(calories). Values that
# fail to parse become NA (with a warning) and form their own NA group.

# Total calories burned per hour of day
calories_summary <- calories %>%
  mutate(hour_of_day = hour(mdy_hms(ActivityHour))) %>%
  group_by(hour_of_day) %>%
  summarize(total_calories = sum(Calories, na.rm = TRUE))

# Visualize calories burned by hour
ggplot(calories_summary, aes(x = hour_of_day, y = total_calories)) +
  geom_col(fill = "orange") +
  scale_x_continuous(breaks = seq(0, 23, by = 3)) +
  labs(
    title = "Calories Burned By Hour",
    x = "Hour of Day",
    y = "Total Calories Burned"
  )

# Total intensity per hour of day
intensities_summary <- intensities %>%
  mutate(hour_of_day = hour(mdy_hms(ActivityHour))) %>%
  group_by(hour_of_day) %>%
  summarize(total_intensity = sum(TotalIntensity, na.rm = TRUE))

# Visualize intensity levels by hour
ggplot(intensities_summary, aes(x = hour_of_day, y = total_intensity)) +
  geom_line(color = "red") +
  scale_x_continuous(breaks = seq(0, 23, by = 3)) +
  labs(
    title = "Activity Intensity Levels By Hour",
    x = "Hour of Day",
    y = "Total Intensity"
  )

In [None]:
# Relate logged body weight to daily step counts.
#
# Merge the weight log with the cleaned activity table on the user key 'Id'.
# NOTE(review): joining on Id alone pairs every weight entry of a user with
# every activity day of that user, so users with more records weigh more in
# the correlation below. Consider also joining on the date once the weight
# log's date column is confirmed.
merged_data <- merge(weight, activity_clean, by = "Id")

# Check structure of merged data to confirm merge was successful
str(merged_data)

# Pearson correlation between weight (WeightKg) and total steps (TotalSteps).
# The original call had lost its `$WeightKg, merged_data$` text and did not
# parse; both columns are now passed explicitly.
correlation_steps_weight <- cor(
  merged_data$WeightKg,
  merged_data$TotalSteps,
  use = "complete.obs"
)
print(paste("Correlation between weight and total steps: ", round(correlation_steps_weight, 2)))

# Weight distribution visualization
ggplot(weight, aes(x = WeightKg)) +
  geom_histogram(fill = "purple", bins = 10) +
  labs(title = "Distribution of User Weights", x = "Weight (Kg)", y = "Count")

In [None]:
# Check column names in weight and activity_clean datasets
colnames(weight)
colnames(activity_clean)

# Merge weight and activity_clean on the shared user key 'Id'.
# NOTE(review): an Id-only merge produces every weight-entry x activity-day
# pair per user; see whether a date key should be added.
merged_data <- merge(weight, activity_clean, by = "Id")

# Pearson correlation between weight (WeightKg) and total steps (TotalSteps).
# The original call had lost its `$WeightKg, merged_data$` text and did not
# parse; both columns are now passed explicitly.
correlation_steps_weight <- cor(
  merged_data$WeightKg,
  merged_data$TotalSteps,
  use = "complete.obs"
)
print(paste("Correlation between weight and total steps: ", round(correlation_steps_weight, 2)))

In [None]:
# Steps vs calories on the outlier-filtered data: semi-transparent points
# with a straight least-squares trend line and no confidence band.
activity_clean %>%
  ggplot(aes(TotalSteps, Calories)) +
  labs(
    title = "Total Steps vs Calories",
    subtitle = "Relationship between steps taken and calories burned",
    x = "Total Steps",
    y = "Calories"
  ) +
  geom_point(alpha = 0.7, size = 3, color = "dodgerblue") +
  geom_smooth(se = FALSE, color = "darkorange", method = "lm") +
  theme_minimal()



# Same relationship on the unfiltered data, using geom_smooth()'s default
# smoother and confidence band.
activity %>%
  ggplot(aes(TotalSteps, Calories)) +
  labs(title = "Total Steps vs. Calories") +
  geom_point() +
  geom_smooth()

In [None]:
# Time in bed vs minutes actually asleep, with a linear trend line
# (no confidence band) on a light theme.
sleep_clean %>%
  ggplot(aes(TotalTimeInBed, TotalMinutesAsleep)) +
  labs(
    title = "Total Minutes Asleep vs Total Time in Bed",
    subtitle = "Sleep Efficiency: Time in bed vs actual sleep time",
    x = "Total Time in Bed (minutes)",
    y = "Total Minutes Asleep"
  ) +
  geom_point(alpha = 0.6, size = 3, color = "forestgreen") +
  geom_smooth(se = FALSE, color = "blue", method = "lm") +
  theme_light()

In [None]:
# Pair each hourly intensity record with the calories record of the same
# user ('Id') and the same hour ('ActivityHour').
hourly_keys <- c("Id", "ActivityHour")
merged_intensity_calories <- merge(x = intensities, y = calories, by = hourly_keys)

# Inspect the merged table
str(merged_intensity_calories)

# Intensity vs calories: semi-transparent points plus a linear trend line
merged_intensity_calories %>%
  ggplot(aes(TotalIntensity, Calories)) +
  labs(
    title = "Activity Intensity vs Calories Burned",
    subtitle = "Exploring the correlation between activity intensity and calories burned",
    x = "Total Intensity",
    y = "Calories Burned"
  ) +
  geom_point(alpha = 0.6, color = "orange") +
  geom_smooth(se = FALSE, color = "red", method = "lm") +
  theme_minimal()

In [None]:
# Relate a user's steps on a given day to that user's sleep on the same day.
#
# Merging on 'Id' alone pairs every activity day with every sleep day of the
# same user, which inflates the row count and mixes unrelated days. Both
# tables therefore get a calendar-date key and are merged on user and date.
# NOTE(review): assumes ActivityDate and SleepDay both start with a
# month/day/year date (a trailing time of day is ignored by the format), as
# in the public Fitbit export -- confirm against str(activity) / str(sleep).
date_formats <- c("%m/%d/%Y", "%Y-%m-%d", "%Y/%m/%d")

activity_by_day <- activity_clean %>%
  mutate(record_date = as.Date(ActivityDate, tryFormats = date_formats))

sleep_by_day <- sleep_clean %>%
  mutate(record_date = as.Date(SleepDay, tryFormats = date_formats))

merged_activity_sleep <- merge(
  activity_by_day,
  sleep_by_day,
  by = c("Id", "record_date")
)

# Scatter plot for steps vs total minutes asleep
ggplot(merged_activity_sleep, aes(x = TotalSteps, y = TotalMinutesAsleep)) +
  geom_point(color = "purple", size = 3, alpha = 0.6) + # Transparent purple points
  geom_smooth(method = "lm", color = "darkblue", se = FALSE) + # Regression line
  theme_minimal() + # Minimalist theme
  labs(title = "Total Steps vs Total Minutes Asleep",
       subtitle = "Analyzing the relationship between physical activity and sleep",
       x = "Total Steps",
       y = "Total Minutes Asleep")

In [None]:
# Calories burned by hour of day (0-23), summed over all users and days.
#
# ActivityHour is a full timestamp, so using it directly as the x-axis draws
# one bar position per date-hour and stacks every raw row, instead of the 24
# hour-of-day bars the labels describe. The hour is extracted and the data is
# aggregated first; bar heights are the per-hour calorie totals.
# NOTE(review): assumes ActivityHour is "month/day/year hour:min:sec AM/PM",
# as in the public Fitbit export -- confirm against str(calories).
hourly_calories <- calories %>%
  mutate(hour_of_day = hour(mdy_hms(ActivityHour))) %>%
  group_by(hour_of_day) %>%
  summarize(total_calories = sum(Calories, na.rm = TRUE))

# Bar chart for calories burned by time of day (styled version)
ggplot(hourly_calories, aes(x = hour_of_day, y = total_calories)) +
  geom_col(fill = "lightblue", color = "darkblue", alpha = 0.8) + # Light blue bars
  scale_x_continuous(breaks = seq(0, 23, by = 3)) +
  theme_minimal() + # Minimal theme
  labs(title = "Calories Burned by Time of Day",
       subtitle = "Hourly calories burned throughout the day",
       x = "Hour of Day",
       y = "Calories Burned")

# Calories burned by time of day (hourly), default theme
ggplot(hourly_calories, aes(x = hour_of_day, y = total_calories)) +
  geom_col(fill = "lightblue") +
  scale_x_continuous(breaks = seq(0, 23, by = 3)) +
  labs(title = "Calories Burned by Time of Day", x = "Hour of Day", y = "Calories Burned")