# FitArena - Exploratory Data Analysis
## Sports Performance Analytics Platform

This notebook performs comprehensive exploratory data analysis on Fitbit daily activity data (the Fitabase export covering 4.12.16–5.12.16) to understand athlete performance patterns and to prepare features for machine learning models.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Configure plotting
# Apply seaborn's 'whitegrid' style, then set the default matplotlib figure
# size; later cells that pass an explicit figsize override this default.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# IPython line magic (not plain Python): selects the inline matplotlib backend
# so figures render in the notebook output.
%matplotlib inline

# Confirmation that the cell ran through to the end.
print('Libraries imported successfully!')

In [None]:
# Import custom modules
import sys
sys.path.append('..')

from src.data_processing.data_loader import DataLoader
from src.data_processing.data_validator import DataValidator, DataCleaner

print('Custom modules imported!')

## 1. Data Loading

In [None]:
# Initialize data loader
# DataLoader is the project's loader class; it is pointed at the data
# directory one level above the notebook folder.
loader = DataLoader(data_dir='../data')

# Load all datasets
print('Loading all datasets...')
data = loader.load_all_data(folder='Fitabase Data 4.12.16-5.12.16')

# List what was actually loaded, so the heading below is followed by content.
# `data` is indexed by dataset name later in the notebook (data['daily_activity']);
# iterating it is assumed to yield those names — TODO confirm against DataLoader.
print('Datasets loaded:')
for dataset_name in data:
    # Not every entry is guaranteed to be a DataFrame, so read shape defensively.
    dataset_shape = getattr(data[dataset_name], 'shape', None)
    if dataset_shape is None:
        print(f'  - {dataset_name}')
    else:
        print(f'  - {dataset_name}: shape {dataset_shape}')

In [None]:
# Get data summary
# The summary is produced by the project's DataLoader.get_data_summary(); its
# exact structure is defined there (presumably one entry per loaded dataset —
# TODO confirm).
summary = loader.get_data_summary(data)
print('Data Summary:')
# Bare last expression so the notebook renders `summary` with its rich repr.
summary

## 2. Daily Activity Analysis

In [None]:
# Pull the daily activity table out of the loaded datasets
daily_activity = data['daily_activity']

# Headline facts (table size, covered date span, distinct user ids),
# emitted one per line through a single print call
print(
    f'Daily Activity Shape: {daily_activity.shape}',
    f'Date Range: {daily_activity["ActivityDate"].min()} to {daily_activity["ActivityDate"].max()}',
    f'Unique Users: {daily_activity["Id"].nunique()}',
    sep='\n',
)

# Preview the first rows as the cell's rendered output
daily_activity.head()

In [None]:
# Statistical summary
# describe() produces pandas' default summary statistics for the table;
# rounding to 2 decimals keeps the rendered output readable.
print('Statistical Summary of Daily Activity:')
daily_activity.describe().round(2)

In [None]:
# Count nulls per column, then show only the columns that have at least one
print('Missing Values:')
missing = daily_activity.isna().sum()
missing.loc[missing.gt(0)]

## 3. Data Quality Assessment

In [None]:
# Create the project's validator
validator = DataValidator()

# Per-column value pairs handed to the validator as range rules
# (presumably (min, max) plausibility bounds — confirm in DataValidator)
plausible_ranges = {
    'TotalSteps': (0, 50000),
    'Calories': (0, 10000),
    'TotalDistance': (0, 50),
}

# Build the validation report for the table registered as 'daily_activity'
validation_report = validator.generate_validation_report(
    daily_activity,
    'daily_activity',
    range_rules=plausible_ranges,
)

# Report the overall quality score and the duplicate-row statistics
print(f'Data Quality Score: {validation_report["quality_score"]}/100')
print(f'Duplicates: {validation_report["duplicates"]["total_duplicates"]} ({validation_report["duplicates"]["duplicate_percentage"]}%)')

## 4. Activity Distribution Analysis

In [None]:
# Distribution of key metrics: a 2x2 grid of histograms, one panel per metric,
# each with a dashed red line marking the mean
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, bar colour, panel title, x-axis label), in row-major panel order
histogram_panels = [
    ('TotalSteps', 'skyblue', 'Distribution of Total Steps', 'Total Steps'),
    ('Calories', 'lightgreen', 'Distribution of Calories Burned', 'Calories'),
    ('TotalDistance', 'lightcoral', 'Distribution of Total Distance', 'Distance (km)'),
    ('SedentaryMinutes', 'plum', 'Distribution of Sedentary Minutes', 'Sedentary Minutes'),
]

# axes.flat walks the grid as [0, 0], [0, 1], [1, 0], [1, 1]
for ax, (column, colour, title, x_label) in zip(axes.flat, histogram_panels):
    values = daily_activity[column]
    ax.hist(values, bins=50, color=colour, edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.axvline(values.mean(), color='red', linestyle='--', label='Mean')
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations across every numeric column of the daily table
numeric_cols = daily_activity.select_dtypes(include=[np.number]).columns
correlation_matrix = daily_activity.loc[:, numeric_cols].corr()

# Heatmap styling gathered in one place: annotated cells with 2 decimals,
# diverging palette centred on 0, square cells, slightly shrunken colour bar
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}

# Draw the heatmap on a fresh 14x10 figure
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Activity Metrics', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Key correlations
# The correlation matrix is symmetric with a diagonal of self-correlations.
# Keeping only the strict upper triangle (k=1) means:
#   * each pair of columns is listed once, not twice as (A, B) and (B, A);
#   * self-correlations are excluded by position rather than by value, so two
#     distinct columns that are perfectly correlated are still reported.
print('Top Positive Correlations:')
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
# where() blanks everything outside the mask; dropna() removes those blanks
# (and any undefined correlations) regardless of how stack() treats NaN.
corr_pairs = correlation_matrix.where(upper_triangle).stack().dropna()
sorted_pairs = corr_pairs.sort_values(ascending=False)
print(sorted_pairs.head(10))

## 6. Temporal Patterns Analysis

In [None]:
# Derive calendar features from the activity date.
# The columns are added to daily_activity itself because later cells
# (user statistics, CSV export) read the same frame.
activity_dates = daily_activity['ActivityDate']
daily_activity['DayOfWeek'] = activity_dates.dt.dayofweek
daily_activity['DayName'] = activity_dates.dt.day_name()
# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday
daily_activity['IsWeekend'] = daily_activity['DayOfWeek'].isin([5, 6])

# Mean of each headline metric per weekday name, rounded to 2 decimals and
# reordered from alphabetical to calendar order (Monday first)
weekday_metrics = ['TotalSteps', 'Calories', 'TotalDistance', 'SedentaryMinutes']
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_avg = (
    daily_activity
    .groupby('DayName')[weekday_metrics]
    .mean()
    .round(2)
    .reindex(day_order)
)

print('Average Activity by Day of Week:')
day_avg

In [None]:
# Visualize weekly patterns: one bar chart per metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (day_avg column, bar colour, panel title, y-axis label), in row-major panel order
weekly_panels = [
    ('TotalSteps', 'skyblue', 'Average Steps by Day of Week', 'Steps'),
    ('Calories', 'lightgreen', 'Average Calories by Day of Week', 'Calories'),
    ('TotalDistance', 'lightcoral', 'Average Distance by Day of Week', 'Distance (km)'),
    ('SedentaryMinutes', 'plum', 'Average Sedentary Minutes by Day of Week', 'Minutes'),
]

# axes.flat walks the grid as [0, 0], [0, 1], [1, 0], [1, 1]
for ax, (metric, colour, title, y_label) in zip(axes.flat, weekly_panels):
    day_avg[metric].plot(kind='bar', ax=ax, color=colour)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(y_label)
    # Tilt the weekday names on the x axis
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. User Behavior Segmentation

In [None]:
# Per-user aggregates.
# Steps, calories and distance get mean/std/sum; the four activity-minute
# columns get the mean only; counting ActivityDate gives the number of rows
# recorded per user.
user_aggregations = {
    metric: ['mean', 'std', 'sum']
    for metric in ('TotalSteps', 'Calories', 'TotalDistance')
}
user_aggregations.update({
    'VeryActiveMinutes': 'mean',
    'FairlyActiveMinutes': 'mean',
    'LightlyActiveMinutes': 'mean',
    'SedentaryMinutes': 'mean',
    'ActivityDate': 'count',
})
user_stats = daily_activity.groupby('Id').agg(user_aggregations).round(2)

# Flatten the (column, statistic) header into names such as 'TotalSteps_mean',
# then give the row-count column a friendlier name
user_stats.columns = ['_'.join(name_parts).strip() for name_parts in user_stats.columns]
user_stats = user_stats.rename(columns={'ActivityDate_count': 'days_tracked'})

print(f'Total Users: {len(user_stats)}')
print(f'User Statistics Summary:')
user_stats.head()

In [None]:
# Classify users by activity level
# Each user's mean daily step count is binned into four ordered bands.
# right=False makes the intervals closed on the left:
#   [0, 5000), [5000, 7500), [7500, 10000), [10000, inf)
# so a user averaging exactly 0 steps is labelled 'Sedentary' instead of
# falling outside the first bin (NaN), and a user averaging exactly 10,000
# steps is counted as 'Active'.
user_stats['activity_level'] = pd.cut(
    user_stats['TotalSteps_mean'],
    bins=[0, 5000, 7500, 10000, float('inf')],
    labels=['Sedentary', 'Low Active', 'Somewhat Active', 'Active'],
    right=False
)

# value_counts() orders by frequency; sort_index() restores the ordinal
# category order (Sedentary -> Active) for both the printout and the chart.
activity_dist = user_stats['activity_level'].value_counts().sort_index()
print('User Distribution by Activity Level:')
print(activity_dist)

# Visualize: one bar per activity level, showing the number of users in it
plt.figure(figsize=(10, 6))
activity_dist.plot(kind='bar', color='teal')
plt.title('Distribution of Users by Activity Level', fontsize=14, fontweight='bold')
plt.xlabel('Activity Level')
plt.ylabel('Number of Users')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## 8. Key Insights and Recommendations

### Key Findings:
1. **Activity Patterns**: Users show varying activity levels with distinct patterns
2. **Temporal Trends**: Activity varies by day of week
3. **Strong Correlations**: Steps, distance, and calories are highly correlated
4. **User Segments**: Users can be classified into distinct activity levels

### Recommendations for ML Models:
1. Use the temporal features derived in Section 6 (`DayOfWeek`, `DayName`, `IsWeekend`) as model inputs
2. Create user-level aggregated features for personalization
3. Engineer rolling averages and lag features for time-series prediction
4. Build activity classification models based on user segments
5. Develop anomaly detection for unusual activity patterns

In [None]:
# Persist both tables for the modelling stage.
# The daily table is written without its index; the per-user table keeps its
# index (the grouping key 'Id') as the first CSV column.
daily_activity_path = '../data/processed_daily_activity.csv'
user_stats_path = '../data/user_statistics.csv'

daily_activity.to_csv(daily_activity_path, index=False)
user_stats.to_csv(user_stats_path)
print('Processed data saved!')