## 1.1 Install Dependencies

In [1]:
%pip install tensorflow opencv-python pandas numpy matplotlib seaborn scikit-learn
%pip install mtcnn face-recognition pillow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Setup plotting
plt.style.use('default')
%matplotlib inline

# Create directories
import os
os.makedirs('results', exist_ok=True)
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)

print("✅ Setup complete!")
print("📁 Created directories: results/, data/, models/")
print("📦 Installed all required packages")
print("\n🎯 Next: Upload your Excel file to data/ folder")

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.8 kB 487.6 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 538.3 kB/s eta 0:00:00
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting absl-py>=1.0.0 (from tensor


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Taufik\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for dlib (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [41 lines of output]
      running bdist_wheel
      running build
      running build_ext
      
      
                         CMake is not installed on your system!
      
          Or it is possible some broken copy of cmake is installed on your system.
          It is unfortunately very common for python package managers to include
          broken copies of cmake.  So if the error above this refers to some file
          path to a cmake file inside a python or anaconda or miniconda path then you
          should delete that broken copy of cmake from your computer.
      
          Instead, please get an official copy of cmake from one of these known good
          sources of an official cmake:
              - cmake.org (this is how windows users should get cmake)
              - apt install cmake (for Ubuntu or Debian based systems)
             

Collecting mtcnn
  Using cached mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting face-recognition
  Using cached face_recognition-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting lz4>=4.3.3 (from mtcnn)
  Downloading lz4-4.4.4-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting face-recognition-models>=0.3.0 (from face-recognition)
  Using cached face_recognition_models-0.3.0.tar.gz (100.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting Click>=6.0 (from face-recognition)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting dlib>=19.7 (from face-recognition)
  Using cached dlib-20.0.0.tar.gz (3.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (py

## 1.2 Load & Explore Data

In [2]:
print("📊 LOADING DATA...")

# Workbook that holds the recorded emotion values
excel_path = "data/emotion_data.xlsx"  # adjust this to the name of your file
df = pd.read_excel(excel_path)

print("✅ Data loaded successfully!")
print(f"📏 Shape: {df.shape}")
print(f"👥 Unique users: {df['user_id'].nunique()}")

# Peek at the top of the table
print("\n📋 First 5 rows:")
display(df.head())

# Column inventory
print(f"\n📊 Columns: {list(df.columns)}")

# Number of missing cells in every column
print("\n❓ Missing values:")
print(df.isna().sum())

# The emotion columns the rest of the analysis works with
emotion_cols = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgusted', 'surprised']
print(f"\n😊 Emotion columns found: {[col for col in emotion_cols if col in df.columns]}")

# Summarise the emotion columns only when none of them is absent
if any(col not in df.columns for col in emotion_cols):
    print("❌ Some emotion columns are missing!")
    print(f"Available columns: {df.columns.tolist()}")
else:
    print("\n📈 Emotion Statistics:")
    display(df[emotion_cols].describe())

📊 LOADING DATA...


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

## 1.3 Engagement Analysis

In [None]:
print("🧮 CALCULATING ENGAGEMENT SCORES...")

# Weights taken from the paper (adapted to the Face API emotion labels)
emotion_weights = dict(
    angry=-0.10,
    disgusted=-0.05,
    fearful=-0.20,
    sad=-0.05,
    happy=0.10,
    surprised=0.15,
    neutral=0.30,
)


def calculate_engagement_score(row):
    """Return the weighted sum of the emotion values present in ``row``.

    Every key of ``emotion_weights`` that is found in ``row`` contributes
    ``weight * row[emotion]``; emotions missing from ``row`` are skipped.
    Returns 0 when none of the emotions is present.
    """
    total = 0
    for emotion in emotion_weights:
        if emotion not in row:
            continue
        # Accumulate in dictionary order, one term at a time
        total += emotion_weights[emotion] * row[emotion]
    return total

# Score every record: each row is handed to calculate_engagement_score in turn
df['engagement_score'] = df.apply(calculate_engagement_score, axis='columns')

# Map an engagement score to a level using the thresholds from the paper
def classify_engagement(score):
    """Return the engagement label for ``score``.

    Scores above 0.14 are "Highly Engaged", scores from 0.10 up to and
    including 0.14 are "Engaged", and everything else is "Disengaged".
    """
    if score > 0.14:
        return "Highly Engaged"
    if 0.10 <= score:
        # The upper bound is implied: anything above 0.14 has already returned
        return "Engaged"
    return "Disengaged"

# Label every record with its engagement level
df['engagement_level'] = df['engagement_score'].apply(classify_engagement)

print("✅ Engagement scores calculated!")
print(f"📊 Score range: {df['engagement_score'].min():.3f} to {df['engagement_score'].max():.3f}")

# Show how many records fall into each level, with their share of all records
engagement_dist = df['engagement_level'].value_counts()
print("\n🎯 Engagement Level Distribution:")
for level, count in engagement_dist.items():
    percentage = (count / len(df)) * 100
    print(f"   {level}: {count} ({percentage:.1f}%)")

# Preview results. Only columns that actually exist are selected, so a dataset
# without e.g. 'page' or 'timestamp' no longer raises KeyError here.
print("\n📋 Sample Results:")
preview_cols = ['user_id', 'timestamp', 'engagement_score', 'engagement_level', 'page']
display(df[[col for col in preview_cols if col in df.columns]].head(10))

## 1.4 Visualizations

In [None]:
print("🎨 CREATING VISUALIZATIONS...")


def _annotate_bars(ax, bars, offset):
    """Write each bar's height (3 decimals) just beyond the end of the bar.

    Bars with a non-negative height are labelled above their top edge; bars
    with a negative height are labelled below their lower end so the text
    does not overlap the bar.
    """
    for bar in bars:
        height = bar.get_height()
        if height >= 0:
            y, va = height + offset, 'bottom'
        else:
            y, va = height - offset, 'top'
        ax.text(bar.get_x() + bar.get_width() / 2., y,
                f'{height:.3f}', ha='center', va=va, fontsize=9)


# One 2x3 dashboard; every panel below draws on its own Axes
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Engagement Level Pie Chart
engagement_counts = df['engagement_level'].value_counts()
axes[0,0].pie(engagement_counts.values, labels=engagement_counts.index, autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Engagement Level Distribution', fontsize=14, fontweight='bold')

# 2. Engagement Score Histogram (dashed lines mark the 0.10 and 0.14 cut-offs)
axes[0,1].hist(df['engagement_score'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,1].axvline(x=0.10, color='orange', linestyle='--', linewidth=2, label='Engaged threshold')
axes[0,1].axvline(x=0.14, color='red', linestyle='--', linewidth=2, label='Highly Engaged threshold')
axes[0,1].set_xlabel('Engagement Score')
axes[0,1].set_ylabel('Frequency')
axes[0,1].set_title('Engagement Score Distribution')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)

# 3. Average Emotions Bar Chart
emotion_means = df[emotion_cols].mean()
bars = axes[0,2].bar(emotion_means.index, emotion_means.values, color='lightcoral', alpha=0.8)
# Rotate the category ticks the bar chart already created instead of
# overwriting the labels without fixing the tick positions first.
axes[0,2].tick_params(axis='x', rotation=45)
axes[0,2].set_title('Average Emotion Distribution')
axes[0,2].set_ylabel('Average Probability')
axes[0,2].grid(True, alpha=0.3)
_annotate_bars(axes[0,2], bars, offset=0.01)

# 4. User Engagement Heatmap (rows = emotions, columns = users in groupby order)
user_emotion_avg = df.groupby('user_id')[emotion_cols].mean()
im = axes[1,0].imshow(user_emotion_avg.T.values, cmap='viridis', aspect='auto')
axes[1,0].set_title('User-Emotion Heatmap')
axes[1,0].set_xlabel('User Index')
axes[1,0].set_ylabel('Emotions')
axes[1,0].set_yticks(range(len(emotion_cols)))
axes[1,0].set_yticklabels(emotion_cols)
fig.colorbar(im, ax=axes[1,0])

# 5. Per User Engagement Box Plot
# Build the long-form table in one pass; rows without a user id are left out.
records_with_user = df.dropna(subset=['user_id'])
box_data = pd.DataFrame({
    'User': records_with_user['user_id'].map(lambda uid: f"U{uid}"),
    'Engagement': records_with_user['engagement_score'],
})
box_data.boxplot(column='Engagement', by='User', ax=axes[1,1])
# pandas adds a figure-wide "Boxplot grouped by User" suptitle when `by` is
# used; clear it so it does not sit on top of the whole dashboard.
fig.suptitle('')
axes[1,1].set_title('Engagement Score by User')
axes[1,1].set_xlabel('User ID')
axes[1,1].set_ylabel('Engagement Score')
axes[1,1].tick_params(axis='x', rotation=45)

# 6. Challenge/Page Comparison
if 'page' in df.columns and df['page'].notna().sum() > 0:
    page_engagement = df.groupby('page')['engagement_score'].mean().sort_values(ascending=False)
    bars = axes[1,2].bar(range(len(page_engagement)), page_engagement.values, color='lightgreen', alpha=0.8)
    axes[1,2].set_xticks(range(len(page_engagement)))
    # Label each bar with the last path segment; str() first so non-string
    # page values cannot fail on .split.
    axes[1,2].set_xticklabels([str(p).split('/')[-1] for p in page_engagement.index], rotation=45)
    axes[1,2].set_title('Average Engagement by Challenge')
    axes[1,2].set_ylabel('Average Engagement Score')
    axes[1,2].grid(True, alpha=0.3)
    _annotate_bars(axes[1,2], bars, offset=0.005)
else:
    axes[1,2].text(0.5, 0.5, 'No page data available', ha='center', va='center', fontsize=12)
    axes[1,2].set_title('Challenge Analysis (N/A)')

plt.tight_layout()
plt.savefig('results/engagement_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Visualizations complete!")
print("💾 Saved to: results/engagement_analysis.png")

## 1.5 Summary Statistics

In [None]:
import json

print("📋 GENERATING SUMMARY REPORT...")
print("=" * 60)


def _dominant_level(levels):
    """Return the most frequent value in ``levels``, or 'Unknown' if there is none."""
    modes = levels.mode()
    return modes.iloc[0] if len(modes) > 0 else 'Unknown'


# Evaluate page availability once so that building and exporting the
# per-challenge summary are always driven by the same decision.
has_page_data = bool('page' in df.columns and df['page'].notna().sum() > 0)

# Basic Statistics
total_records = len(df)
unique_users = df['user_id'].nunique()
unique_pages = df['page'].nunique() if 'page' in df.columns else 0
# Avoid ZeroDivisionError when the data contains no user ids
records_per_user = total_records / unique_users if unique_users else float('nan')

print("📊 DATASET OVERVIEW:")
print(f"   📏 Total Records: {total_records:,}")
print(f"   👥 Unique Users: {unique_users}")
print(f"   📚 Unique Pages/Challenges: {unique_pages}")
print(f"   📈 Records per User: {records_per_user:.1f}")

# Engagement Statistics
engagement_stats = df['engagement_score'].describe()
print("\n🎯 ENGAGEMENT SCORE STATISTICS:")
print(f"   📊 Mean: {engagement_stats['mean']:.4f}")
print(f"   📊 Std Dev: {engagement_stats['std']:.4f}")
print(f"   📊 Min: {engagement_stats['min']:.4f}")
print(f"   📊 Max: {engagement_stats['max']:.4f}")
print(f"   📊 Median: {engagement_stats['50%']:.4f}")

# Engagement Level Distribution (share of records per level, in percent)
level_dist = df['engagement_level'].value_counts(normalize=True) * 100
print("\n🏆 ENGAGEMENT LEVEL DISTRIBUTION:")
for level, pct in level_dist.items():
    print(f"   {level}: {pct:.1f}%")

# Top Emotions, highest average first
top_emotions = df[emotion_cols].mean().sort_values(ascending=False)
print("\n😊 TOP EMOTIONS (Average Probability):")
for i, (emotion, value) in enumerate(top_emotions.items()):
    print(f"   {i+1}. {emotion}: {value:.4f}")

# Per User Summary
print("\n👥 PER USER ENGAGEMENT SUMMARY:")
user_summary = df.groupby('user_id').agg({
    'engagement_score': ['mean', 'std', 'min', 'max'],
    'engagement_level': _dominant_level
}).round(4)

# Flatten the two-level column index produced by the multi-function agg
user_summary.columns = ['Avg_Score', 'Std_Score', 'Min_Score', 'Max_Score', 'Dominant_Level']
display(user_summary.head(10))

# Per Challenge Summary (if available)
if has_page_data:
    print("\n📚 PER CHALLENGE ENGAGEMENT:")
    challenge_summary = df.groupby('page').agg({
        'engagement_score': ['mean', 'std', 'count'],
        'engagement_level': _dominant_level
    }).round(4)
    challenge_summary.columns = ['Avg_Score', 'Std_Score', 'Record_Count', 'Dominant_Level']
    display(challenge_summary)

# Correlation Analysis
print("\n🔗 EMOTION CORRELATIONS:")
correlation_matrix = df[emotion_cols].corr()
display(correlation_matrix.round(3))

# Export Results
print("\n💾 EXPORTING RESULTS...")

# Save processed data
df.to_csv('results/processed_engagement_data.csv', index=False)
print("   ✅ Processed data: results/processed_engagement_data.csv")

# Save summary statistics
summary_stats = {
    'total_records': total_records,
    'unique_users': unique_users,
    'engagement_stats': engagement_stats.to_dict(),
    'level_distribution': level_dist.to_dict(),
    'top_emotions': top_emotions.to_dict(),
    'user_summary': user_summary.to_dict(),
}

# default=str stringifies any value the json module cannot serialise itself
with open('results/summary_stats.json', 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2, default=str)
print("   ✅ Summary stats: results/summary_stats.json")

# Save user summary
user_summary.to_csv('results/user_engagement_summary.csv')
print("   ✅ User summary: results/user_engagement_summary.csv")

if has_page_data:
    challenge_summary.to_csv('results/challenge_engagement_summary.csv')
    print("   ✅ Challenge summary: results/challenge_engagement_summary.csv")

print("\n🎉 BASELINE ANALYSIS COMPLETE!")
print("📁 All results saved to 'results/' folder")
print("\n🚀 Ready for next step: CNN Training with FER2013!")

# Show final dataframe preview
print("\n📋 FINAL PROCESSED DATA PREVIEW:")
display(df.head())