In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output
import joblib

In [6]:
%pip install  openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load datasets
# Both workbooks are looked up relative to the notebook's working directory.
sleep_workbook = 'Sleep Dataset.xlsm'
usage_workbook = 'Social Media Usage - Train.xlsm'

try:
    health_data = pd.read_excel(sleep_workbook)
    social_media_data = pd.read_excel(usage_workbook)
except FileNotFoundError:
    # Give a readable hint, then re-raise so the cell still fails loudly.
    print("Error: One or both Excel files not found. Please check file names and paths.")
    raise

# Print available columns for debugging
for frame_label, frame in (('social_media_data', social_media_data),
                           ('health_data', health_data)):
    print(f"Columns in {frame_label}:", frame.columns.tolist())

Columns in social_media_data: ['User_ID', 'Age', 'Gender', 'Platform', 'Daily_Usage_Time (minutes)', 'Posts_Per_Day', 'Likes_Received_Per_Day', 'Comments_Received_Per_Day', 'Messages_Sent_Per_Day', 'Dominant_Emotion']
Columns in health_data: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps', 'Sleep Disorder']


In [7]:
# Data Preprocessing
# A missing 'Sleep Disorder' entry becomes the explicit category 'None'.
health_data['Sleep Disorder'] = health_data['Sleep Disorder'].fillna('None')
# Fold the 'Normal Weight' spelling into 'Normal'. regex=False makes this a plain literal
# substitution instead of relying on the pandas-version-dependent default.
health_data['BMI Category'] = health_data['BMI Category'].str.replace('Normal Weight', 'Normal', regex=False)
# .copy() yields an independent frame, so the column assignments that follow cannot
# trigger SettingWithCopyWarning on the result of dropna().
social_media_data = social_media_data.dropna().copy()
social_media_data['Dominant_Emotion'] = social_media_data['Dominant_Emotion'].str.strip()

# Feature Engineering
# Health_Score is a weighted sum of sleep quality (0.3), inverted stress (0.2),
# physical activity (0.2) and daily steps divided by 10000 (0.3).
# NOTE(review): the four terms are not rescaled to a common range before weighting, so a
# column with a larger numeric range will dominate the score - confirm this is intended.
health_data['Health_Score'] = (health_data['Quality of Sleep'] * 0.3 +
                               (10 - health_data['Stress Level']) * 0.2 +
                               health_data['Physical Activity Level'] * 0.2 +
                               health_data['Daily Steps'] / 10000 * 0.3)

def categorize_usage(minutes, low_max=60, moderate_max=120):
    """Bucket a usage time into 'Low', 'Moderate' or 'High'.

    Parameters
    ----------
    minutes : number
        Usage time to classify.
    low_max : number, default 60
        Values strictly below this are 'Low'.
    moderate_max : number, default 120
        Values from ``low_max`` up to, but excluding, this are 'Moderate'.

    Returns
    -------
    str
        'Low', 'Moderate' or 'High'.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through to 'High'.
    """
    if minutes < low_max:
        return 'Low'
    if minutes < moderate_max:
        return 'Moderate'
    return 'High'

# Find appropriate usage time column
# Candidate header spellings, checked in order; the first one present in the frame wins.
possible_time_columns = [
    'Daily Minutes',
    'Daily_Usage',
    'Minutes_Per_Day',
    'Time_Spent',
    'Usage_Minutes',
    'Daily_Usage_Time (minutes)',
]
usage_column = next(
    (candidate for candidate in possible_time_columns
     if candidate in social_media_data.columns),
    None,
)

# Guard clause: without a usage column nothing below can run.
if usage_column is None:
    raise ValueError("No time usage column found. Available columns: " + str(social_media_data.columns.tolist()))

print(f"Using column '{usage_column}' for usage categorization")
social_media_data['Usage_Category'] = social_media_data[usage_column].apply(categorize_usage)

# Merge datasets
# Inner join: only rows whose 'Person ID' matches a 'User_ID' are kept.
merged_data = health_data.merge(
    social_media_data,
    how='inner',
    left_on='Person ID',
    right_on='User_ID',
)

# Debug: Print columns in merged_data
print("Columns in merged_data:", merged_data.columns.tolist())

Using column 'Daily_Usage_Time (minutes)' for usage categorization
Columns in merged_data: ['Person ID', 'Gender_x', 'Age_x', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps', 'Sleep Disorder', 'Health_Score', 'User_ID', 'Age_y', 'Gender_y', 'Platform', 'Daily_Usage_Time (minutes)', 'Posts_Per_Day', 'Likes_Received_Per_Day', 'Comments_Received_Per_Day', 'Messages_Sent_Per_Day', 'Dominant_Emotion', 'Usage_Category']


In [8]:
# Platform Prediction Model
# 'Age' and 'Gender' exist in both source frames, so the merge suffixes them; the '_x'
# versions come from health_data (the left frame). Resolve the actual names up front so
# age_col / gender_col are always defined, whichever spelling merged_data carries.
# (Previously they were only assigned inside an `except KeyError` branch, so the cell
# raised NameError whenever the unsuffixed columns were present.)
age_col = 'Age' if 'Age' in merged_data.columns else 'Age_x'
gender_col = 'Gender' if 'Gender' in merged_data.columns else 'Gender_x'
X_platform = merged_data[[age_col, gender_col, 'Occupation', 'Health_Score']].copy()

y_platform = merged_data['Platform']

# One encoder per categorical column; le_gender and le_platform are reused by later cells.
le_gender = LabelEncoder()
le_occupation = LabelEncoder()
le_platform = LabelEncoder()

X_platform[gender_col] = le_gender.fit_transform(X_platform[gender_col])
X_platform['Occupation'] = le_occupation.fit_transform(X_platform['Occupation'])
y_platform = le_platform.fit_transform(y_platform)

X_train, X_test, y_train, y_test = train_test_split(X_platform, y_platform, test_size=0.2, random_state=42)

numeric_features = [age_col, 'Health_Score']
categorical_features = [gender_col, 'Occupation']

# Numeric columns are standardised; the already integer-encoded categoricals pass through.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)
    ])

platform_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

platform_model.fit(X_train, y_train)

y_pred = platform_model.predict(X_test)
print("Platform Prediction Accuracy:", accuracy_score(y_test, y_pred))
# Passing `labels` keeps the report aligned with target_names even if some platform is
# absent from the test split; zero_division=0 reports 0.00 for never-predicted classes
# without emitting a warning.
platform_labels = list(range(len(le_platform.classes_)))
print(classification_report(y_test, y_pred, labels=platform_labels,
                            target_names=le_platform.classes_, zero_division=0))

# Persist the feature encoders as well: platform_model expects the integer codes they
# produce, so it cannot be used on raw gender/occupation strings without them.
joblib.dump(le_gender, 'gender_encoder.joblib')
joblib.dump(le_occupation, 'occupation_encoder.joblib')
joblib.dump(platform_model, 'platform_predictor.joblib')
joblib.dump(le_platform, 'platform_encoder.joblib')

Platform Prediction Accuracy: 0.16
              precision    recall  f1-score   support

    Facebook       0.00      0.00      0.00        17
   Instagram       0.35      0.29      0.32        24
    LinkedIn       0.00      0.00      0.00         8
    Snapchat       0.25      0.25      0.25         4
    Telegram       0.00      0.00      0.00         4
     Twitter       0.10      0.25      0.15        12
    Whatsapp       0.14      0.17      0.15         6

    accuracy                           0.16        75
   macro avg       0.12      0.14      0.12        75
weighted avg       0.15      0.16      0.15        75



['platform_encoder.joblib']

In [9]:
# Health Impact Prediction Model
# Resolve the (possibly merge-suffixed) age/gender column names inside this cell, so it
# does not depend on age_col / gender_col happening to be left over from an earlier cell.
age_col = 'Age' if 'Age' in merged_data.columns else 'Age_x'
gender_col = 'Gender' if 'Gender' in merged_data.columns else 'Gender_x'
X_health = merged_data[[age_col, gender_col, 'Platform', usage_column, 'Usage_Category']].copy()

# Binary target: 1 when Health_Score is strictly above the median of merged_data, else 0.
y_health = (merged_data['Health_Score'] > merged_data['Health_Score'].median()).astype(int)

# le_gender and le_platform were fitted in the platform-model cell; reuse them here so
# both models share the same integer codes.
X_health[gender_col] = le_gender.transform(X_health[gender_col])
X_health['Platform'] = le_platform.transform(X_health['Platform'])
le_usage = LabelEncoder()
X_health['Usage_Category'] = le_usage.fit_transform(X_health['Usage_Category'])

X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_health, y_health, test_size=0.2, random_state=42)

numeric_features_h = [age_col, usage_column]
categorical_features_h = [gender_col, 'Platform', 'Usage_Category']

# Numeric columns are standardised; the already integer-encoded categoricals pass through.
preprocessor_h = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_h),
        ('cat', 'passthrough', categorical_features_h)
    ])

health_model = Pipeline([
    ('preprocessor', preprocessor_h),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

health_model.fit(X_train_h, y_train_h)

y_pred_h = health_model.predict(X_test_h)
print("\nHealth Impact Prediction Accuracy:", accuracy_score(y_test_h, y_pred_h))

joblib.dump(health_model, 'health_predictor.joblib')
joblib.dump(le_usage, 'usage_encoder.joblib')


Health Impact Prediction Accuracy: 0.8133333333333334


['usage_encoder.joblib']