# **Pandas Profiling**


---



Pandas Profiling streamlines the process by generating a comprehensive report of a dataset, minimizing the time to explore the large datasets.<br>

It is used to generate a complete and exhaustive report for the dataset, with many features and customizations in the generated report. This report includes various pieces of information such as dataset statistics, distribution of values, missing values, memory usage, etc., which are very useful for exploring and analyzing data efficiently. <br>

Pandas Profiling also helps a lot in Exploratory Data Analysis (EDA). EDA is used to understand the underlying structure of data, detect patterns, and generate insights in a visual format.

In [1]:
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
[2K     [32m/[0m [32m17.8 MB[0m [31m24.5 MB/s[0m [33m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting visions[type_image_path]==0.7.5 (from ydata-profiling==0.0.dev0)
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.7/102.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata-profiling==0.0.dev0)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling==0.0.dev0)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (686 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m686.1/686.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting 

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import re
import pandas_profiling
from pandas_profiling import ProfileReport
from scipy import stats

  import pandas_profiling


In [3]:
# Loading dataset
first_df = pd.read_csv('/content/daily_stats_data.csv')
df = first_df.copy()

In [4]:
# Descriptive Analysis
df.describe()

Unnamed: 0,id,daily_average_heartrate,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,...,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories,total_minutes_asleep,total_sleep_records,total_time_in_bed,bmi,weight_kg,weight_pounds
count,940.0,334.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,...,940.0,940.0,940.0,940.0,410.0,410.0,410.0,67.0,67.0,67.0
mean,4855407000.0,75.974042,7637.910638,5.489702,5.475351,0.108128,1.502681,0.567543,3.340819,0.001606,...,13.564894,192.812766,991.210638,2303.609574,419.173171,1.119512,458.482927,25.185224,72.035821,158.810746
std,2424805000.0,10.340623,5087.150742,3.924606,3.907276,0.619725,2.658941,0.88358,2.040655,0.007346,...,19.987404,109.1747,301.267437,718.166862,118.635918,0.346636,127.45514,3.066962,13.923206,30.695989
min,1503960000.0,57.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,58.0,1.0,61.0,21.45,52.6,115.96
25%,2320127000.0,67.5125,3789.75,2.62,2.62,0.0,0.0,0.0,1.945,0.0,...,0.0,127.0,729.75,1828.5,361.0,1.0,403.75,23.96,61.4,135.36
50%,4445115000.0,75.22,7405.5,5.245,5.245,0.0,0.21,0.24,3.365,0.0,...,6.0,199.0,1057.5,2134.0,432.5,1.0,463.0,24.39,62.5,137.79
75%,6962181000.0,81.9325,10727.0,7.7125,7.71,0.0,2.0525,0.8,4.7825,0.0,...,19.0,264.0,1229.5,2793.25,490.0,1.0,526.0,25.56,85.05,187.5
max,8877689000.0,107.72,36019.0,28.03,28.03,4.94,21.92,6.48,10.71,0.11,...,143.0,518.0,1440.0,4900.0,796.0,3.0,961.0,47.54,133.5,294.32


In [5]:
top_users_perday = pd.DataFrame(columns=['id','activity_date', 'total_steps'])
for date in df['activity_date'].unique():
  daily_data = df[df['activity_date'] == date]
  topusers = daily_data.nlargest(5, 'total_steps')[['activity_date', 'total_steps','id']]
top_users_perday = top_users_perday.append(topusers, ignore_index=True)
print(top_users_perday)

           id activity_date total_steps
0  2022484408    2016-05-12        9117
1  2026352035    2016-05-12        8891
2  8877689391    2016-05-12        8064
3  2873212765    2016-05-12        7566
4  4558609924    2016-05-12        6307


  top_users_perday = top_users_perday.append(topusers, ignore_index=True)


In [6]:
# Highly active Users

active_users = df.sort_values(by=['very_active_minutes', 'fairly_active_minutes'], ascending=False).head(8)
[['id', 'activity_date','very_active_minutes', 'fairly_active_minutes']]
active_users

Unnamed: 0,id,activity_date,daily_average_heartrate,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,...,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories,total_minutes_asleep,total_sleep_records,total_time_in_bed,bmi,weight_kg,weight_pounds
579,5577150313,2016-04-24,69.57,15764,11.78,11.78,0.0,7.65,2.15,1.98,...,65,141,425,4392,543.0,1.0,615.0,,,
585,5577150313,2016-04-30,68.61,12363,9.24,9.24,0.0,5.83,0.79,2.61,...,45,163,621,4501,412.0,1.0,433.0,,,
572,5577150313,2016-04-17,67.47,12231,9.14,9.14,0.0,5.98,0.83,2.32,...,37,159,525,4552,549.0,1.0,583.0,28.0,90.7,199.96
586,5577150313,2016-05-01,71.21,13368,9.99,9.99,0.0,5.31,1.44,3.24,...,72,178,499,4546,379.0,1.0,398.0,,,
50,1624580081,2016-05-01,,36019,28.03,28.03,0.0,21.92,4.19,1.91,...,63,171,1020,2690,,,,,,
571,5577150313,2016-04-16,65.83,14269,10.66,10.66,0.0,6.64,1.28,2.73,...,56,158,472,4274,406.0,1.0,445.0,,,
827,8378563200,2016-04-21,,15148,12.01,12.01,2.25,6.9,0.82,4.29,...,16,145,677,4236,396.0,1.0,417.0,,,
771,8053475328,2016-04-15,,20669,16.24,16.24,0.0,13.26,0.39,2.59,...,8,158,1142,3410,,,,,,


In [7]:
least_steps_per_day = pd.DataFrame(columns=['id', 'activity_date', 'total_steps'])
for date in df['activity_date'].unique():
  daily_data = df[df['activity_date'] == date]
  non_zero_data = daily_data[daily_data['total_steps'] > 0]
  least_steps = non_zero_data.groupby('activity_date').apply(lambda x: x.nsmallest(5, 'total_steps'))[['id','activity_date', 'total_steps']]
least_steps_per_day

Unnamed: 0,id,activity_date,total_steps


In [8]:
abnormal_heart_rate_data = df[(df['daily_average_heartrate'] < 60) | (df['daily_average_heartrate'] > 100)]
result = abnormal_heart_rate_data[['id', 'activity_date', 'daily_average_heartrate']]
print(result)

             id activity_date  daily_average_heartrate
431  4388161847    2016-05-01                    58.37
435  4388161847    2016-05-05                    59.21
440  4388161847    2016-05-10                    59.23
441  4388161847    2016-05-11                    57.87
442  4388161847    2016-05-12                    58.70
500  4558609924    2016-05-08                   100.43
541  5553957443    2016-04-17                    59.20
544  5553957443    2016-04-20                    59.46
568  5577150313    2016-04-13                    58.11
658  6775888955    2016-04-16                   107.72
659  6775888955    2016-04-17                   101.45
661  6775888955    2016-04-19                   106.40
662  6775888955    2016-04-20                   104.67
666  6775888955    2016-04-24                   107.09
673  6775888955    2016-05-01                   105.02


In [9]:
def categorize_activity(row):
  if row['very_active_distance'] > 0 or row['very_active_minutes'] > 30:
    return 'Very Active'
  elif row['moderately_active_distance'] > 0 or row['fairly_active_minutes'] > 30:
      return 'Moderately Active'
  elif row['light_active_distance'] > 0 or row['lightly_active_minutes'] > 30:
        return 'Lightly Active'
  elif row['sedentary_active_distance'] > 0 or row['sedentary_minutes'] > 30:
        return 'Sedentary Active'
  else:
    return 'Inactive'
df['activity_level'] = df.apply(categorize_activity, axis=1)
activity_level_summary = pd.DataFrame(columns=['id', 'activity_level', 'total_steps'])
for user_id in df['id'].unique():
  user_data = df[df['id'] == user_id]
  top_user_per_level = user_data.groupby('activity_level').apply(lambda x: x.nlargest(1, 'total_steps'))[['id', 'activity_level', 'total_steps']]
top_user_per_level

Unnamed: 0_level_0,Unnamed: 1_level_0,id,activity_level,total_steps
activity_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lightly Active,929,8877689391,Lightly Active,4790
Very Active,913,8877689391,Very Active,29326


In [10]:
average_bmi_by_user = df.groupby('id')['bmi'].mean()
mode_activity_by_user = df.groupby('id')['activity_level'].apply(lambda x: x.mode().iat[0])
final_df = pd.DataFrame({'average_bmi': average_bmi_by_user, 'mode_activity_level': mode_activity_by_user})
final_df.reset_index(inplace=True)
final_df

Unnamed: 0,id,average_bmi,mode_activity_level
0,1503960366,22.65,Very Active
1,1624580081,,Lightly Active
2,1644430081,,Very Active
3,1844505072,,Lightly Active
4,1927972279,47.54,Sedentary Active
5,2022484408,,Very Active
6,2026352035,,Lightly Active
7,2320127002,,Lightly Active
8,2347167796,,Very Active
9,2873212765,21.57,Very Active


In [11]:
users_sleep = df[df['total_minutes_asleep'] > 0]
low_mins_sleep = users_sleep.sort_values(by='total_minutes_asleep').head(4)[['id', 'total_minutes_asleep']]
high_mins_sleep = users_sleep.sort_values(by='total_minutes_asleep', ascending=False).head(4)[['id', 'total_minutes_asleep']]
print("User with the Lowest Minutes Slept: ", low_mins_sleep)
print("\nUser with the Highest Minutes Slept: ", high_mins_sleep)

User with the Lowest Minutes Slept:               id  total_minutes_asleep
730  7007744171                  58.0
390  4319703577                  59.0
227  2320127002                  61.0
439  4388161847                  62.0

User with the Highest Minutes Slept:               id  total_minutes_asleep
82   1644430081                 796.0
554  5553957443                 775.0
123  1927972279                 750.0
110  1844505072                 722.0


In [12]:
profile = ProfileReport(df, title="Daily Stats", html={'style' : {'full_width':True}})
profile.to_file(output_file="Daily Activity.html")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]