# Setup

In [2]:
import pyspark.pandas as ps
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Main

## Import data

In [17]:
# Read the three daily CSV exports into pandas-on-Spark DataFrames.
# The generator is unpacked in order, so each path maps to the name in the
# same position on the left-hand side.
df_activity, df_intensity, df_sleepday = (
    ps.read_csv(csv_path)
    for csv_path in (
        '../data/dailyActivity_merged.csv',
        '../data/dailyIntensities_merged.csv',
        '../data/sleepDay_merged.csv',
    )
)



## Inspect data

### Checking head

In [18]:
# Preview the first five rows of the daily-activity data.
df_activity.head(n=5)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [19]:
# Preview the first five rows of the daily-intensity data.
df_intensity.head(n=5)

Unnamed: 0,Id,ActivityDay,SedentaryMinutes,LightlyActiveMinutes,FairlyActiveMinutes,VeryActiveMinutes,SedentaryActiveDistance,LightActiveDistance,ModeratelyActiveDistance,VeryActiveDistance
0,1503960366,4/12/2016,728,328,13,25,0.0,6.06,0.55,1.88
1,1503960366,4/13/2016,776,217,19,21,0.0,4.71,0.69,1.57
2,1503960366,4/14/2016,1218,181,11,30,0.0,3.91,0.4,2.44
3,1503960366,4/15/2016,726,209,34,29,0.0,2.83,1.26,2.14
4,1503960366,4/16/2016,773,221,10,36,0.0,5.04,0.41,2.71


In [20]:
# Preview the first five rows of the sleep-day data.
df_sleepday.head(n=5)

Unnamed: 0,Id,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
0,1503960366,4/12/2016 12:00:00 AM,1,327,346
1,1503960366,4/13/2016 12:00:00 AM,2,384,407
2,1503960366,4/15/2016 12:00:00 AM,1,412,442
3,1503960366,4/16/2016 12:00:00 AM,2,340,367
4,1503960366,4/17/2016 12:00:00 AM,1,700,712


### Checking schema

The `pyspark.pandas.DataFrame`, according to the [docs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.html), "corresponds to pandas DataFrame logically. This holds Spark DataFrame internally."

So, to check the schema, we first obtain the underlying Spark DataFrame with the method [`pyspark.pandas.DataFrame.spark.frame`](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.spark.frame.html?highlight=frame) and then read its `schema` attribute.

In [21]:
# Spark schema of the frame held internally: column names, Spark types and nullability.
df_activity.spark.frame().schema

StructType([StructField('Id', LongType(), True), StructField('ActivityDate', StringType(), True), StructField('TotalSteps', IntegerType(), True), StructField('TotalDistance', DoubleType(), True), StructField('TrackerDistance', DoubleType(), True), StructField('LoggedActivitiesDistance', DoubleType(), True), StructField('VeryActiveDistance', DoubleType(), True), StructField('ModeratelyActiveDistance', DoubleType(), True), StructField('LightActiveDistance', DoubleType(), True), StructField('SedentaryActiveDistance', DoubleType(), True), StructField('VeryActiveMinutes', IntegerType(), True), StructField('FairlyActiveMinutes', IntegerType(), True), StructField('LightlyActiveMinutes', IntegerType(), True), StructField('SedentaryMinutes', IntegerType(), True), StructField('Calories', IntegerType(), True)])

In [22]:
# Spark schema of the frame held internally: column names, Spark types and nullability.
df_intensity.spark.frame().schema

StructType([StructField('Id', LongType(), True), StructField('ActivityDay', StringType(), True), StructField('SedentaryMinutes', IntegerType(), True), StructField('LightlyActiveMinutes', IntegerType(), True), StructField('FairlyActiveMinutes', IntegerType(), True), StructField('VeryActiveMinutes', IntegerType(), True), StructField('SedentaryActiveDistance', DoubleType(), True), StructField('LightActiveDistance', DoubleType(), True), StructField('ModeratelyActiveDistance', DoubleType(), True), StructField('VeryActiveDistance', DoubleType(), True)])

In [23]:
# Spark schema of the frame held internally: column names, Spark types and nullability.
df_sleepday.spark.frame().schema

StructType([StructField('Id', LongType(), True), StructField('SleepDay', StringType(), True), StructField('TotalSleepRecords', IntegerType(), True), StructField('TotalMinutesAsleep', IntegerType(), True), StructField('TotalTimeInBed', IntegerType(), True)])

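We could also check the schema using `.dtypes`, although it carries less information: it shows only the pandas-style dtype of each column, not whether the column is nullable.

`.info()` may also be used; in addition to the dtypes, it reports the number of entries and the non-null count of each column.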

In [24]:
# pandas-style dtype of each column (string columns show up as `object`).
df_activity.dtypes

Id                            int64
ActivityDate                 object
TotalSteps                    int32
TotalDistance               float64
TrackerDistance             float64
LoggedActivitiesDistance    float64
VeryActiveDistance          float64
ModeratelyActiveDistance    float64
LightActiveDistance         float64
SedentaryActiveDistance     float64
VeryActiveMinutes             int32
FairlyActiveMinutes           int32
LightlyActiveMinutes          int32
SedentaryMinutes              int32
Calories                      int32
dtype: object

In [25]:
# Summary of the frame: index range, per-column non-null counts and dtypes.
df_activity.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int32  
 3   TotalDistance             940 non-null    float64
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int32  
 11  FairlyActiveMinutes       940 non-null    int32  
 12  LightlyActiveMinutes      940 non-null    int32  
 13  SedentaryMinutes          940 non-null    int32  
 14  Calorie

In [28]:
# Summary of the frame: index range, per-column non-null counts and dtypes.
df_intensity.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 940 entries, 0 to 939
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDay               940 non-null    object 
 2   SedentaryMinutes          940 non-null    int32  
 3   LightlyActiveMinutes      940 non-null    int32  
 4   FairlyActiveMinutes       940 non-null    int32  
 5   VeryActiveMinutes         940 non-null    int32  
 6   SedentaryActiveDistance   940 non-null    float64
 7   LightActiveDistance       940 non-null    float64
 8   ModeratelyActiveDistance  940 non-null    float64
 9   VeryActiveDistance        940 non-null    float64
dtypes: float64(4), int32(4), int64(1), object(1)

In [29]:
# Summary of the frame: index range, per-column non-null counts and dtypes.
df_sleepday.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 413 entries, 0 to 412
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Id                  413 non-null    int64 
 1   SleepDay            413 non-null    object
 2   TotalSleepRecords   413 non-null    int32 
 3   TotalMinutesAsleep  413 non-null    int32 
 4   TotalTimeInBed      413 non-null    int32 
dtypes: int32(3), int64(1), object(1)

## Treat data

The only treatment needed was for the date columns in each dataframe: they were read as strings, so we parse them into datetimes.

In [38]:
# ActivityDate was read as a string such as '4/12/2016' (month/day/year);
# overwrite the column in place with the parsed datetimes.
df_activity['ActivityDate'] = ps.to_datetime(df_activity['ActivityDate'], format='%m/%d/%Y')

In [39]:
# ActivityDay was read as a string such as '4/12/2016' (month/day/year);
# overwrite the column in place with the parsed datetimes.
df_intensity['ActivityDay'] = ps.to_datetime(df_intensity['ActivityDay'], format='%m/%d/%Y')

In [40]:
# SleepDay strings also carry a 12-hour clock time, e.g. '4/12/2016 12:00:00 AM',
# hence the %I (12-hour) and %p (AM/PM) directives in the format.
df_sleepday['SleepDay'] = ps.to_datetime(df_sleepday['SleepDay'], format='%m/%d/%Y %I:%M:%S %p')

## Analyze