# Preparación de los datos

In [3]:
# Importaciones necesarias para el correcto funcionamiento de todos los modelos y demás operaciones con los datos
import pandas as pd 
import numpy as np
import sklearn

In [4]:
# Read the heart-rate CSV export into a DataFrame and preview its first rows.
heartrate_csv_path = "Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv"
data_heartrate_seconds = pd.read_csv(heartrate_csv_path)
print(data_heartrate_seconds.head())

           Id                  Time  Value
0  2022484408  4/12/2016 7:21:00 AM     97
1  2022484408  4/12/2016 7:21:05 AM    102
2  2022484408  4/12/2016 7:21:10 AM    105
3  2022484408  4/12/2016 7:21:20 AM    103
4  2022484408  4/12/2016 7:21:25 AM    101


In [5]:
# Install a global warnings filter that ignores every FutureWarning raised
# from this point on, so deprecation notices do not clutter cell outputs.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Count the missing (NaN) values in each column of the heart-rate frame.
data_heartrate_seconds.isna().sum()

Id       0
Time     0
Value    0
dtype: int64

In [7]:
# Kept so that later cells which still build a SimpleImputer keep working.
from sklearn.impute import SimpleImputer

# Replace missing values with the mode (most frequent value) of their column.
# Only columns that actually contain NaN are processed: computing a mode over
# every column (identifiers and timestamp strings included) is expensive on a
# large frame and changes nothing when there is nothing to fill.
columns_with_nan = data_heartrate_seconds.columns[data_heartrate_seconds.isna().any()]
for column in columns_with_nan:
    # Series.mode() ignores NaN and returns the modes sorted ascending, so
    # taking the first one resolves ties by picking the smallest value.
    modes = data_heartrate_seconds[column].mode()
    if modes.empty:
        # The column is entirely NaN: there is no value to impute with.
        continue
    data_heartrate_seconds[column] = data_heartrate_seconds[column].fillna(modes.iloc[0])

# Confirm that no missing values remain after the imputation.
data_heartrate_seconds.isna().sum()

Id       0
Time     0
Value    0
dtype: int64

In [8]:
# Parse the timestamps with an explicit format (e.g. "4/12/2016 7:21:00 AM").
# Stating the format removes any month/day ambiguity and avoids the slow
# per-element format inference on a large column. Re-running the cell is safe:
# an already-parsed datetime column is returned unchanged.
data_heartrate_seconds['Time'] = pd.to_datetime(
    data_heartrate_seconds['Time'], format="%m/%d/%Y %I:%M:%S %p"
)

# Group by user Id and by one-minute time bins, then average the readings in
# each bin. 'Value' is selected explicitly so only that column is aggregated.
result_df = (
    data_heartrate_seconds
    .groupby(['Id', pd.Grouper(key='Time', freq='1Min')])['Value']
    .mean()
    .reset_index()
)

# Persist the per-minute averages next to the source data.
result_df.to_csv("Fitabase Data 4.12.16-5.12.16/heartrate_minutes_merged.csv", index=False)

In [9]:
# Load the per-minute intensity export and preview its first rows.
data_intensities_minutes = pd.read_csv("Fitabase Data 4.12.16-5.12.16/minuteIntensitiesNarrow_merged.csv")
print(data_intensities_minutes.head())

# Missing values per column before imputation. This is printed explicitly
# because a bare expression in the middle of a cell is evaluated and discarded.
print(data_intensities_minutes.isna().sum())

# Replace missing values with the mode (most frequent value) of their column.
# Only columns that actually contain NaN are processed, so identifier and
# timestamp columns are not scanned needlessly when nothing is missing.
columns_with_nan = data_intensities_minutes.columns[data_intensities_minutes.isna().any()]
for column in columns_with_nan:
    # Series.mode() ignores NaN and returns the modes sorted ascending, so
    # taking the first one resolves ties by picking the smallest value.
    modes = data_intensities_minutes[column].mode()
    if modes.empty:
        # The column is entirely NaN: there is no value to impute with.
        continue
    data_intensities_minutes[column] = data_intensities_minutes[column].fillna(modes.iloc[0])

# Missing values per column after imputation (displayed as the cell result).
data_intensities_minutes.isna().sum()

           Id         ActivityMinute  Intensity
0  1503960366  4/12/2016 12:00:00 AM          0
1  1503960366  4/12/2016 12:01:00 AM          0
2  1503960366  4/12/2016 12:02:00 AM          0
3  1503960366  4/12/2016 12:03:00 AM          0
4  1503960366  4/12/2016 12:04:00 AM          0


Id                0
ActivityMinute    0
Intensity         0
dtype: int64

In [10]:
# Load the per-minute calories CSV.
data_calories = pd.read_csv("Fitabase Data 4.12.16-5.12.16/minuteCaloriesNarrow_merged.csv")

# Show the number of NaN values in each column before imputation.
print(data_calories.isna().sum())

# Replace missing values with the mode (most frequent value) of their column.
# Only columns that actually contain NaN are processed, so identifier and
# timestamp columns are not scanned needlessly when nothing is missing.
columns_with_nan = data_calories.columns[data_calories.isna().any()]
for column in columns_with_nan:
    # Series.mode() ignores NaN and returns the modes sorted ascending, so
    # taking the first one resolves ties by picking the smallest value.
    modes = data_calories[column].mode()
    if modes.empty:
        # The column is entirely NaN: there is no value to impute with.
        continue
    data_calories[column] = data_calories[column].fillna(modes.iloc[0])

# Show the number of NaN values in each column after imputation.
print(data_calories.isna().sum())

Id                0
ActivityMinute    0
Calories          0
dtype: int64
Id                0
ActivityMinute    0
Calories          0
dtype: int64


A continuación imprimimos todos los datasets leídos hasta el momento

In [11]:
# Preview the three frames prepared so far. Each section is emitted as
# "<title>", the first rows of the frame, and a blank-line separator, which
# yields exactly the same text as printing those three pieces one by one.
preview_sections = (
    ("Ahora imprimeros la pulsaciones por minito (heartrate_seconds_merged.csv.csv) \n", result_df),
    ("Ahora imprimeros las intensidades por minuto (data_intensities_minutes.csv) \n", data_intensities_minutes),
    ("Ahora imprimeros la cabecera de (minuteCaloriesNarrow_merged.csv)", data_calories),
)
for section_title, section_frame in preview_sections:
    print(section_title, section_frame.head(), "\n", sep="\n")

Ahora imprimeros la pulsaciones por minito (heartrate_seconds_merged.csv.csv) 

           Id                Time       Value
0  2022484408 2016-04-12 07:21:00  101.600000
1  2022484408 2016-04-12 07:22:00   87.888889
2  2022484408 2016-04-12 07:23:00   58.000000
3  2022484408 2016-04-12 07:24:00   58.000000
4  2022484408 2016-04-12 07:25:00   56.777778


Ahora imprimeros las intensidades por minuto (data_intensities_minutes.csv) 

           Id         ActivityMinute  Intensity
0  1503960366  4/12/2016 12:00:00 AM          0
1  1503960366  4/12/2016 12:01:00 AM          0
2  1503960366  4/12/2016 12:02:00 AM          0
3  1503960366  4/12/2016 12:03:00 AM          0
4  1503960366  4/12/2016 12:04:00 AM          0


Ahora imprimeros la cabecera de (minuteCaloriesNarrow_merged.csv)
           Id         ActivityMinute  Calories
0  1503960366  4/12/2016 12:00:00 AM    0.7865
1  1503960366  4/12/2016 12:01:00 AM    0.7865
2  1503960366  4/12/2016 12:02:00 AM    0.7865
3  1503960366  4/12/

In [12]:
# Convert the calories timestamps (e.g. "4/12/2016 12:00:00 AM") to datetimes.
# The explicit format removes any month/day ambiguity and avoids the slow
# per-element format inference; an already-parsed column is left unchanged.
data_calories['ActivityMinute'] = pd.to_datetime(
    data_calories['ActivityMinute'], format="%m/%d/%Y %I:%M:%S %p"
)

In [13]:
# Inner-join the per-minute heart rate with the calories rows that share the
# same user Id and the same minute (Time on the left, ActivityMinute on the
# right). Both key columns are kept in the result.
merged_df = result_df.merge(
    data_calories,
    how="inner",
    left_on=['Id', 'Time'],
    right_on=['Id', 'ActivityMinute'],
)
print(merged_df)

                Id                Time       Value      ActivityMinute  \
0       2022484408 2016-04-12 07:21:00  101.600000 2016-04-12 07:21:00   
1       2022484408 2016-04-12 07:22:00   87.888889 2016-04-12 07:22:00   
2       2022484408 2016-04-12 07:23:00   58.000000 2016-04-12 07:23:00   
3       2022484408 2016-04-12 07:24:00   58.000000 2016-04-12 07:24:00   
4       2022484408 2016-04-12 07:25:00   56.777778 2016-04-12 07:25:00   
...            ...                 ...         ...                 ...   
333141  8877689391 2016-05-12 13:55:00   60.666667 2016-05-12 13:55:00   
333142  8877689391 2016-05-12 13:56:00   61.875000 2016-05-12 13:56:00   
333143  8877689391 2016-05-12 13:57:00   58.142857 2016-05-12 13:57:00   
333144  8877689391 2016-05-12 13:58:00   61.200000 2016-05-12 13:58:00   
333145  8877689391 2016-05-12 13:59:00   58.000000 2016-05-12 13:59:00   

        Calories  
0        3.32064  
1        3.94326  
2        1.34901  
3        1.03770  
4        1.03770

In [14]:
# Keep only the columns needed downstream (dropping the duplicated
# ActivityMinute key) and give the averaged reading a descriptive name.
heartrate_calories = merged_df[['Id', 'Time', 'Value', 'Calories']]
result_df = heartrate_calories.rename(columns={'Value': 'HeartRate'})

print(result_df.head())

# Save the intermediate heart-rate + calories table.
train_test_csv_path = "Fitabase Data 4.12.16-5.12.16/test_train_data.csv"
result_df.to_csv(train_test_csv_path, index=False)

           Id                Time   HeartRate  Calories
0  2022484408 2016-04-12 07:21:00  101.600000   3.32064
1  2022484408 2016-04-12 07:22:00   87.888889   3.94326
2  2022484408 2016-04-12 07:23:00   58.000000   1.34901
3  2022484408 2016-04-12 07:24:00   58.000000   1.03770
4  2022484408 2016-04-12 07:25:00   56.777778   1.03770


In [15]:
# Convert the intensity timestamps (e.g. "4/12/2016 12:00:00 AM") to datetimes.
# The explicit format removes any month/day ambiguity and avoids the slow
# per-element format inference; an already-parsed column is left unchanged.
data_intensities_minutes['ActivityMinute'] = pd.to_datetime(
    data_intensities_minutes['ActivityMinute'], format="%m/%d/%Y %I:%M:%S %p"
)

# Join the heart-rate/calories table with the intensity rows that share the
# same user Id and minute (default inner join, so unmatched rows are dropped).
merged_df = pd.merge(result_df, data_intensities_minutes, left_on=['Id', 'Time'], right_on=['Id', 'ActivityMinute'])
print(merged_df)

                Id                Time   HeartRate  Calories  \
0       2022484408 2016-04-12 07:21:00  101.600000   3.32064   
1       2022484408 2016-04-12 07:22:00   87.888889   3.94326   
2       2022484408 2016-04-12 07:23:00   58.000000   1.34901   
3       2022484408 2016-04-12 07:24:00   58.000000   1.03770   
4       2022484408 2016-04-12 07:25:00   56.777778   1.03770   
...            ...                 ...         ...       ...   
333141  8877689391 2016-05-12 13:55:00   60.666667   1.33353   
333142  8877689391 2016-05-12 13:56:00   61.875000   1.33353   
333143  8877689391 2016-05-12 13:57:00   58.142857   1.33353   
333144  8877689391 2016-05-12 13:58:00   61.200000   1.33353   
333145  8877689391 2016-05-12 13:59:00   58.000000   1.33353   

            ActivityMinute  Intensity  
0      2016-04-12 07:21:00          1  
1      2016-04-12 07:22:00          1  
2      2016-04-12 07:23:00          0  
3      2016-04-12 07:24:00          0  
4      2016-04-12 07:25:00     

In [16]:
# Final column layout: identifiers first, then the three measurements.
# The duplicated ActivityMinute join key is left out.
final_columns = ['Id', 'Time', 'HeartRate', 'Intensity', 'Calories']
result_df = merged_df.loc[:, final_columns]

# An empty first argument reproduces the leading blank line before the preview.
print("", result_df.head(), sep="\n")


           Id                Time   HeartRate  Intensity  Calories
0  2022484408 2016-04-12 07:21:00  101.600000          1   3.32064
1  2022484408 2016-04-12 07:22:00   87.888889          1   3.94326
2  2022484408 2016-04-12 07:23:00   58.000000          0   1.34901
3  2022484408 2016-04-12 07:24:00   58.000000          0   1.03770
4  2022484408 2016-04-12 07:25:00   56.777778          0   1.03770


In [17]:
# Write the final table (without the index column) and preview what was saved.
train_test_csv_path = "Fitabase Data 4.12.16-5.12.16/test_train_data.csv"
result_df.to_csv(train_test_csv_path, index=False)
print(result_df.head())

           Id                Time   HeartRate  Intensity  Calories
0  2022484408 2016-04-12 07:21:00  101.600000          1   3.32064
1  2022484408 2016-04-12 07:22:00   87.888889          1   3.94326
2  2022484408 2016-04-12 07:23:00   58.000000          0   1.34901
3  2022484408 2016-04-12 07:24:00   58.000000          0   1.03770
4  2022484408 2016-04-12 07:25:00   56.777778          0   1.03770
