In [None]:
import warnings
warnings.filterwarnings('ignore')
from google.colab import files

# El siguiente archivo solicitado es para habilitar la API de Kaggle en el entorno que está trabajando.
# Este archivo se descarga entrando a su perfíl de Kaggle, en la sección API, presionando donde dice: Create New API Token

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

#Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 64 bytes


In [None]:
!kaggle competitions download -c bike-sharing-demand
!unzip bike-sharing-demand.zip





bike-sharing-demand.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  bike-sharing-demand.zip
replace sampleSubmission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# NOTE(review): this requests the file "test" from a different competition
# (taa-2025-freesound-audio-tagging). The recorded output of this cell is a 404 Not Found and
# no other visible cell references that competition — looks like a leftover; confirm and remove.
!kaggle competitions download -c taa-2025-freesound-audio-tagging -f test


404 Client Error: Not Found for url: https://www.kaggle.com/api/v1/competitions/data/download/taa-2025-freesound-audio-tagging/test


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:

# Load the raw training data and the sample submission from the working directory.
# NOTE(review): df_submission is not referenced again in the visible cells.
df_train_raw = pd.read_csv('train.csv')
df_submission = pd.read_csv('sampleSubmission.csv')

In [None]:
from datetime import datetime
import calendar

def FilledIn(df):
    """Insert the missing hourly timestamps of every month and fill their values.

    The data is processed one calendar month at a time. Inside each month the
    rows are re-sampled to an hourly grid that runs from the first to the last
    timestamp present in that month, so gaps *between* months are not filled.

    Filling rules for the rows that are created:
      * 'season', 'holiday', 'workingday', 'weather' -> forward fill.
      * 'temp', 'atemp', 'humidity', 'windspeed', 'count' -> linear interpolation.
      * any other column is left as NaN.
    Columns from the two lists that are absent from ``df`` (for example
    'count' in an unlabeled set) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'datetime' column parseable by ``pd.to_datetime``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Chronologically ordered frame with a fresh 0..n-1 index and
        'datetime' as the first column (as a datetime column).
    """
    cat_features = ['season', 'holiday', 'workingday', 'weather']
    num_features = ['temp', 'atemp', 'humidity', 'windspeed', 'count']

    df_aux = df.copy()
    df_aux['datetime'] = pd.to_datetime(df_aux['datetime'])
    df_aux = df_aux.set_index('datetime').sort_index()

    # Only touch the columns that actually exist in this frame.
    cat_present = [col for col in cat_features if col in df_aux.columns]
    num_present = [col for col in num_features if col in df_aux.columns]

    # Group by the (year, month) found in the data instead of a hard-coded list of years.
    month_keys = [df_aux.index.year.to_numpy(), df_aux.index.month.to_numpy()]

    month_frames = []
    for _, df_month in df_aux.groupby(month_keys):
        # Add the missing timestamps. A Timedelta is used as the rule because the
        # string alias for "hourly" changed spelling across pandas versions.
        df_month = df_month.resample(pd.Timedelta(hours=1)).asfreq()

        # Fill in the values of the rows that were just created.
        if cat_present:
            df_month[cat_present] = df_month[cat_present].ffill()
        if num_present:
            df_month[num_present] = df_month[num_present].interpolate(method='linear')

        month_frames.append(df_month.reset_index())

    if not month_frames:
        # No rows at all: keep the original column layout.
        return pd.DataFrame(columns=df.columns)

    # A single concat avoids re-copying the accumulated frame on every month and
    # keeps numeric dtypes (no empty object-dtype seed frame is involved).
    return pd.concat(month_frames, ignore_index=True)


df_train_raw.head()  # not the last expression of the cell, so nothing is displayed

# Day of the month of every record; the split below is done by calendar day.
dias = pd.to_datetime(df_train_raw.datetime).dt.day

# Days 1-15 of each month go to training, days 16-19 to validation.
train = df_train_raw[dias.between(1, 15)]
val = df_train_raw[dias.between(16, 19)]

# Fill the hourly gaps of each split, then discard the two columns listed in `adropear`.
adropear = ['registered','casual']
df_full_train = FilledIn(train).drop(columns=adropear)
df_full_val = FilledIn(val).drop(columns=adropear)

df_full_train

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1.0,0.0,0.0,1.0,9.84,14.395,81.0,0.0000,16.0
1,2011-01-01 01:00:00,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0000,40.0
2,2011-01-01 02:00:00,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0000,32.0
3,2011-01-01 03:00:00,1.0,0.0,0.0,1.0,9.84,14.395,75.0,0.0000,13.0
4,2011-01-01 04:00:00,1.0,0.0,0.0,1.0,9.84,14.395,75.0,0.0000,1.0
...,...,...,...,...,...,...,...,...,...,...
8635,2012-12-15 19:00:00,4.0,0.0,0.0,1.0,14.76,17.425,62.0,8.9981,257.0
8636,2012-12-15 20:00:00,4.0,0.0,0.0,2.0,14.76,17.425,57.0,12.9980,201.0
8637,2012-12-15 21:00:00,4.0,0.0,0.0,1.0,14.76,17.425,62.0,8.9981,184.0
8638,2012-12-15 22:00:00,4.0,0.0,0.0,1.0,14.76,17.425,62.0,12.9980,156.0


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


class TimeFeatures(BaseEstimator, TransformerMixin):
  """Transformer that replaces the 'datetime' column with calendar features.

  `transform` parses 'datetime', derives 'month', 'weekday' and 'hour' from it
  and finally drops 'datetime' itself. `fit` learns nothing.
  """

  def __init__(self):
    """No hyper-parameters to store."""

  def fit(self, X, y=None):
    """Return the transformer unchanged; X is expected to be a DataFrame."""
    return self

  def transform(self, X):
    """Return a copy of X with 'datetime' swapped for month/weekday/hour columns."""
    stamps = pd.to_datetime(X['datetime'])
    without_datetime = X.drop('datetime', axis=1)
    return without_datetime.assign(
        month=stamps.dt.month,
        weekday=stamps.dt.weekday,
        hour=stamps.dt.hour,
    )

# Categorical features ('month', 'weekday' and 'hour' are created by TimeFeatures)
cat_features = ['season', 'weather', 'month', 'weekday', 'hour']

# Numerical features
num_features = ['temp', 'atemp', 'humidity', 'windspeed']

# [holiday, workingday] are already one-hot, so they are passed through untouched.
# Categorical columns get OneHotEncoder(), numerical ones StandardScaler(). The raw
# 'datetime' column never reaches this step: TimeFeatures drops it beforehand.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising when validation/test data is transformed.
scaler = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
                            ('num', StandardScaler(), num_features),
                            ], remainder='passthrough')


# Full preprocessing pipeline: first TimeFeatures(), then the column-wise scaler defined above.
preprocess_pipe = Pipeline([('timefeatures', TimeFeatures()),
                            ('scaler', scaler)])


## df_full_train and df_full_val are the training and validation data with the missing rows already filled in
df_x_train = df_full_train.copy()
print(df_x_train.shape)
df_x_val = df_full_val.copy()
print(df_x_val.shape)
# The pipeline is fitted on the training split only and then re-used for validation.
# .toarray() relies on the ColumnTransformer returning a sparse matrix, as it does in the recorded run.
X_train = preprocess_pipe.fit_transform(df_x_train.drop('count', axis=1)).toarray()
print(X_train.shape)
X_val = preprocess_pipe.transform(df_x_val.drop('count', axis=1)).toarray()
print(X_val.shape)

(8640, 10)
(2304, 10)
(8640, 57)
(2304, 57)


In [None]:
from tensorflow.keras.preprocessing import timeseries_dataset_from_array
import tensorflow as tf

def _monthly_seq2seq_dataset(features_df, target_df, seq_length, batch_size, years=(2011, 2012)):
    """Build one batched (inputs, targets) dataset out of per-month sliding windows.

    Every calendar month is windowed on its own, so no window mixes rows of two
    different months. Inside a month the inputs are all rows but the last and the
    targets are all rows but the first, i.e. the target at each step is the value
    one row after the corresponding input row.

    Parameters
    ----------
    features_df, target_df : pandas.DataFrame
        Frames indexed by datetime.
    seq_length : int
        Number of rows in every window.
    batch_size : int
        Batch size used for each monthly dataset.
    years : iterable of int
        Years whose twelve months are processed, in order.

    Returns
    -------
    tf.data.Dataset
        The monthly datasets concatenated in chronological order.
    """
    combined = None
    for year in years:
        for month in range(1, 13):
            # Bounds of the month
            start_date = datetime(year, month, 1, 0, 0, 0)
            last_day_of_month = calendar.monthrange(year, month)[1]
            end_date = datetime(year, month, last_day_of_month, 23, 0, 0)

            X_month = features_df[start_date:end_date]
            y_month = target_df[start_date:end_date]

            inputs = tf.keras.utils.timeseries_dataset_from_array(
                X_month[:-1], None, sequence_length=seq_length,
                sequence_stride=1, batch_size=batch_size)
            targets = tf.keras.utils.timeseries_dataset_from_array(
                y_month[1:], None, sequence_length=seq_length,
                sequence_stride=1, batch_size=batch_size)

            # Pair X and y
            month_ds = tf.data.Dataset.zip((inputs, targets))
            combined = month_ds if combined is None else combined.concatenate(month_ds)
    return combined


# Re-attach the timestamps to the preprocessed feature matrices so they can be sliced by date.
df_time = pd.to_datetime(df_x_train['datetime'])
train_df = pd.concat([df_time, pd.DataFrame(data=X_train)], axis=1).set_index('datetime')

df_time = pd.to_datetime(df_x_val['datetime'])
val_df = pd.concat([df_time, pd.DataFrame(data=X_val)], axis=1).set_index('datetime')

# Targets, indexed by the same timestamps.
y_train = df_full_train[['count']]
y_train = y_train.set_index(pd.to_datetime(df_full_train['datetime']))

y_val = df_full_val[['count']]
y_val = y_val.set_index(pd.to_datetime(df_full_val['datetime']))

batch_size = 32
seq_length = 48

train_ds_ss48 = _monthly_seq2seq_dataset(train_df, y_train, seq_length, batch_size)
val_ds_ss48 = _monthly_seq2seq_dataset(val_df, y_val, seq_length, batch_size)

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are cast to float32 and negative values are replaced by 0 before
    log1p is applied.

    Parameters
    ----------
    y_true, y_pred : tensor-like
        Target and predicted values with compatible shapes.

    Returns
    -------
    tf.Tensor
        Scalar float32 tensor.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)

    # Make sure the values are >= 0. tf.maximum only applies the lower bound;
    # clipping to [0, reduce_max(x)] is not valid when every value is negative,
    # because the upper bound then falls below the lower one.
    y_pred = tf.maximum(y_pred, 0.0)
    y_true = tf.maximum(y_true, 0.0)

    return tf.sqrt(tf.reduce_mean(tf.square(tf.math.log1p(y_pred) - tf.math.log1p(y_true))))

In [None]:
def mae_last_timestep(y_true, y_pred):
    """Mean absolute error computed only on index -1 of the second axis.

    Both slices are cast to float32 before they are compared.
    """
    final_step_error = tf.cast(y_true[:, -1], tf.float32) - tf.cast(y_pred[:, -1], tf.float32)
    return tf.reduce_mean(tf.abs(final_step_error))

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM,SimpleRNN, Dense, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping



# LSTM that returns its output at every time step, with input and recurrent dropout.
recurrent_layer = LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
# One ReLU unit applied independently to every time step.
per_step_head = TimeDistributed(Dense(1, activation='relu'))

model_lstm_dropout = Sequential([recurrent_layer, per_step_head])

# Optimised on RMSLE; the MAE of the last time step is tracked as a metric.
model_lstm_dropout.compile(
    loss=rmsle,
    metrics=[mae_last_timestep],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Train for 50 epochs, evaluating on the validation windows after every epoch.
fit_options = dict(validation_data=val_ds_ss48, epochs=50)
history_dropout48 = model_lstm_dropout.fit(train_ds_ss48, **fit_options)

Epoch 1/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 88ms/step - loss: 2.7087 - mae_last_timestep: 127.8601 - val_loss: 2.1136 - val_mae_last_timestep: 189.4746
Epoch 2/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 81ms/step - loss: 1.7655 - mae_last_timestep: 118.8033 - val_loss: 1.8672 - val_mae_last_timestep: 183.1783
Epoch 3/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 91ms/step - loss: 1.6117 - mae_last_timestep: 114.7076 - val_loss: 1.7328 - val_mae_last_timestep: 178.3071
Epoch 4/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 98ms/step - loss: 1.5373 - mae_last_timestep: 111.7367 - val_loss: 1.6475 - val_mae_last_timestep: 174.2835
Epoch 5/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 83ms/step - loss: 1.4963 - mae_last_timestep: 109.4463 - val_loss: 1.5869 - val_mae_last_timestep: 170.6996
Epoch 6/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
# Load the competition test set from the working directory.
df_test = pd.read_csv('test.csv')

# Display the frame as the cell output.
df_test



Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


In [None]:
# Apply the preprocessing that was fitted on the training data. The pipeline must NOT be
# re-fitted here: fit_transform would re-learn the one-hot categories and the scaling
# statistics from the test set, so the features would no longer match what the model saw.
X_test = preprocess_pipe.transform(df_test).toarray()


# Re-attach the timestamps so the feature matrix can be sliced by date.
df_time = pd.to_datetime(df_test['datetime'])
test_df_con_tiempo = pd.concat([df_time, pd.DataFrame(data=X_test)], axis=1).set_index('datetime')


batch_size = 32
seq_length = 48

# One windowed dataset per calendar month, built the same way as for training
# (the last row of each month is left out of the inputs).
monthly_test_datasets = []
for year in [2011, 2012]:
    for month in range(12):
        # Bounds of the month
        start_date = datetime(year, month+1, 1, 0, 0, 0)
        last_day_of_month = calendar.monthrange(year, month+1)[1]
        end_date = datetime(year, month+1, last_day_of_month, 23, 0, 0)

        X_month = test_df_con_tiempo[start_date:end_date]

        dataset = tf.keras.utils.timeseries_dataset_from_array(
            X_month[:-1], None, sequence_length=seq_length,
            sequence_stride=1, batch_size=batch_size)
        monthly_test_datasets.append(dataset)

# Chain the monthly datasets in chronological order.
test_ds = monthly_test_datasets[0]
for dataset in monthly_test_datasets[1:]:
    test_ds = test_ds.concatenate(dataset)






In [None]:
# Run the trained model over the windowed test dataset.
predicciones = model_lstm_dropout.predict(test_ds)

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step


In [None]:
# Length of the first axis of the predictions and length of its first element.
print(len(predicciones), len(predicciones[0]))

5341 48


In [None]:
# Keep only the output of the last time step of every window and flatten it to 1-D.
# The ndim guard makes the cell idempotent: `predicciones` is overwritten here, so without
# the guard a second run of this cell would try to index the already-flattened array with
# three indices and fail.
predicciones = np.asarray(predicciones)
if predicciones.ndim == 3:
    predicciones = predicciones[:, -1, :].flatten()


In [None]:
# Place every prediction on the test row it belongs to.
# The test windows are built month by month with `seq_length` rows each, and the value kept
# from every window is the one-row-ahead prediction of its last step. Consequently the first
# `seq_length` rows of EACH month have no prediction, and the remaining rows of the month
# receive that month's predictions in order. Writing all predictions at the tail of the
# array would shift them onto the wrong timestamps.
test_times = pd.to_datetime(df_test['datetime'])
row_in_month = test_times.groupby(
    [test_times.dt.year.to_numpy(), test_times.dt.month.to_numpy()]
).cumcount()
has_prediction = (row_in_month >= seq_length).to_numpy()

if has_prediction.sum() != len(predicciones):
    raise ValueError(
        "Expected {} predictions (rows after the first {} of each month) but got {}".format(
            int(has_prediction.sum()), seq_length, len(predicciones)))

# Rows without a prediction stay NaN.
preds_completas = np.full(len(df_test), np.nan)
preds_completas[has_prediction] = predicciones

In [None]:
# Fill the NaN positions by linear interpolation, allowing the fill to run in both directions.
preds_series = pd.Series(preds_completas).interpolate(method='linear', limit_direction='both')

In [None]:
# Pair every test timestamp with its predicted count and write the result without the index.
submission_columns = {
    "datetime": df_test["datetime"],
    "count": preds_series,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)