In [4]:
import json
import requests
import datetime
import pandas as pd
import numpy as np
import boto3
from io import StringIO

## Puxando dados e criando df

In [5]:
# Location of the historical weather CSV in S3 (bucket + object key).
bucket_name = 'dados-clima'
filename = 'dados_historicos/20240101-20241231.csv'

# S3 client; no credentials are passed explicitly, so boto3 resolves them
# from its default configuration.
s3_client = boto3.client('s3')

# Fetch the object, read the whole body into memory and decode it as UTF-8.
response = s3_client.get_object(Bucket=bucket_name, Key=filename)
csv_data = response['Body'].read().decode('utf-8')

# Parse the CSV text into the working DataFrame used by the following cells.
df = pd.read_csv(StringIO(csv_data))

## Realizando análise exploratória dos dados (parcial)

In [6]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   time                  8784 non-null   object 
 1   temperature_2m        8784 non-null   float64
 2   relative_humidity_2m  8784 non-null   int64  
 3   apparent_temperature  8784 non-null   float64
 4   precipitation         8784 non-null   float64
 5   rain                  8784 non-null   float64
 6   weather_code          8784 non-null   int64  
 7   cloud_cover           8784 non-null   int64  
 8   wind_direction_10m    8784 non-null   int64  
 9   wind_speed_10m        8784 non-null   float64
 10  is_day                8784 non-null   int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 755.0+ KB


In [7]:
# Count missing values per column.
df.isnull().sum()

time                    0
temperature_2m          0
relative_humidity_2m    0
apparent_temperature    0
precipitation           0
rain                    0
weather_code            0
cloud_cover             0
wind_direction_10m      0
wind_speed_10m          0
is_day                  0
dtype: int64

In [8]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,time,temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,rain,weather_code,cloud_cover,wind_direction_10m,wind_speed_10m,is_day
0,2024-01-01 00:00:00,20.9,82,21.4,0.0,0.0,3,100,127,15.7,0
1,2024-01-01 01:00:00,20.7,83,21.5,0.0,0.0,3,100,129,13.8,0
2,2024-01-01 02:00:00,20.5,83,21.5,0.0,0.0,3,100,133,11.7,0
3,2024-01-01 03:00:00,20.4,84,21.7,0.0,0.0,3,100,135,10.2,0
4,2024-01-01 04:00:00,20.3,86,21.3,2.4,1.9,63,100,131,12.5,0


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(8784, 11)

In [10]:
# Parse the timestamp strings into datetimes so the .dt accessor is available.
df['time'] = pd.to_datetime(df['time'])

# Calendar components extracted from the timestamp.
timestamps = df['time'].dt
df['hour'] = timestamps.hour
df['day_of_week'] = timestamps.dayofweek  # 0 = Monday, 6 = Sunday
df['month'] = timestamps.month

# Cyclic encoding: map hour (period 24) and month (period 12) onto the unit
# circle so that neighbouring values across the wrap-around (e.g. 23h and 0h)
# stay close to each other in feature space.
hour_angle = 2 * np.pi * df['hour']/24
month_angle = 2 * np.pi * df['month']/12

df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

## Dividindo as variáveis características (x) da variável alvo (y)

In [11]:
# Columns that are not passed to the model as features.
dropped_columns = [
    'temperature_2m',                # prediction target
    'time',                          # raw timestamp
    'hour', 'day_of_week', 'month',  # raw calendar fields (hour/month survive as sin/cos pairs)
]

# NOTE(review): day_of_week is dropped here although no cyclic encoding of it
# was created, so the weekday information is not available to the model.
# NOTE(review): apparent_temperature remains a feature; it is presumably
# derived from the air temperature itself, which would make it a near-proxy
# for the target -- confirm this is intended.
y = df['temperature_2m']
x = df.drop(columns=dropped_columns)

In [12]:
# Display the feature matrix (the notebook renders a truncated view).
x

Unnamed: 0,relative_humidity_2m,apparent_temperature,precipitation,rain,weather_code,cloud_cover,wind_direction_10m,wind_speed_10m,is_day,hour_sin,hour_cos,month_sin,month_cos
0,82,21.4,0.0,0.0,3,100,127,15.7,0,0.000000,1.000000,5.000000e-01,0.866025
1,83,21.5,0.0,0.0,3,100,129,13.8,0,0.258819,0.965926,5.000000e-01,0.866025
2,83,21.5,0.0,0.0,3,100,133,11.7,0,0.500000,0.866025,5.000000e-01,0.866025
3,84,21.7,0.0,0.0,3,100,135,10.2,0,0.707107,0.707107,5.000000e-01,0.866025
4,86,21.3,2.4,1.9,63,100,131,12.5,0,0.866025,0.500000,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,72,25.2,0.0,0.0,3,100,117,10.5,0,-0.965926,0.258819,-2.449294e-16,1.000000
8780,75,24.7,0.0,0.0,1,6,114,10.6,0,-0.866025,0.500000,-2.449294e-16,1.000000
8781,80,23.8,0.0,0.0,0,0,115,7.6,0,-0.707107,0.707107,-2.449294e-16,1.000000
8782,81,23.6,0.0,0.0,0,0,117,6.4,0,-0.500000,0.866025,-2.449294e-16,1.000000


In [13]:
# Display the target series (temperature_2m).
y

0       20.9
1       20.7
2       20.5
3       20.4
4       20.3
        ... 
8779    23.7
8780    23.1
8781    21.9
8782    21.6
8783    21.0
Name: temperature_2m, Length: 8784, dtype: float64

## Para evitar que modelos sensíveis à escala (como a regressão linear e o SVR usado aqui) sofram com a diferença de escala, vamos padronizar os dados

✅ Quando Normalizar/Padronizar as variáveis
Você deve normalizar ou padronizar se for usar modelos que dependem da escala dos dados, como:

Redes neurais (LSTM, GRU, MLP, etc.)

Regressão linear / ridge / lasso

SVM

KNN

PCA

K-means (se for clustering)

📌 Normalização (MinMaxScaler): coloca os valores entre 0 e 1. 📌 Padronização (StandardScaler): transforma os dados para média 0 e desvio padrão 1.

❌ Quando NÃO precisa normalizar
Modelos baseados em árvores não precisam de normalização:

Decision Tree

Random Forest

XGBoost

LightGBM

CatBoost

Eles são invariantes à escala porque dividem os dados com base em thresholds, e não dependem de distâncias ou coeficientes lineares.

In [14]:
# Scaler that standardizes each feature column (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

# NOTE: despite its name, `min_max_scaler` holds a StandardScaler, not a
# MinMaxScaler. The name is kept because later cells persist this object.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set statistics leak into the scaling (and into
# the cross-validation folds). Consider fitting on the training split only,
# e.g. via a Pipeline passed to GridSearchCV.
min_max_scaler = StandardScaler().fit(x)

# Replace the feature frame by its standardized version.
x = min_max_scaler.transform(x)

Agora os dados estão prontos para serem separados nos conjuntos de treino e 
teste, com 90% dos dados para o treinamento e 10% para teste (`test_size=0.1`):

In [15]:
# m√©tricas 
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.metrics import r2_score

from sklearn.svm import SVR 
from sklearn.model_selection import GridSearchCV 

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for the final evaluation; the fixed random_state
# makes the shuffle (and therefore the split) reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0
)

# Base estimator whose hyper-parameters are tuned below.
svr = SVR()

# Search grid: 2 kernels x 2 values of C = 4 candidate configurations.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}

# Exhaustive grid search with 10-fold cross-validation on the training split.
# No `scoring` is given, so candidates are ranked by the estimator's own score.
clf = GridSearchCV(estimator=svr, param_grid=parameters, cv=10)

# Run the search; the fitted search object is also the cell's displayed value.
clf.fit(x_train, y_train)

In [17]:
# Show the hyper-parameter combination selected by the grid search.
print(clf.best_params_) 

{'C': 10, 'kernel': 'rbf'}


In [18]:
# Best estimator found by the grid search, used to predict the held-out set.
best_model = clf.best_estimator_
y_pred = best_model.predict(x_test)

# Error metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)              # mean absolute error
mse = mean_squared_error(y_test, y_pred)               # mean squared error
rmse = np.sqrt(mse)                                    # root of the MSE
r2 = r2_score(y_test, y_pred)                          # coefficient of determination
mape = mean_absolute_percentage_error(y_test, y_pred)  # returned as a fraction, not in percent

# Report; MAPE is printed unformatted, exactly as returned.
print(f"MAE (Erro M√©dio Absoluto): {mae:.4f}")
print(f"RMSE (Raiz do Erro Quadr√°tico M√©dio): {rmse:.4f}")
print(f"R¬≤ (Coeficiente de Determina√ß√£o): {r2:.4f}")
print(f'MAPE (Percentual do erro m√©dio absoluto): {mape}')

MAE (Erro M√©dio Absoluto): 0.0950
RMSE (Raiz do Erro Quadr√°tico M√©dio): 0.1589
R¬≤ (Coeficiente de Determina√ß√£o): 0.9990
MAPE (Percentual do erro m√©dio absoluto): 0.004189031997725184


1. MAE (Erro Médio Absoluto):
MAE = 0.0950
Isso indica que, em média, o seu modelo comete um erro de 0.095°C nas previsões da temperatura. Isso é muito preciso, considerando o tipo de dado e a aplicação.

2. RMSE (Raiz do Erro Quadrático Médio):
RMSE = 0.1589
O RMSE de 0.159°C indica que o erro típico das previsões é de cerca de 0.16°C, com os erros grandes pesando mais do que no MAE. Isso também é ótimo, pois um valor tão baixo sugere que os erros do modelo são bem controlados.

3. R² (Coeficiente de Determinação):
R² = 0.9990
Este valor 0.9990 é excelente! Significa que o modelo é capaz de explicar 99.9% da variabilidade da temperatura. O modelo está capturando quase toda a variação nos dados, o que é um excelente resultado.

4. MAPE (Erro Percentual Absoluto Médio):
MAPE = 0.0042 (ou 0.42%)
O MAPE de 0.42% é muito baixo, indicando que a média do erro percentual é inferior a 1%. Isso é extremamente preciso e é uma ótima medida de quão bem o modelo está prevendo os valores.

## Salvando o modelo treinado

In [19]:
import joblib

In [20]:
# Persist the tuned model (best_model) to a local file with joblib.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(best_model, 'svr_model.pkl')

['svr_model.pkl']

In [None]:
# Persist the fitted scaler next to the model so the same feature scaling can
# be reapplied to new inputs before calling the model.
joblib.dump(min_max_scaler, 'scaler.pkl')

['scaler.pkl']

In [25]:
# Destination bucket and object keys for the uploaded artifacts.
# bucket_name repeats the value already set in the data-loading cell.
bucket_name = 'dados-clima'
model_key = 'modelos/svr_model.pkl'
scaler_key = 'modelos/scaler.pkl'

In [26]:
# Upload both local artifacts (model first, then scaler) to their S3 keys.
artifacts = (
    ('svr_model.pkl', model_key),
    ('scaler.pkl', scaler_key),
)
for local_path, object_key in artifacts:
    s3_client.upload_file(local_path, bucket_name, object_key)

print(f'Modelo e scaler salvos no S3 em {model_key} e {scaler_key}')

Modelo e scaler salvos no S3 em modelos/svr_model.pkl e modelos/scaler.pkl
