In [6]:
import re

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, TFBertModel, TFAutoModelForSequenceClassification
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import torch

In [2]:
df = pd.read_csv('data/weather.csv', sep=';', usecols=[1, 10, 23])
df['date_time'] = df.index
df.reset_index(drop=True, inplace=True)
df['RRR'] = df['RRR'].fillna(0.0)
df

Unnamed: 0,T,N,RRR,date_time
0,-1.3,100%.,3.0,16.12.2024 21:00
1,-2.0,100%.,2.0,16.12.2024 18:00
2,-1.3,100%.,0.0,16.12.2024 15:00
3,-1.0,100%.,0.0,16.12.2024 12:00
4,-0.9,100%.,0.4,16.12.2024 09:00
...,...,...,...,...
67385,-6.6,100%.,0.0,01.02.2005 15:00
67386,-7.1,"90 или более, но не 100%",0.0,01.02.2005 12:00
67387,-8.6,100%.,3.0,01.02.2005 09:00
67388,-8.2,100%.,2.0,01.02.2005 06:00


In [3]:
def replace_str_with_zero(value):
    try:
        np.float64(value)
        return value
    except ValueError:
        return np.float64(0)

# Применяем функцию ко всем значениям в столбце RRR
df['RRR'] = df['RRR'].apply(replace_str_with_zero)
df['temperature'] = df['T'].astype(np.float64)
df['precipitation'] = df['RRR'].astype(np.float64)

del df['T']
del df['RRR']
df

Unnamed: 0,N,date_time,temperature,precipitation
0,100%.,16.12.2024 21:00,-1.3,3.0
1,100%.,16.12.2024 18:00,-2.0,2.0
2,100%.,16.12.2024 15:00,-1.3,0.0
3,100%.,16.12.2024 12:00,-1.0,0.0
4,100%.,16.12.2024 09:00,-0.9,0.4
...,...,...,...,...
67385,100%.,01.02.2005 15:00,-6.6,0.0
67386,"90 или более, но не 100%",01.02.2005 12:00,-7.1,0.0
67387,100%.,01.02.2005 09:00,-8.6,3.0
67388,100%.,01.02.2005 06:00,-8.2,2.0


In [5]:
# df['date_time'] = pd.to_datetime(df['date_time'], dayfirst=True)
# df['date_time'] = df['date_time'].astype(np.int64)
# df['timestamp'] = df['date_time'].apply(lambda x: pd.Timestamp(x))

In [6]:
# del df['date_time']

In [4]:
df

Unnamed: 0,N,date_time,temperature,precipitation
0,100%.,16.12.2024 21:00,-1.3,3.0
1,100%.,16.12.2024 18:00,-2.0,2.0
2,100%.,16.12.2024 15:00,-1.3,0.0
3,100%.,16.12.2024 12:00,-1.0,0.0
4,100%.,16.12.2024 09:00,-0.9,0.4
...,...,...,...,...
67385,100%.,01.02.2005 15:00,-6.6,0.0
67386,"90 или более, но не 100%",01.02.2005 12:00,-7.1,0.0
67387,100%.,01.02.2005 09:00,-8.6,3.0
67388,100%.,01.02.2005 06:00,-8.2,2.0


In [5]:
def prep_n(text):
    text = str(text)
    f = re.findall(r'\d+', text)
    if len(f):
        return np.int32(f[0])
    else:
        return np.int32(0)
df['cloudiness'] = df['N'].apply(lambda x: prep_n(x))

In [6]:
df = df[df['temperature'].notna()]

In [7]:
del df['N']

In [8]:
df.reset_index(drop=True, inplace=True)

In [9]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Токенизация и подготовка данных  
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(df['date_time'].tolist(), truncation=True, padding=True, return_tensors='pt')

# Нормализация целевых значений  
scaler = StandardScaler()
targets = scaler.fit_transform(df[['temperature', 'precipitation', 'cloudiness']].values)

# Разделяем данные на обучающую и тестовую выборки  
X_train, X_test, y_train, y_test = train_test_split(encodings['input_ids'], targets, test_size=0.2, random_state=42)

# Создаем пользовательский датасет  
class WeatherDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return {
            'input_ids': self.inputs[index],
            'targets': torch.tensor(self.targets[index], dtype=torch.float32)
        }

    # Создаем датасеты  
train_dataset = WeatherDataset(X_train, y_train)
test_dataset = WeatherDataset(X_test, y_test)

# Загружаем модель  
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
# Переносим модель на GPU  
model.to(device)

# Определяем оптимизатор  
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Обучение модели  
model.train()
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

for epoch in range(3):  # Количество эпох  
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        targets = batch['targets']

        outputs = model(input_ids=input_ids, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Сохраняем модель  
model.save_pretrained('./weather_model')
tokenizer.save_pretrained('./weather_tokenizer')

# Функция предсказания  
def predict_weather(date):
    model.eval()
    inputs = tokenizer([date], truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits.detach().numpy()
    predicted_values = scaler.inverse_transform(predictions)  # Обратная нормализация  
    return predicted_values

# Пример использования  
predicted_values = predict_weather('2024-01-01')
print(f'Predicted values for 2024-01-01: Temperature: {predicted_values[0][0]}, '
      f'Precipitation: {predicted_values[0][1]}, Cloudiness: {predicted_values[0][2]}')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 25.321746826171875
Epoch 2, Loss: -8.736536979675293
Epoch 3, Loss: -16.22756004333496
Predicted values for 2024-01-01: Temperature: -974.53369140625, Precipitation: -11.743810653686523, Cloudiness: -270.8409118652344


In [10]:
# Разделение данных на тренировочный и тестовый наборы  
X_train, X_test, y_train, y_test = train_test_split(
    df[['date_time']], df[['temperature', 'precipitation', 'cloudiness']], test_size=0.2, random_state=42
)

# Стандартизация целевых переменных  
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train)
y_test_scaled = scaler.transform(y_test)

# Создание пользовательского датасета  
class WeatherDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]['date_time']
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

    # Инициализация токенизатора и модели  
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Перенос модели на GPU, если доступен  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Создание DataLoader  
train_dataset = WeatherDataset(X_train, y_train_scaled, tokenizer, max_length=10)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Оптимизатор  
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Обучение модели  
model.train()
for epoch in range(8):  # Установите количество эпох  
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.mse_loss(outputs.logits, targets)

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')
    if loss.item() < 0.1:
        break

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.5715553164482117
Epoch 2, Loss: 2.7141001224517822
Epoch 3, Loss: 5.746023178100586
Epoch 4, Loss: 0.035691678524017334


In [11]:
# Сохраняем модель  
model.save_pretrained('./bert_weather_model')
tokenizer.save_pretrained('./bert_weather_model')

('./bert_weather_model/tokenizer_config.json',
 './bert_weather_model/special_tokens_map.json',
 './bert_weather_model/vocab.txt',
 './bert_weather_model/added_tokens.json')

In [12]:
import joblib

# Сохранить  
joblib.dump(scaler, './bert_weather_model/scaler.pkl')

# Затем при необходимости загрузить  
scaler = joblib.load('./bert_weather_model/scaler.pkl')

In [13]:
from transformers import BertTokenizer, BertForSequenceClassification

# Загрузка токенизатора  
tokenizer = BertTokenizer.from_pretrained("./bert_weather_model")

# Загрузка модели  
model = BertForSequenceClassification.from_pretrained("./bert_weather_model")

# Переносим модель на устройство (GPU или CPU)  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [1]:
# Функция для предсказания  
def predict(date):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            date,
            add_special_tokens=True,
            max_length=10,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = scaler.inverse_transform(outputs.logits.cpu().numpy())

        return predictions

In [2]:
# Пример использования  
predicted_values = predict('14.08.2015')
print(f'Predicted Temperature: {predicted_values[0][0]}, Precipitation: {predicted_values[0][1]}, Cloudiness: {predicted_values[0][2]}')

NameError: name 'model' is not defined

In [16]:
def predict_weather(date):
    model.eval()
    inputs = tokenizer([date], truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1).numpy()
    return predictions

# Пример использования  
predicted_values = predict_weather('2024-01-01')
print(f'Predicted values for 2024-01-01: Temperature: {predicted_values[0][0]}, Precipitation: {predicted_values[0][1]}, Cloudiness: {predicted_values[0][2]}')  

IndexError: invalid index to scalar variable.

In [1]:
import torch
print("GPU доступен:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Название устройства:", torch.cuda.get_device_name(0))


GPU доступен: True
Название устройства: AMD Radeon RX 7800 XT


In [2]:
torch.__version__

'2.5.1+rocm6.2'

In [3]:
use_gpu = torch.cuda.is_available()
is_amd = torch.version.hip is not None
device = 'cuda' if use_gpu else 'cpu'

print(f"Using device: {device}")
if is_amd:
    print("Running on AMD GPU with ROCm")
elif use_gpu:
    print("Running on NVIDIA GPU with CUDA")
else:
    print("Running on CPU")


Using device: cuda
Running on AMD GPU with ROCm


In [5]:
import tensorflow as tf
print("GPU доступны:", tf.config.list_physical_devices('GPU'))
print("Собран с поддержкой ROCm:", tf.test.is_built_with_rocm())
print(tf.__version__)



GPU доступны: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Собран с поддержкой ROCm: True
2.16.1
