In [6]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [7]:
# NOTE(review): this mapping is never passed to read_csv below, so it has no
# effect on parsing; its 'datetime' key also does not match the 'date_time'
# column created here. Confirm whether it can be dropped.
dtype = dict(datetime='string', T='float32', RRR='float32')

# Read only the columns at positions 1 and 23 of the ';'-separated file
# (the later cells address them as 'T' and 'RRR').
data = pd.read_csv('data/weather.csv', usecols=[1, 23], sep=';')

# Copy the index into a regular 'date_time' column and fall back to a plain
# positional index.
# NOTE(review): presumably the index holds the observation timestamp as text
# (a later cell calls .split() on it) - confirm against the CSV layout.
data = data.assign(date_time=data.index).reset_index(drop=True)

`T` — температура

`RRR` — количество осадков

In [8]:
# Treat a missing precipitation reading as "no precipitation" (0.0).
data = data.fillna({'RRR': 0.0})

In [9]:
# Work on an independent (deep) copy so `data` keeps the loaded values.
df = data.copy(deep=True)

In [10]:
# Keep only the first whitespace-separated token of each timestamp as 'date',
# then discard the full timestamp column.
df['date'] = df['date_time'].map(lambda stamp: stamp.split()[0].strip())
df = df.drop(columns='date_time')

In [11]:
# Keep only the rows that actually have a temperature reading.
has_temperature = df['T'].notna()
df = df.loc[has_temperature]

In [12]:
def replace_str_with_zero(value):
    """Return ``value`` if it can be parsed as a float, otherwise zero.

    The value is probed with ``np.float64(value)``. When that succeeds the
    original object is returned unchanged (it is *not* converted); when it
    raises ``ValueError`` a ``np.float64(0)`` is returned instead.
    """
    try:
        np.float64(value)
    except ValueError:
        # Not parseable as a number: substitute zero.
        return np.float64(0)
    # Parseable: hand back the original object untouched.
    return value

# Run every value of the RRR column through replace_str_with_zero
df['RRR'] = df['RRR'].map(replace_str_with_zero)

In [13]:
# Add float64 versions of T and RRR under descriptive names, then drop the
# original short-named columns.
df = df.assign(
    temperature=df['T'].astype(np.float64),
    precipitation=df['RRR'].astype(np.float64),
).drop(columns=['T', 'RRR'])

In [14]:
# Collapse to one row per date: mean temperature and total precipitation.
daily = df.groupby('date').agg(
    temperature=('temperature', 'mean'),
    precipitation=('precipitation', 'sum'),
)
# Move the group key from the index into a 'date' column and restore a plain
# positional index.
df = daily.assign(date=daily.index).reset_index(drop=True)

In [15]:
# Break each 'date' string on '.' into three text columns, labelled
# day / month / year in that order.
split_dates = (
    df['date']
    .str.split('.', expand=True)
    .set_axis(['day', 'month', 'year'], axis='columns')
)
split_dates

Unnamed: 0,day,month,year
0,01,01,2006
1,01,01,2007
2,01,01,2008
3,01,01,2009
4,01,01,2010
...,...,...,...
7252,31,12,2019
7253,31,12,2020
7254,31,12,2021
7255,31,12,2022


In [16]:
# Attach the three date parts as columns and drop the combined 'date' string.
df = df.assign(
    day=split_dates['day'],
    month=split_dates['month'],
    year=split_dates['year'],
).drop(columns='date')

In [17]:
# The date parts are still text after the split; convert them to int64.
df = df.astype({'day': np.int64, 'month': np.int64, 'year': np.int64})

In [18]:
# Features: the calendar components of each row
X = df.loc[:, ['day', 'month', 'year']]

# Target variable: temperature
y = df.loc[:, 'temperature']

# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try (every combination is evaluated)
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 5, 10, 15, 20, 25],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# scored by negative MSE and run on all available cores.
# fit() returns the search object itself.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Best parameter combination and the estimator refitted with it
print(grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out split
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"Best RMSE: {rmse_best}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best RMSE: 2.5541286803366168


In [20]:
# Random forest with the hyper-parameters hard-coded below, fitted on the
# training split in one step (fit() returns the estimator itself).
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out split
y_pred = model.predict(X_test)

# Model quality: root mean squared error on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 2.5541286803366168


In [21]:
# One-row feature frame for day=13, month=2, year=2025, using the same
# column names and order the model was fitted on.
pred_d = pd.DataFrame([[13, 2, 2025]], columns=['day', 'month', 'year'])
model.predict(pred_d)

array([-5.698125])

```

# Сохраняем модель в файл
joblib.dump(model, 'random_forest_model.joblib')

# Загружаем сохранённую модель
loaded_model = joblib.load('random_forest_model.joblib')

# Делаем предсказание на тестовом наборе
predictions = loaded_model.predict(X_test)

```

In [23]:
# Save the fitted model to disk.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
model_path = './random_forest_model/random_forest_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)


['./random_forest_model/random_forest_model.joblib']

In [2]:
import joblib

# Load the model from the exact path the save cell writes to
# ('./random_forest_model/random_forest_model.joblib'). The previous bare
# 'random_forest_model.joblib' pointed at a different file, so a stale or
# missing model could be picked up.
# NOTE: joblib.load unpickles the file - only load files you trust.
loaded_model = joblib.load('./random_forest_model/random_forest_model.joblib')

In [5]:
import pandas as pd

# One-row feature frame for day=14, month=8, year=2015
pred_d = pd.DataFrame([[14, 8, 2015]], columns=['day', 'month', 'year'])

# Predict for that single date with the reloaded model
predictions = loaded_model.predict(pred_d)
predictions


array([17.282253])