In [491]:
import pandas as pd
df = pd.read_csv('temperature_data.csv')
df['city'].unique()  # Вывод уникальных значений столбца 'city'

array(['New York', 'London', 'Paris', 'Tokyo', 'Moscow', 'Sydney',
       'Berlin', 'Beijing', 'Rio de Janeiro', 'Dubai', 'Los Angeles',
       'Singapore', 'Mumbai', 'Cairo', 'Mexico City'], dtype=object)

In [492]:
import re
import numpy as np

def calculate_mean_std_analyze_anomalies(df):
    #среднее для аномалий
    df['rolling_mean'] = df.groupby(['city'])['temperature'].rolling(window=30).mean().reset_index(level=0, drop=True)
    df['rolling_std'] = df.groupby(['city'])['temperature'].rolling(window=30).std().reset_index(level=0, drop=True)
    # Определение аномалий
    df['is_anomaly'] = (df['temperature'] < df['rolling_mean'] - 2 * df['rolling_std']) | (df['temperature'] > df['rolling_mean'] + 2 * df['rolling_std'])
    return df

def calculate_season_mean_sdt(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']) #переводим в формат даты для того, чтобы сформировать столбец сезонов
    df['season'] = df['timestamp'].dt.month.apply(lambda x: 'winter' if x in [12, 1, 2] else 'spring' if x in [3, 4, 5] else 'summer' if x in [6, 7, 8] else 'autumn')
    #среднее для профиля сезона
    seasonal_stats = df.groupby(['city', 'season'])['temperature'].agg(['mean', 'std']).rename(columns={'mean': 'season_mean', 'std': 'season_std'}).reset_index() 
    df = df.merge(seasonal_stats, on=['city', 'season'], how='left')
    return df

In [493]:
import time 
start_time = time.time()
df = calculate_mean_std_analyze_anomalies(df)
df = calculate_season_mean_sdt(df)
end_time = time.time()
print(f'time: {end_time - start_time} seconds')

time: 0.0746927261352539 seconds


In [478]:
df

Unnamed: 0,city,timestamp,temperature,season,rolling_mean,rolling_std,is_anomaly,season_mean,season_std
0,New York,2010-01-01,4.214168,winter,,,False,0.014277,5.346761
1,New York,2010-01-02,-1.694455,winter,,,False,0.014277,5.346761
2,New York,2010-01-03,-3.534843,winter,,,False,0.014277,5.346761
3,New York,2010-01-04,1.357098,winter,,,False,0.014277,5.346761
4,New York,2010-01-05,3.683931,winter,,,False,0.014277,5.346761
...,...,...,...,...,...,...,...,...,...
54745,Mexico City,2019-12-25,14.466296,winter,11.448959,5.035731,False,11.737176,4.977217
54746,Mexico City,2019-12-26,18.109403,winter,11.533084,5.127465,False,11.737176,4.977217
54747,Mexico City,2019-12-27,11.930446,winter,11.526251,5.126771,False,11.737176,4.977217
54748,Mexico City,2019-12-28,13.996690,winter,11.970870,4.713464,False,11.737176,4.977217


In [None]:
import time 
import asyncio
from unittest import result
import numpy as np



async def calculate_mean_std_analyze_anomalies(df):
    #среднее для аномалий
    df['rolling_mean'] = df.groupby(['city'])['temperature'].rolling(window=30).mean().reset_index(level=0, drop=True)
    df['rolling_std'] = df.groupby(['city'])['temperature'].rolling(window=30).std().reset_index(level=0, drop=True)
    # Определение аномалий
    df['is_anomaly'] = (df['temperature'] < df['rolling_mean'] - 2 * df['rolling_std']) | (df['temperature'] > df['rolling_mean'] + 2 * df['rolling_std'])
    return df

async def calculate_season_mean_sdt(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']) #переводим в формат даты для того, чтобы сформировать столбец сезонов
    df['season'] = df['timestamp'].dt.month.apply(lambda x: 'winter' if x in [12, 1, 2] else 'spring' if x in [3, 4, 5] else 'summer' if x in [6, 7, 8] else 'autumn')
    #среднее для профиля сезона
    seasonal_stats = df.groupby(['city', 'season'])['temperature'].agg(['mean', 'std']).rename(columns={'mean': 'season_mean', 'std': 'season_std'}).reset_index() 
    df = df.merge(seasonal_stats, on=['city', 'season'], how='left')
    return df

start_time = time.time()

results = await asyncio.gather(
    calculate_mean_std_analyze_anomalies(df),
    calculate_season_mean_sdt(df),
)

results = results[0].merge(results[1], how='inner')
end_time = time.time()
print(f'Execution time: {end_time - start_time} sec')

Execution time: 0.12137103080749512 sec


In [485]:
results
#результат работы асинхронной функции, выше время

Unnamed: 0,city,timestamp,temperature,season,rolling_mean,rolling_std,is_anomaly,season_mean,season_std
0,New York,2010-01-01,4.214168,winter,,,False,0.014277,5.346761
1,New York,2010-01-02,-1.694455,winter,,,False,0.014277,5.346761
2,New York,2010-01-03,-3.534843,winter,,,False,0.014277,5.346761
3,New York,2010-01-04,1.357098,winter,,,False,0.014277,5.346761
4,New York,2010-01-05,3.683931,winter,,,False,0.014277,5.346761
...,...,...,...,...,...,...,...,...,...
54745,Mexico City,2019-12-25,14.466296,winter,11.448959,5.035731,False,11.737176,4.977217
54746,Mexico City,2019-12-26,18.109403,winter,11.533084,5.127465,False,11.737176,4.977217
54747,Mexico City,2019-12-27,11.930446,winter,11.526251,5.126771,False,11.737176,4.977217
54748,Mexico City,2019-12-28,13.996690,winter,11.970870,4.713464,False,11.737176,4.977217


In [None]:
#время выполнения при асинхронности выше - тк numpy функции синхронны, в данном случае апдейтится один источник данных, 
# при асинхронном вызовые возвращаются два датафрейма с соотетствующими колонкаими
# которые затем приходится объединять - на это тоже тратятся ресурсы  
# также тратится время на организацию асинхронности
# поэтому тут неудачный случай для применения асинхронности, будем использовать синхронный подход в задании с пприложением

In [494]:
import json
import requests
API_key = ""
def get_weather_data(city):
    url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={API_key}&units=metric"
    response = requests.get(url)
    temp = response.json().get('main').get('temp')
    return temp

temp = get_weather_data('London')

In [495]:
import time
def get_season(month):
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'autumn'
    
def analyze_anomaly(city, df): 
   temp = get_weather_data(city) # текущая температура
   print(f'{city} temp is {temp}')
   current_month = time.localtime().tm_mon # текущий месяц
   season = get_season(current_month) # текущий сезон
   print(season)
   city_data = df[(df['city'] == city) & (df['season'] == season)] # фильтруем по городу и сезону
   mean_temp_by_city = city_data['season_mean'].values[0]
   print(f'{city} mean temp is {mean_temp_by_city}')
   season_std_by_city = city_data['season_std'].values[0]
   print(f'{city} std temp is {season_std_by_city}')
   anomaly_flag = False
   if (temp > (mean_temp_by_city + 2 * season_std_by_city)) or (temp < (mean_temp_by_city - 2 * season_std_by_city)):
      anomaly_flag = True
      print(anomaly_flag)
      return anomaly_flag
   else: 
      print(anomaly_flag)
      return anomaly_flag
      
    

In [496]:
%%time
start_time = time.time()
get_weather_data('Paris')
analyze_anomaly('Paris', df)
get_weather_data('London')
analyze_anomaly('London', df)
get_weather_data('Berlin')
analyze_anomaly('Berlin', df)
get_weather_data('New York')
analyze_anomaly('New York', df)
get_weather_data('Mumbai')
analyze_anomaly('Mumbai', df)
get_weather_data('Paris')
analyze_anomaly('Paris', df)
print('time:', time.time() - start_time)

Paris temp is 4.57
winter
Paris mean temp is 4.187665832523104
Paris std temp is 4.827912842348074
False
London temp is 6.22
winter
London mean temp is 4.912100993088853
London std temp is 5.056827932115923
False
Berlin temp is 0.84
winter
Berlin mean temp is 0.06892999600845177
Berlin std temp is 4.81279824780885
False
New York temp is 13.27
winter
New York mean temp is 0.014277392576934408
New York std temp is 5.346761151974429
True
Mumbai temp is 26.99
winter
Mumbai mean temp is 24.803236767146924
Mumbai std temp is 5.177606466194116
False
Paris temp is 4.57
winter
Paris mean temp is 4.187665832523104
Paris std temp is 4.827912842348074
False
time: 6.543658971786499
CPU times: user 95 ms, sys: 16.2 ms, total: 111 ms
Wall time: 6.54 s


In [497]:
#пробуем асинхронный подход
import aiohttp 
async def async_get_weather_data(city):
    url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={API_key}&units=metric"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.json()
            temp = data.get('main').get('temp')
            return temp

async def async_get_season(month):
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'autumn'

async def async_analyze_anomaly(city):
   temp = await async_get_weather_data(city)
   print(f'{city} temp is {temp}')
   current_month = time.localtime().tm_mon # текущий месяц
   season = await async_get_season(current_month) # текущий сезон
   print(season)
   city_data = df[(df['city'] == city) & (df['season'] == season)]
   mean_temp_by_city = city_data['season_mean'].values.mean()
   print(f'{city} mean temp is {mean_temp_by_city}')
   season_std_by_city = city_data['season_std'].values[0]
   print(f'{city} std temp is {season_std_by_city}')
   anomaly_flag = False
   if (temp > (mean_temp_by_city + 2 * season_std_by_city)) or (temp < (mean_temp_by_city - 2 * season_std_by_city)):
      anomaly_flag = True
      print(anomaly_flag)
      return anomaly_flag
   else: 
      print(anomaly_flag)
      return anomaly_flag

In [498]:
%autoawait
import asyncio
start_time = time.time()
results = await asyncio.gather(
    async_get_weather_data('Paris'),
    async_analyze_anomaly('Paris'),
    async_get_weather_data('London'),
    async_analyze_anomaly('London'),
    async_get_weather_data('Berlin'),
    async_analyze_anomaly('Berlin'),
    async_get_weather_data('New York'),
    async_analyze_anomaly('New York'),
    async_get_weather_data('Mumbai'),
    async_analyze_anomaly('Mumbai'),
    async_get_weather_data('Paris'),
    async_analyze_anomaly('Paris'), 
)
print('time:', time.time() - start_time)

IPython autoawait is `on`, and set to use `asyncio`
Berlin temp is 0.84
winter
Berlin mean temp is 0.06892999600845176
Berlin std temp is 4.81279824780885
False
Paris temp is 4.57
winter
Paris mean temp is 4.187665832523102
Paris std temp is 4.827912842348074
False
Paris temp is 4.57
winter
Paris mean temp is 4.187665832523102
Paris std temp is 4.827912842348074
False
London temp is 6.22
winter
London mean temp is 4.912100993088854
London std temp is 5.056827932115923
False
Mumbai temp is 26.99
winter
Mumbai mean temp is 24.803236767146924
Mumbai std temp is 5.177606466194116
False
New York temp is 13.27
winter
New York mean temp is 0.014277392576934412
New York std temp is 5.346761151974429
True
time: 0.5535132884979248


In [172]:
#асинхронный подход работает в данном случае быстрее, тк API запросы выполняются к OpenWeatherMap параллельно , в примере выше 0.55 секунды против ±6.5 секунды
#плюсы асинхронного подхода очевидны
#стоит иметь ввиду, что применялся асинхронный подход на готовом датасете - в приложении также подготовим перед испольованием

In [499]:
from sklearn.linear_model import LinearRegression
def create_season_profile(city,df):
    df_city = df[df['city']==city]
    seasonal_stats = df_city.groupby('season')['temperature'].agg(
        min_temp='min',
        max_temp='max',
        mean_temp='mean',
        std_temp='std'
    ).reset_index()
    whole_time_mean = df_city['temperature'].mean()
    whole_time_min = df_city['temperature'].min()
    whole_time_max = df_city['temperature'].max()
    
    df_city['timestamp_num'] = df_city['timestamp'].map(pd.Timestamp.toordinal)  # Преобразуем дату в числовой формат
    model = LinearRegression()
    model.fit(df_city[['timestamp_num']], df_city['temperature'])
    trend_slope = model.coef_[0]  # Уклон тренда (положительный или отрицательный)
    
    return {
        'city': city,
        'seasonal_stats': seasonal_stats,
        'whole_time_mean': whole_time_mean,
        'whole_time_min': whole_time_min,
        'whole_time_max': whole_time_max,
        'trend_slope': trend_slope
    }

In [500]:
res = create_season_profile('Paris',df)
res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city['timestamp_num'] = df_city['timestamp'].map(pd.Timestamp.toordinal)  # Преобразуем дату в числовой формат


{'city': 'Paris',
 'seasonal_stats':    season   min_temp   max_temp  mean_temp  std_temp
 0  autumn  -3.248560  28.683066  12.779061  4.897372
 1  spring  -4.679654  29.887790  11.924933  5.100905
 2  summer   2.546362  37.043690  20.026881  5.050846
 3  winter -11.077072  18.131607   4.187666  4.827913,
 'whole_time_mean': np.float64(12.272195522499043),
 'whole_time_min': np.float64(-11.077071831669423),
 'whole_time_max': np.float64(37.04369038922585),
 'trend_slope': np.float64(0.00012074902359475136)}