# 🌡️🔥 Day 7 Challenge
## Heatwave of Insights: Mastering Temperature Data with Pandas, Visualization, EDA, and Stats!

### 📂 Dataset:
"https://www.kaggle.com/datasets/sudalairajkumar/daily-temperature-of-major-cities"

In [201]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Pandas

# Q1: Load a dataset containing city temperatures over several years. Group the data by city and calculate the average temperature for each city.

In [204]:
df = pd.read_csv('city_temperature.csv',low_memory=False)
df

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


In [205]:
avg_temp_city = df.groupby('City')['AvgTemperature'].mean().sort_values(ascending = False)
avg_temp_city

City
Dubai               82.972631
Chennai (Madras)    82.847021
Doha                82.235625
Abu Dhabi           82.192499
Niamey              81.951619
                      ...    
Frankfurt          -13.668786
Lilongwe           -20.585544
Georgetown         -22.101520
Bonn               -46.868050
Bujumbura          -65.397138
Name: AvgTemperature, Length: 321, dtype: float64

# Q2: Create a new column indicating the temperature difference from the previous day for each city.

In [207]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
df['Date']

ValueError: cannot assemble the datetimes: day is out of range for month, at position 58204. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
print(df.iloc[58204])

In [None]:
df = df[df['Year'] > 1000]
df = df[df['AvgTemperature'] != -99.0]

In [None]:
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
df['Date']

In [None]:
df = df.sort_values(by = ['City', 'Date'])
df['Temp_diff'] = df.groupby('City')['AvgTemperature'].diff()
df.head(10)

# Q3: Filter out cities where the average temperature over the given period is below 15°C.

In [None]:
city_month_temp_stats = df.groupby(['City', 'Month'])['AvgTemperature'].mean().reset_index(name = 'avg_temp')
cities_below_temp = city_month_temp_stats[city_month_temp_stats['avg_temp'] < 15]
cities_below_temp

# Visualization

# Q1: Create a box plot to compare the temperature distribution across different cities.

In [None]:
top_cities = df.groupby('City')['AvgTemperature'].mean().sort_values(ascending = False).head(10)
top_cities_data = df[df['City'].isin(top_cities.index)]
top_cities_data

plt.figure(figsize = (8,5))
sns.boxplot(data = top_cities_data, x = 'City', y = 'AvgTemperature', color = 'teal')
plt.xticks(rotation=90)
plt.title('Temperature Distribution Across Cities')
plt.show()

# Q2: Plot a line graph showing the temperature trends over time for the top 3 hottest cities.

In [None]:
hottest_city_3 = df.groupby('City')['AvgTemperature'].mean().sort_values(ascending = False).head(3)
hottest_city_3_data = df[df['City'].isin(hottest_city_3.index)]

plt.figure(figsize = (10,5))
sns.lineplot(data = hottest_city_3_data, x = 'Year', y = 'AvgTemperature', hue = 'City', marker = 'o', linewidth=1.5, errorbar=None)
plt.xlabel('Year')
plt.ylabel('Average Temperature (°C)')
plt.legend(title='City', loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)

plt.xticks(rotation=45)
plt.show()

# EDA

# Q1: Perform a basic statistical analysis of the temperature data. Report mean, median, standard deviation, and any significant outliers.

In [None]:
print(f"Mean: {round(df['AvgTemperature'].mean(),4)}")
print(f"Median: {round(df['AvgTemperature'].median(),4)}")
print(f"Std. Deviation: {round(df['AvgTemperature'].std(),4)}")

Q1 = df['AvgTemperature'].quantile(0.25)
Q3 = df['AvgTemperature'].quantile(0.75)
IQR = Q3 - Q1
UW = Q3 + 1.5*IQR
LW = Q1 - 1.5*IQR
outliers = (df['AvgTemperature'] < LW) | (df['AvgTemperature'] > UW)
print(f"Outliers: {outliers.sum()}")

# Statistics

# Q1: Perform a one-sample t-test to determine if the average temperature of the hottest city is significantly different from 30°C.

In [None]:
hottest_city = df.groupby('City')['AvgTemperature'].mean().sort_values(ascending = False).head(1)

hottest_city_name = hottest_city.index[0]

hottest_city_data = df[df['City'] == hottest_city_name]

t_stat, p_val = stats.ttest_1samp(hottest_city_data['AvgTemperature'].dropna(), popmean=30)

print(f"Hottest City: {hottest_city_name}")
print(f"t_stat: {t_stat:.4f}")
print(f"p_value: {p_val:.4f}")

alpha = 0.05
if p_val < alpha:
    print("Reject the null hypothesis: The average temperature is significantly different from 30°C.")
else:
    print("Fail to reject the null hypothesis: No significant difference from 30°C.")