In [None]:
#importing required packages
import pandas as pd
import numpy as np

In [None]:
# 1. Data Ingestion
# Path of the raw weather CSV, relative to the notebook's working directory
file_path = "./src/weather_data.csv"
# Read the whole file into a DataFrame used by every later cell
weather_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Show the first few rows and basic info about the DataFrame to understand its structure and contents.
# A notebook cell only renders the value of its LAST expression, so a bare `weather_df.head()`
# placed before `info()` would be computed and thrown away; display() renders it explicitly.
# (`display` is provided by the IPython/Jupyter kernel, no import needed inside a notebook.)
display(weather_df.head())
# info() prints its summary (column dtypes, non-null counts) directly to the cell output
weather_df.info()

In [None]:
#2. Data Cleaning and Transformation
#2a. Missing values: rows without a date are dropped; gaps in the numeric columns are filled
#    with a per-city statistic (mean temperature, median humidity, mean wind speed).

#Discard rows that have no date
weather_df = weather_df.dropna(subset=['date'])

#Build the per-city grouping once so every imputation below can share it
city_group = weather_df.groupby('city')

#Temperature: fill gaps with the city's mean temperature, then round the column to 2 decimals
city_avg_temp = city_group['temperature_celsius'].transform('mean')
weather_df['temperature_celsius'] = (
    weather_df['temperature_celsius']
    .fillna(value=city_avg_temp)
    .round(decimals=2)
)

#Humidity: fill gaps with the city's median humidity, then round the column to 2 decimals
city_median_humidty = city_group['humidity_percent'].transform('median')
weather_df['humidity_percent'] = (
    weather_df['humidity_percent']
    .fillna(value=city_median_humidty)
    .round(decimals=2)
)

#Wind speed: fill gaps with the city's mean wind speed, then round the column to 2 decimals
city_avg_wind_speed = city_group['wind_speed_kph'].transform('mean')
weather_df['wind_speed_kph'] = (
    weather_df['wind_speed_kph']
    .fillna(value=city_avg_wind_speed)
    .round(decimals=2)
)

In [None]:
#2b. Standardize Dates: Convert the date column to a consistent format (e.g., YYYY-MM-DD).
#Function to standardize date formats

def standardize_date(date_str):
    """Parse one raw date value into a pandas Timestamp.

    Year-first strings (a 4-character first component, e.g. YYYY-MM-DD) are parsed as-is.
    For purely numeric day/month strings the order is decided as follows:
      * first component  > 12 -> day first   (DD/MM/YYYY)
      * second component > 12 -> month first (MM/DD/YYYY)
      * both <= 12 (ambiguous) -> '/' is read as MM/DD/YYYY, '-' and '.' as DD-MM-YYYY
    Strings whose leading components are not both numeric (e.g. "Jan-05-2023") are
    handed to pandas' general parser instead of being rejected.

    Parameters
    ----------
    date_str : object
        Raw cell value; it is converted with ``str()`` and stripped before parsing.

    Returns
    -------
    pandas.Timestamp or NaT
        The parsed date, or ``pd.NaT`` when the value is missing or cannot be parsed.
        Parsing errors are printed, not raised.
    """
    if pd.isna(date_str):
        return pd.NaT

    date_str = str(date_str).strip()

    try:
        # Use the first separator present, checked in the order '/', '-', '.'
        separator = next((sep for sep in ('/', '-', '.') if sep in date_str), None)
        if separator is None:
            # Unknown layout: let pandas try, turning failures into NaT
            return pd.to_datetime(date_str, errors='coerce')

        parts = date_str.split(separator)

        # First component is the year, so the string is already year-first (YYYY-MM-DD)
        if len(parts[0]) == 4:
            return pd.to_datetime(date_str)

        first, second = parts[0].strip(), parts[1].strip()
        if not (first.isdecimal() and second.isdecimal()):
            # A textual component (month name, etc.) cannot go through int(); such strings
            # are not day/month ambiguous, so pandas' own parser can handle them directly.
            return pd.to_datetime(date_str, errors='coerce')

        if int(first) > 12:
            # First component can only be a day: DD/MM/YYYY
            day_first = True
        elif int(second) > 12:
            # Second component can only be a day: MM/DD/YYYY
            day_first = False
        else:
            # Truly ambiguous (both components <= 12): fall back on the separator heuristic.
            # '/' is assumed to be US-style MM/DD/YYYY, '-' and '.' European-style DD-MM-YYYY.
            day_first = separator != '/'

        return pd.to_datetime(date_str, dayfirst=day_first)

    except Exception as e:
        # Best effort: report the bad value and carry on with a missing date
        print(f"Error parsing date '{date_str}' : {e} ")
        return pd.NaT

# Run every raw date value through standardize_date, element by element
weather_df['date'] = weather_df['date'].map(standardize_date)

In [None]:
#2c. Add a New Column: temperature_fahrenheit is derived from temperature_celsius with F = C × 9/5 + 32
# Convert and round to 2 decimals in a single expression
weather_df['temperature_fahrenheit'] = (
    weather_df['temperature_celsius'] * 9/5 + 32
).round(2)

In [None]:
#2d Filter Data: Keep only rows where weather_condition is not "Unknown" or null.
#Normalise the casing to title case so that variants such as RAINY and rainy both become Rainy
weather_df['weather_condition'] = weather_df['weather_condition'].str.title()

#True for rows whose condition is present and is not the "Unknown" placeholder
weather_condition_filter = (
    weather_df['weather_condition'].notna()
    & weather_df['weather_condition'].ne("Unknown")
)
#Keep only the rows that pass the filter
weather_df = weather_df.loc[weather_condition_filter]

#Remaining null count per column after cleaning (shown as the cell output)
weather_df.isna().sum()

In [None]:
#3 Data Output:
#3a. Save the cleaned and transformed data (including the new temperature_fahrenheit column) as a CSV file named “tranformed_weather_data.csv” under “outputs” folder.

import os

# Destination folder; later cells reuse this name for their own output files
output_dir = "outputs"

try:
    # Create the folder only when it is not there yet, and say so
    if os.path.exists(output_dir) is False:
        os.makedirs(output_dir)
        print(f"Created directory : {output_dir}")

    # Write the DataFrame without its index column
    output_file = os.path.join(output_dir, "tranformed_weather_data.csv")
    weather_df.to_csv(output_file, index=False)
    print(f"Successfully saved trasformed data to : {output_file}")

except Exception as e:
    # Best effort: report the failure instead of stopping the notebook
    print(f"Error saving data: {e}")

In [None]:
#3b. Optional: Generate a simple text report (e.g., Markdown or TXT file) listing the top 5 cities with the highest average temperature_celsius.

# Group the CURRENT weather_df rather than reusing `city_group`: that object was built during
# cleaning, before weather_df was re-bound by the weather_condition filter, so it would average
# over rows that are no longer part of the cleaned dataset (and of the record count below).
city_avg_temps = (
    weather_df.groupby('city')['temperature_celsius']
    .mean()
    .round(2)
    .reset_index()
    .sort_values('temperature_celsius', ascending=False)
)

# The date range appears twice in the report; compute it once
report_start = weather_df['date'].min().strftime('%Y-%m-%d')
report_end = weather_df['date'].max().strftime('%Y-%m-%d')

report_content = f"""# Cities Ranked by Average Temperature

Analysis based on data from {report_start} to {report_end}

| Rank | City | Average Temperature (°C) |
|------|------|--------------------------|
"""

# One table row per city, ranked from warmest to coldest (rank starts at 1)
for i, row in enumerate(city_avg_temps.itertuples(), 1):
    report_content += f"| {i} | {row.city} | {row.temperature_celsius}°C |\n"

# Summary section. Markdown list items need a space after the dash to render as a list.
report_content += f"""
\n## Additional Information

- **Total cities analyzed:** {len(city_avg_temps)}
- **Date range:** {report_start} to {report_end}
- **Total weather records:** {len(weather_df)}

**Report generated on {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}**
"""

report_file = os.path.join(output_dir, "city_temperature_ranking.md")

# The report contains non-ASCII text ("°"); write it as UTF-8 explicitly so the result does
# not depend on the platform's default encoding.
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(report_content)

print(f"Report successfully generated at : {report_file}")

In [None]:
#Bonus (Optional): Create a bar chart of average temperature per city using Matplotlib or Seaborn and include the image in the repository.
import matplotlib.pyplot as plt

# One figure with a single axes; everything below is drawn through these two handles
fig, ax = plt.subplots(figsize=(16, 11), dpi=100)
bars = ax.bar(
    city_avg_temps['city'],
    city_avg_temps['temperature_celsius'],
    color='#3498db',
    edgecolor='black',
    linewidth=1,
)

# Axis labels and title
ax.set_xlabel('City', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Temperature (°C)', fontsize=12, fontweight='bold')
ax.set_title('Average Temperature by City', fontsize=16, fontweight='bold', pad=20)

# Print each bar's value just above it, centred horizontally on the bar
for bar in bars:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax.text(bar_centre, bar_top + 0.3, f'{bar_top}°C',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

# Horizontal grid lines, drawn behind the bars
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.set_axisbelow(True)

# Footnote stating the date range covered by the data
first_day = weather_df["date"].min().strftime("%Y-%m-%d")
last_day = weather_df["date"].max().strftime("%Y-%m-%d")
fig.text(0.5, 0.01,
         f'Based on weather data from {first_day} to {last_day}',
         ha='center', fontsize=9, style='italic')

# Tighten the layout while leaving a margin at the bottom for the footnote
fig.tight_layout(pad=2.0, rect=[0, 0.03, 1, 0.97])

chart_path = os.path.join(output_dir, "average_temperature_celsius_by_city.png")
fig.savefig(chart_path, dpi=100, bbox_inches='tight')

print(f"Bar chart saved to {chart_path}")