In [14]:
import pandas as pd
from io import StringIO


In [19]:
# Load the AP maximum-temperature table and clean it into a numeric
# month-by-year frame named `df` (consumed by the cells below).
df = pd.read_csv("./regional_data/AP_Max_temp.csv")

# Drop the '**Average**' summary row. `.copy()` makes the result an
# independent frame, so the column assignments below never write into a
# slice of the raw table (avoids SettingWithCopyWarning).
df = df[df['Month'] != '**Average**'].copy()

# Per-row fallback taken from the 'Average' column. A '-' (or any other
# non-numeric entry) becomes NaN, so rows without a usable average simply
# keep their gaps as NaN.
row_average = pd.to_numeric(df['Average'], errors='coerce')

# For every value column: convert to numeric, then fill the cells that held
# the '-' placeholder with that row's average.
for col in df.columns.drop(['Month', 'Average']):
    is_placeholder = df[col] == '-'
    df[col] = pd.to_numeric(df[col], errors='coerce').mask(is_placeholder, row_average)

# The helper column is no longer needed once the gaps are filled
df = df.drop(columns=['Average'])

df.head(13)

Unnamed: 0,Month,2025,2024,2023,2022,2021
0,January,29.3,28.8,28.6,30.0,29.3
1,February,33.1,32.3,32.0,32.0,31.3
2,March,36.5,37.5,33.5,35.7,36.1
3,April,38.5,41.0,34.7,38.3,37.6
4,May,37.1,40.2,37.8,36.1,36.7
5,June,36.8,36.6,37.3,36.0,36.2
6,July,35.2,35.2,34.3,33.5,35.1
7,August,33.3,35.5,35.3,33.6,34.1
8,September,33.7,34.8,34.2,33.1,34.2
9,October,32.3,33.2,33.7,31.1,32.7


In [20]:
# Reshape from wide (one column per year) to long (one row per Month/Year
# pair) and force the 'Value' column to numeric; unparseable entries become NaN.
df_long = (
    df
    .melt(id_vars=['Month'], var_name='Year', value_name='Value')
    .assign(Value=lambda long_frame: pd.to_numeric(long_frame['Value'], errors='coerce'))
)

df_long.head(60)

Unnamed: 0,Month,Year,Value
0,January,2025,29.3
1,February,2025,33.1
2,March,2025,36.5
3,April,2025,38.5
4,May,2025,37.1
5,June,2025,36.8
6,July,2025,35.2
7,August,2025,33.3
8,September,2025,33.7
9,October,2025,32.3


In [23]:
# Reshape to long format: one row per (Month, Year) pair
df_long = df.melt(id_vars=['Month'], var_name='Year', value_name='Value')

# Convert Value to numeric (NaN will stay)
df_long['Max_Temperature'] = pd.to_numeric(df_long['Value'], errors='coerce')

# Map month names to numbers
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
# Nullable Int64 keeps mapped months as integers even when some label is not
# in month_map. With a plain .map() a single unmapped label turns the column
# into float64, the strings below become "1.0", "2.0", ... and every date
# fails to parse, silently dropping all rows.
df_long['MonthNum'] = df_long['Month'].map(month_map).astype('Int64')

# Create proper datetime (first day of each month); unmapped months render as
# "<NA>" and are coerced to NaT
df_long['Date'] = pd.to_datetime(
    df_long['Year'].astype(str) + '-' + df_long['MonthNum'].astype(str).str.zfill(2) + '-01',
    format='%Y-%m-%d',
    errors='coerce'
)

# Drop rows where Date is NaT (if any) and order chronologically
df_long = df_long.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)

print(df_long[['Date', 'Max_Temperature']])


         Date  Max_Temperature
0  2021-01-01             29.3
1  2021-02-01             31.3
2  2021-03-01             36.1
3  2021-04-01             37.6
4  2021-05-01             36.7
5  2021-06-01             36.2
6  2021-07-01             35.1
7  2021-08-01             34.1
8  2021-09-01             34.2
9  2021-10-01             32.7
10 2021-11-01             28.6
11 2021-12-01             29.6
12 2022-01-01             30.0
13 2022-02-01             32.0
14 2022-03-01             35.7
15 2022-04-01             38.3
16 2022-05-01             36.1
17 2022-06-01             36.0
18 2022-07-01             33.5
19 2022-08-01             33.6
20 2022-09-01             33.1
21 2022-10-01             31.1
22 2022-11-01             28.5
23 2022-12-01             28.3
24 2023-01-01             28.6
25 2023-02-01             32.0
26 2023-03-01             33.5
27 2023-04-01             34.7
28 2023-05-01             37.8
29 2023-06-01             37.3
30 2023-07-01             34.3
31 2023-

In [24]:
import os
import glob

# Calendar month names -> month numbers, used to turn the 'Month' column into dates.
_MONTH_NUMBERS = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}


def _load_weather_parameter(filepath, param_name):
    """
    Read one wide month-by-year CSV and return it as a tidy monthly series.

    Parameters:
    filepath (str): Path of the CSV to read. It must contain a 'Month' column;
        every other column except 'Average' is treated as a year of values.
    param_name (str): Name given to the value column of the result.

    Returns:
    pd.DataFrame: Columns ['Date', param_name], one row per parseable
        (year, month) pair, sorted by Date with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(filepath)

    # Remove the '**Average**' summary row. `.copy()` yields an independent
    # frame, so the column assignments below never write into a slice of `raw`.
    monthly = raw[raw['Month'] != '**Average**'].copy()

    # Per-row fallback used to fill '-' placeholders. A '-' (or any other
    # non-numeric entry) in 'Average' becomes NaN, leaving that row's gaps NaN.
    if 'Average' in monthly.columns:
        row_average = pd.to_numeric(monthly['Average'], errors='coerce')
        monthly = monthly.drop(columns=['Average'])
    else:
        row_average = pd.Series(float('nan'), index=monthly.index)

    # Convert each value column to numeric and fill the cells that held the
    # '-' placeholder with the row's average.
    for col in monthly.columns.drop('Month'):
        is_placeholder = monthly[col] == '-'
        monthly[col] = pd.to_numeric(monthly[col], errors='coerce').mask(is_placeholder, row_average)

    # Wide -> long: one row per (Month, Year) pair
    long_frame = monthly.melt(id_vars=['Month'], var_name='Year', value_name=param_name)

    # Only keep rows whose month label is a known calendar month. Filtering
    # before the integer cast matters: a single unmapped label would otherwise
    # make the mapped column float64, render months as "1.0", "2.0", ... and
    # cause every date to fail parsing, silently discarding the whole file.
    month_number = long_frame['Month'].map(_MONTH_NUMBERS)
    known_month = month_number.notna()

    date_text = (
        long_frame.loc[known_month, 'Year'].astype(str)
        + '-'
        + month_number[known_month].astype(int).astype(str).str.zfill(2)
        + '-01'
    )
    # Non-year column headers produce unparseable text and are coerced to NaT
    dates = pd.to_datetime(date_text, format='%Y-%m-%d', errors='coerce')

    series = pd.DataFrame({'Date': dates, param_name: long_frame.loc[known_month, param_name]})
    return series.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)


def process_region_weather_data(region_code):
    """
    Process all weather CSV files for a given region and create a combined time series.

    For each weather parameter the file ./regional_data/<region_code>_<suffix>.csv
    is read if it exists (a warning is printed and the parameter skipped
    otherwise), '-' placeholders are filled with the row's 'Average' value,
    and the table is reshaped to one row per month. The per-parameter series
    are then outer-merged on 'Date'.

    Parameters:
    region_code (str): Region code (e.g., 'AP', 'KA', 'KL', 'TL', 'TN')

    Returns:
    pd.DataFrame: Combined time series sorted by 'Date', with one column per
        parameter that was found (Max_Temp, Min_Temp, Humidity, Wind_Speed).
        An empty DataFrame is returned when none of the files exist.
    """

    # Define the weather parameters and their corresponding file patterns.
    # NOTE: 'Humidify' is the spelling used by the data files, not a typo here.
    weather_params = {
        'Max_Temp': f'{region_code}_Max_temp.csv',
        'Min_Temp': f'{region_code}_Min_temp.csv',
        'Humidity': f'{region_code}_Humidify.csv',
        'Wind_Speed': f'{region_code}_Wind_Speed.csv'
    }

    # Dictionary to store processed data for each parameter
    processed_data = {}

    # Process each weather parameter
    for param_name, filename in weather_params.items():
        filepath = f"./regional_data/{filename}"

        if not os.path.exists(filepath):
            print(f"Warning: File {filepath} not found. Skipping {param_name}.")
            continue

        print(f"Processing {param_name} from {filename}...")

        param_series = _load_weather_parameter(filepath, param_name)
        processed_data[param_name] = param_series

        print(f"  - {param_name}: {len(param_series)} records from {param_series['Date'].min()} to {param_series['Date'].max()}")

    # Combine all weather parameters into a single dataframe
    if not processed_data:
        print(f"No data found for region {region_code}")
        return pd.DataFrame()

    # Start with the first parameter's data and outer-merge the rest so a date
    # present in only some files is still kept (missing values become NaN)
    frames = list(processed_data.values())
    combined_df = frames[0]
    for param_df in frames[1:]:
        combined_df = combined_df.merge(param_df, on='Date', how='outer')

    # Sort by date and reset index
    combined_df = combined_df.sort_values('Date').reset_index(drop=True)

    print(f"\nCombined dataset for {region_code}:")
    print(f"  - Total records: {len(combined_df)}")
    print(f"  - Date range: {combined_df['Date'].min()} to {combined_df['Date'].max()}")
    print(f"  - Columns: {list(combined_df.columns)}")

    return combined_df

# Smoke-test the pipeline on a single region: Andhra Pradesh (AP)
ap_weather_data = process_region_weather_data('AP')
print("\nFirst 10 rows of AP weather data:", ap_weather_data.head(10), sep="\n")


Processing Max_Temp from AP_Max_temp.csv...
  - Max_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Min_Temp from AP_Min_temp.csv...
  - Min_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Humidity from AP_Humidify.csv...
  - Humidity: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Wind_Speed from AP_Wind_Speed.csv...
  - Wind_Speed: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00

Combined dataset for AP:
  - Total records: 60
  - Date range: 2021-01-01 00:00:00 to 2025-12-01 00:00:00
  - Columns: ['Date', 'Max_Temp', 'Min_Temp', 'Humidity', 'Wind_Speed']

First 10 rows of AP weather data:
        Date  Max_Temp  Min_Temp  Humidity  Wind_Speed
0 2021-01-01      29.3      22.0      74.0         8.0
1 2021-02-01      31.3      20.0      64.0         9.0
2 2021-03-01      36.1      23.0      53.0         9.0
3 2021-04-01      37.6      27.0      54.0         9.0
4 2021-05-01      36.7      29.0     

In [25]:
# Run the pipeline for every available region, keeping only regions that
# actually produced data.
regions = ['AP', 'KA', 'KL', 'TL', 'TN']  # Available regions
all_region_data = {}

print("Processing weather data for all regions...\n")

for region_code in regions:
    print(f"=== Processing {region_code} ===")
    region_frame = process_region_weather_data(region_code)
    if not region_frame.empty:
        all_region_data[region_code] = region_frame
    print()

# Per-region overview: row count, covered period and missing-value share
# of each weather column that is present.
weather_columns = ['Max_Temp', 'Min_Temp', 'Humidity', 'Wind_Speed']

print("=== SUMMARY STATISTICS ===")
for region_code, region_frame in all_region_data.items():
    n_records = len(region_frame)
    print(f"\n{region_code} Region:")
    print(f"  Records: {n_records}")
    first_month = region_frame['Date'].min().strftime('%Y-%m')
    last_month = region_frame['Date'].max().strftime('%Y-%m')
    print(f"  Date range: {first_month} to {last_month}")
    print("  Missing values:")
    for column in weather_columns:
        if column not in region_frame.columns:
            continue
        n_missing = region_frame[column].isna().sum()
        print(f"    {column}: {n_missing} ({n_missing/n_records*100:.1f}%)")


Processing weather data for all regions...

=== Processing AP ===
Processing Max_Temp from AP_Max_temp.csv...
  - Max_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Min_Temp from AP_Min_temp.csv...
  - Min_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Humidity from AP_Humidify.csv...
  - Humidity: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Wind_Speed from AP_Wind_Speed.csv...
  - Wind_Speed: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00

Combined dataset for AP:
  - Total records: 60
  - Date range: 2021-01-01 00:00:00 to 2025-12-01 00:00:00
  - Columns: ['Date', 'Max_Temp', 'Min_Temp', 'Humidity', 'Wind_Speed']

=== Processing KA ===
Processing Max_Temp from KA_Max_temp.csv...
  - Max_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Min_Temp from KA_Min_temp.csv...
  - Min_Temp: 60 records from 2021-01-01 00:00:00 to 2025-12-01 00:00:00
Processing Humidity from

In [26]:
# Persist each region's combined time series as a CSV file
import os

# Target folder for the exports; created on demand
output_dir = "./outputs/processed_weather_data"
os.makedirs(output_dir, exist_ok=True)

print("Saving processed weather data...")

for region_code, region_frame in all_region_data.items():
    csv_path = f"{output_dir}/{region_code}_weather_timeseries.csv"
    region_frame.to_csv(csv_path, index=False)
    print(f"  - Saved {region_code} data to {csv_path}")

print(f"\nAll processed data saved to {output_dir}/")

# Show a sample of the combined AP data together with dtypes and summary stats
print("\n=== SAMPLE DATA (AP Region) ===")
if 'AP' in all_region_data:
    ap_data = all_region_data['AP']
    print(ap_data.head(15))

    for heading, report in (
        ("\nData types:", ap_data.dtypes),
        ("\nBasic statistics:", ap_data.describe()),
    ):
        print(heading)
        print(report)


Saving processed weather data...
  - Saved AP data to ./outputs/processed_weather_data/AP_weather_timeseries.csv
  - Saved KA data to ./outputs/processed_weather_data/KA_weather_timeseries.csv
  - Saved KL data to ./outputs/processed_weather_data/KL_weather_timeseries.csv
  - Saved TL data to ./outputs/processed_weather_data/TL_weather_timeseries.csv
  - Saved TN data to ./outputs/processed_weather_data/TN_weather_timeseries.csv

All processed data saved to ./outputs/processed_weather_data/

=== SAMPLE DATA (AP Region) ===
         Date  Max_Temp  Min_Temp  Humidity  Wind_Speed
0  2021-01-01      29.3      22.0      74.0         8.0
1  2021-02-01      31.3      20.0      64.0         9.0
2  2021-03-01      36.1      23.0      53.0         9.0
3  2021-04-01      37.6      27.0      54.0         9.0
4  2021-05-01      36.7      29.0      55.0         8.0
5  2021-06-01      36.2      28.0      53.0         9.0
6  2021-07-01      35.1      28.0      64.0         9.0
7  2021-08-01      34.1

In [None]:
# Create a simple visualization of the weather data
import matplotlib.pyplot as plt

# Plot weather data for AP region as an example
if 'AP' in all_region_data:
    ap_data = all_region_data['AP']

    # One subplot per weather parameter, laid out 2 x 2
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Weather Time Series - Andhra Pradesh (AP)', fontsize=16)

    # (column, line colour, panel title, y-axis label), in row-major panel order
    panels = [
        ('Max_Temp', 'r', 'Maximum Temperature (°C)', 'Temperature (°C)'),
        ('Min_Temp', 'b', 'Minimum Temperature (°C)', 'Temperature (°C)'),
        ('Humidity', 'g', 'Humidity (%)', 'Humidity (%)'),
        ('Wind_Speed', 'orange', 'Wind Speed (km/h)', 'Wind Speed (km/h)'),
    ]

    for ax, (column, color, title, ylabel) in zip(axes.flat, panels):
        ax.plot(ap_data['Date'], ap_data[column], color=color, linewidth=1)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)
        # Rotate x-axis labels for better readability
        ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()

    # Save BEFORE showing: once plt.show() returns the figure may already be
    # closed (inline backend), so a later pyplot-level savefig would write a
    # blank image. Saving through `fig` also avoids relying on the
    # "current figure" state.
    plot_path = f"{output_dir}/AP_weather_timeseries_plot.png"
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Plot saved to {plot_path}")

    plt.show()
else:
    print("No AP data available for plotting")
