In [3]:
import pandas as pd

file_path = "Documents/r115_07tb2.txt"

# Step 1: Read all lines (including junk)
df = pd.read_csv(file_path,
                 sep=r'\s+',
                 header=None,
                 names=["INDEX", "YEAR", "MONTH", "DAY", "MAX", "MIN", "RAINFALL"],
                 skiprows=8, 
                 engine='python',
                 on_bad_lines='skip')

# Step 2: Remove lines with junk headers or dashes
df = df[~df["INDEX"].astype(str).str.contains("^-+|^INDEX", regex=True)]

# Step 3: Convert columns to appropriate numeric types
df[["INDEX", "YEAR", "MONTH", "DAY"]] = df[["INDEX", "YEAR", "MONTH", "DAY"]].apply(pd.to_numeric, errors='coerce')
df[["MAX", "MIN", "RAINFALL"]] = df[["MAX", "MIN", "RAINFALL"]].apply(pd.to_numeric, errors='coerce')

# Step 4: Drop rows missing date-related fields
df = df.dropna(subset=["INDEX", "YEAR", "MONTH", "DAY"])

# Step 5: Filter for station 42410
df_42410 = df[df["INDEX"] == 42410].copy()

# Step 6: Reset index to start from 1
df_42410.reset_index(drop=True, inplace=True)
df_42410.index = df_42410.index + 1

# Show the cleaned DataFrame
df_42410


Unnamed: 0,INDEX,YEAR,MONTH,DAY,MAX,MIN,RAINFALL
1,42410.0,1970.0,1.0,1,23.6,8.5,0.0
2,42410.0,1970.0,1.0,2,23.3,8.0,0.0
3,42410.0,1970.0,1.0,3,23.7,7.9,0.0
4,42410.0,1970.0,1.0,4,23.7,8.4,0.0
5,42410.0,1970.0,1.0,5,23.4,9.2,0.0
...,...,...,...,...,...,...,...
18350,42410.0,2020.0,6.0,26,29.0,23.8,33.1
18351,42410.0,2020.0,6.0,27,28.1,23.2,36.0
18352,42410.0,2020.0,6.0,28,27.5,23.0,33.0
18353,42410.0,2020.0,6.0,29,32.0,25.4,1.2


In [5]:
# Step 1: Create daily averages from available data
daily_avg = (
    df_42410.groupby(["MONTH", "DAY"])[["MAX", "MIN", "RAINFALL"]]
    .mean()
    .reset_index()
    .rename(columns={
        "MAX": "MAX_IMPUTED",
        "MIN": "MIN_IMPUTED",
        "RAINFALL": "RAINFALL_IMPUTED"
    })
)

# Step 2: Merge with full dataset
df_imputed = df_42410.merge(daily_avg, on=["MONTH", "DAY"], how="left")

# Step 3: Impute missing values
df_imputed["MAX"] = df_imputed["MAX"].fillna(df_imputed["MAX_IMPUTED"])
df_imputed["MIN"] = df_imputed["MIN"].fillna(df_imputed["MIN_IMPUTED"])
df_imputed["RAINFALL"] = df_imputed["RAINFALL"].fillna(df_imputed["RAINFALL_IMPUTED"])

# Step 4: Drop helper columns
df_imputed.drop(columns=["MAX_IMPUTED", "MIN_IMPUTED", "RAINFALL_IMPUTED"], inplace=True)

# Optional: Reset index
df_imputed.sort_values(["YEAR", "MONTH", "DAY"], inplace=True)
df_imputed.reset_index(drop=True, inplace=True)

# Optional: Check if any missing values remain
print(df_imputed.isna().sum())


INDEX       0
YEAR        0
MONTH       0
DAY         0
MAX         0
MIN         0
RAINFALL    0
dtype: int64


In [7]:
# Step 1: Create "YEAR_MONTH" column
df_imputed["YEAR_MONTH"] = (
    df_imputed["YEAR"].astype(int).astype(str) + "-" +
    df_imputed["MONTH"].astype(int).astype(str).str.zfill(2)
)

# Step 2: Group by YEAR_MONTH
monthly_agg = df_imputed.groupby("YEAR_MONTH").agg({
    "MAX": "mean",       # Average Max Temp
    "MIN": "mean",       # Average Min Temp
    "RAINFALL": "sum"    # Cumulative Rainfall
})

# Step 3: Round values
monthly_agg = monthly_agg.round(2)

# Step 4: Reset index to bring YEAR_MONTH as a column
monthly_agg = monthly_agg.reset_index()

# Step 5: Start index from 1
monthly_agg.index = monthly_agg.index + 1

# Display all rows
pd.set_option("display.max_rows", None)
monthly_agg


Unnamed: 0,YEAR_MONTH,MAX,MIN,RAINFALL
1,1970-01,22.77,9.88,21.9
2,1970-02,26.05,11.34,23.7
3,1970-03,29.99,15.62,22.6
4,1970-04,31.86,20.41,60.5
5,1970-05,32.09,22.77,136.4
6,1970-06,30.74,25.05,438.5
7,1970-07,31.56,25.62,422.9
8,1970-08,29.69,25.68,318.27
9,1970-09,31.52,24.96,155.7
10,1970-10,29.58,22.24,161.1


In [9]:
monthly_agg.to_csv("monthly_agg.csv")

In [11]:
import numpy as np
import pandas as pd
from pymannkendall import original_test 
from statsmodels.tsa.stattools import acf

def prewhiten(series):
    """
    Prewhiten the series to remove lag-1 autocorrelation.
    Uses simple AR(1) model correction.
    """
    r1 = acf(series, nlags=1, fft=False)[1]
    if abs(r1) < 0.1:
        # No significant autocorr, return original
        return series, r1
    
    # Prewhitened series
    series_pw = series[1:] - r1 * series[:-1].values
    return series_pw, r1

def mk_with_autocorr_check(series):
    """
    Perform MK test with autocorrelation check and prewhitening if needed.
    Returns test result, lag-1 autocorrelation, and method used.
    """
    # Drop NA
    series = series.dropna()
    
    if len(series) < 10:
        # Not enough data to run MK test reliably
        return None, np.nan, "Insufficient data"
    
    # Calculate lag-1 autocorrelation
    r1 = acf(series, nlags=1, fft=False)[1]
    
    # Threshold for significant autocorrelation (common choice)
    threshold = 0.2
    
    if abs(r1) > threshold:
        # Prewhitening needed
        series_pw, r1_pw = prewhiten(series)
        if len(series_pw) < 10:
            return None, r1, "Prewhitened series too short"
        result = original_test(series_pw)
        method = "Modified MK"
    else:
        # No prewhitening
        result = original_test(series)
        method = "Original MK"
    
    return result, r1, method

def apply_mk_test_with_autocorr(group):
    data = group[['MAX', 'MIN', 'RAINFALL']]
    results = []
    month_num = group.name
    for var in data.columns:
        series = data[var]
        result, r1, method = mk_with_autocorr_check(series)
        if result is None:
            # Skip or mark no result due to data insufficiency
            trend = "Insufficient data"
            p_value = np.nan
            slope = np.nan
        else:
            trend = result.trend
            p_value = result.p
            slope = result.slope
        
        results.append({
            'Month': month_num,
            'Month_Name': pd.Timestamp(month=month_num, day=1, year=2000).strftime('%B'),
            'Variable': var,
            'Trend': trend,
            'p_value': p_value,
            'Sen_slope': slope,
            'Autocorr_lag1': r1,
            'Method_used': method
        })
    return pd.DataFrame(results)

# Prepare monthly_agg for grouping by extracting month number
monthly_agg['Month'] = monthly_agg['YEAR_MONTH'].str.split('-').str[1].astype(int)

# Group by Month (1 to 12) and apply the MK test with autocorr check
results_df = monthly_agg.groupby('Month').apply(apply_mk_test_with_autocorr).reset_index(drop=True)

# Show results
print(results_df)



    Month Month_Name  Variable       Trend   p_value  Sen_slope  \
0       1    January       MAX  increasing  0.010505   0.027959   
1       1    January       MIN    no trend  0.371557   0.008367   
2       1    January  RAINFALL    no trend  0.201821  -0.061714   
3       2   February       MAX  increasing  0.020960   0.036861   
4       2   February       MIN  increasing  0.041248   0.025804   
5       2   February  RAINFALL    no trend  0.251931  -0.165000   
6       3      March       MAX    no trend  0.055396   0.024167   
7       3      March       MIN  increasing  0.001564   0.027143   
8       3      March  RAINFALL    no trend  0.827829   0.040000   
9       4      April       MAX    no trend  0.302217   0.013478   
10      4      April       MIN    no trend  0.449945   0.007391   
11      4      April  RAINFALL    no trend  0.152858   1.235294   
12      5        May       MAX  increasing  0.028409   0.024203   
13      5        May       MIN    no trend  0.234908   0.00651

  results_df = monthly_agg.groupby('Month').apply(apply_mk_test_with_autocorr).reset_index(drop=True)


In [12]:
def format_significance(p):
    if pd.isna(p):
        return 'NS'
    if p < 0.001:
        return '< 0.001'
    elif p < 0.01:
        return '1%'
    elif p < 0.05:
        return '5%'
    elif p < 0.10:
        return '10%'
    else:
        return 'NS'

def summarize_mk_results(results_df, var):
    # Filter only rows for this variable
    df_var = results_df[results_df['Variable'] == var].copy()
    
    # Prepare dataframe with desired columns and renaming
    summary = pd.DataFrame({
        'Month_Name': df_var['Month_Name'],
        f'{var}_MKz': df_var['Sen_slope'] / (df_var['Sen_slope'].std() if df_var['Sen_slope'].std() != 0 else 1),  # rough z-value proxy
        f'{var}_sig': df_var['p_value'].apply(format_significance),
        f'{var}_slope': df_var['Sen_slope'].round(4),
        f'{var}_RC': df_var['Autocorr_lag1'].round(2)
    })
    
    # Sort by Month number ascending (January to December)
    month_order = pd.to_datetime(summary['Month_Name'], format='%B').dt.month
    summary['Month_Order'] = month_order
    summary = summary.sort_values('Month_Order').drop(columns=['Month_Order']).reset_index(drop=True)
    
    return summary

# Create separate tables
max_summary = summarize_mk_results(results_df, 'MAX')
min_summary = summarize_mk_results(results_df, 'MIN')
rainfall_summary = summarize_mk_results(results_df, 'RAINFALL')

print('--- MAX Trend ---')
print(max_summary.to_string(index=False))

print('\n--- MIN Trend ---')
print(min_summary.to_string(index=False))

print('\n--- RAINFALL Trend ---')
print(rainfall_summary.to_string(index=False))


--- MAX Trend ---
Month_Name  MAX_MKz MAX_sig  MAX_slope  MAX_RC
   January 4.890314      5%     0.0280    0.17
  February 6.447282      5%     0.0369    0.20
     March 4.226969     10%     0.0242    0.04
     April 2.357470      NS     0.0135    0.02
       May 4.233286      5%     0.0242    0.30
      June 5.349246      1%     0.0306    0.27
      July 4.988215      1%     0.0285    0.47
    August 3.666707      1%     0.0210    0.46
 September 4.046832      1%     0.0231    0.48
   October 4.636320      5%     0.0265    0.49
  November 5.095640      1%     0.0291    0.41
  December 4.202349      1%     0.0240    0.39

--- MIN Trend ---
Month_Name  MIN_MKz MIN_sig  MIN_slope  MIN_RC
   January 1.194959      NS     0.0084    0.20
  February 3.685114      5%     0.0258    0.21
     March 3.876331      1%     0.0271    0.17
     April 1.055568      NS     0.0074   -0.18
       May 0.929775      NS     0.0065    0.44
      June 1.871678      5%     0.0131    0.30
      July 2.273045    

In [13]:
# Export to CSV
max_summary.to_csv("max_trend_summary.csv", index=False)
min_summary.to_csv("min_trend_summary.csv", index=False)
rainfall_summary.to_csv("rainfall_trend_summary.csv", index=False)

