In [2]:
import pandas as pd

file_path = "Documents/r115_07tb2.txt"

# Step 1: Read all lines (including junk)
df = pd.read_csv(file_path,
                 sep=r'\s+',
                 header=None,
                 names=["INDEX", "YEAR", "MONTH", "DAY", "MAX", "MIN", "RAINFALL"],
                 skiprows=8, 
                 engine='python',
                 on_bad_lines='skip')

# Step 2: Remove lines with junk headers or dashes
df = df[~df["INDEX"].astype(str).str.contains("^-+|^INDEX", regex=True)]

# Step 3: Convert columns to appropriate numeric types
df[["INDEX", "YEAR", "MONTH", "DAY"]] = df[["INDEX", "YEAR", "MONTH", "DAY"]].apply(pd.to_numeric, errors='coerce')
df[["MAX", "MIN", "RAINFALL"]] = df[["MAX", "MIN", "RAINFALL"]].apply(pd.to_numeric, errors='coerce')

# Step 4: Drop rows missing date-related fields
df = df.dropna(subset=["INDEX", "YEAR", "MONTH", "DAY"])

# Step 5: Filter for station 42492
df_42492 = df[df["INDEX"] == 42492].copy()

# Step 6: Reset index to start from 1
df_42492.reset_index(drop=True, inplace=True)
df_42492.index = df_42492.index + 1

# Show the cleaned DataFrame for station 42492
df_42492


Unnamed: 0,INDEX,YEAR,MONTH,DAY,MAX,MIN,RAINFALL
1,42492.0,1970.0,1.0,1,24.2,10.6,0.0
2,42492.0,1970.0,1.0,2,23.3,8.7,0.0
3,42492.0,1970.0,1.0,3,23.4,8.4,0.0
4,42492.0,1970.0,1.0,4,23.0,8.2,0.0
5,42492.0,1970.0,1.0,5,23.9,9.2,0.0
...,...,...,...,...,...,...,...
14371,42492.0,2019.0,8.0,27,34.8,27.4,1.2
14372,42492.0,2019.0,8.0,28,36.0,28.0,0.0
14373,42492.0,2019.0,8.0,29,35.0,29.2,0.0
14374,42492.0,2019.0,8.0,30,36.0,27.5,16.8


In [4]:
# Step 1: Create daily averages from available data for station 42492
daily_avg_42492 = (
    df_42492.groupby(["MONTH", "DAY"])[["MAX", "MIN", "RAINFALL"]]
    .mean()
    .reset_index()
    .rename(columns={
        "MAX": "MAX_IMPUTED",
        "MIN": "MIN_IMPUTED",
        "RAINFALL": "RAINFALL_IMPUTED"
    })
)

# Step 2: Merge with full dataset
df_imputed_42492 = df_42492.merge(daily_avg_42492, on=["MONTH", "DAY"], how="left")

# Step 3: Impute missing values using daily averages
df_imputed_42492["MAX"] = df_imputed_42492["MAX"].fillna(df_imputed_42492["MAX_IMPUTED"])
df_imputed_42492["MIN"] = df_imputed_42492["MIN"].fillna(df_imputed_42492["MIN_IMPUTED"])
df_imputed_42492["RAINFALL"] = df_imputed_42492["RAINFALL"].fillna(df_imputed_42492["RAINFALL_IMPUTED"])

# Step 4: Drop helper columns
df_imputed_42492.drop(columns=["MAX_IMPUTED", "MIN_IMPUTED", "RAINFALL_IMPUTED"], inplace=True)

# Optional: Reset index for neatness
df_imputed_42492.sort_values(["YEAR", "MONTH", "DAY"], inplace=True)
df_imputed_42492.reset_index(drop=True, inplace=True)

# Optional: Check if any missing values remain
print(df_imputed_42492.isna().sum())


INDEX       0
YEAR        0
MONTH       0
DAY         0
MAX         0
MIN         0
RAINFALL    0
dtype: int64


In [12]:
# Step 1: Create "YEAR_MONTH" column
df_imputed_42492["YEAR_MONTH"] = (
    df_imputed_42492["YEAR"].astype(int).astype(str) + "-" +
    df_imputed_42492["MONTH"].astype(int).astype(str).str.zfill(2)
)

# Step 2: Group by YEAR_MONTH and aggregate
monthly_agg_42492 = df_imputed_42492.groupby("YEAR_MONTH").agg({
    "MAX": "mean",       # Average Max Temp
    "MIN": "mean",       # Average Min Temp
    "RAINFALL": "sum"    # Cumulative Rainfall
})

# Step 3: Round values to 2 decimals
monthly_agg_42492 = monthly_agg_42492.round(2)

# Step 4: Reset index to make YEAR_MONTH a column
monthly_agg_42492 = monthly_agg_42492.reset_index()

# Step 5: Set index to start from 1
monthly_agg_42492.index = monthly_agg_42492.index + 1

# Display all rows without truncation
pd.set_option("display.max_rows", None)
monthly_agg_42492


Unnamed: 0,YEAR_MONTH,MAX,MIN,RAINFALL
1,1970-01,22.7,10.09,20.8
2,1970-02,24.96,11.1,10.0
3,1970-03,31.18,16.61,32.48
4,1970-04,37.45,22.81,1.2
5,1970-05,38.49,25.83,111.0
6,1970-06,34.62,26.71,83.2
7,1970-07,32.98,26.7,353.3
8,1970-08,32.65,26.62,225.8
9,1970-09,31.75,25.76,225.1
10,1970-10,31.37,22.52,31.5


In [14]:
import pandas as pd

# === Step 1: Read the ICRISAT monthly data files ===
max_df = pd.read_csv("Documents/ICRISAT-District Level Data-3.csv")
min_df = pd.read_csv("Documents/ICRISAT-District Level Data-4.csv")
rain_df = pd.read_csv("Documents/ICRISAT-District Level Data-5.csv")  # Adjust filename if needed

# Merge MAX, MIN, and RAINFALL data
merged_temp = pd.merge(max_df, min_df, on=["Dist Code", "Year", "State Code", "State Name", "Dist Name"], how="inner")
merged_all = pd.merge(merged_temp, rain_df, on=["Dist Code", "Year", "State Code", "State Name", "Dist Name"], how="inner")

# === Step 2: Reshape the merged ICRISAT data to long format ===
def melt_icrisat(df, value_type):
    month_map = {
        "JANUARY": "01", "FEBRUARY": "02", "MARCH": "03", "APRIL": "04",
        "MAY": "05", "JUNE": "06", "JULY": "07", "AUGUST": "08",
        "SEPTEMBER": "09", "OCTOBER": "10", "NOVEMBER": "11", "DECEMBER": "12"
    }
    id_vars = ["Year"]
    value_vars = [col for col in df.columns if value_type in col]
    df_melted = df.melt(id_vars=id_vars, value_vars=value_vars, var_name="Month", value_name=value_type)
    df_melted["Month"] = df_melted["Month"].str.extract(r"(\w+)")
    df_melted["MONTH"] = df_melted["Month"].map(month_map)
    df_melted["YEAR_MONTH"] = df_melted["Year"].astype(str) + "-" + df_melted["MONTH"]
    return df_melted[["YEAR_MONTH", value_type]]

# Create long-form DataFrames
max_long = melt_icrisat(merged_all, "MAXIMUM")
min_long = melt_icrisat(merged_all, "MINIMUM")
rain_long = melt_icrisat(merged_all, "PERCIPITATION")

# Merge all three into one long DataFrame
icrisat_long = max_long.merge(min_long, on="YEAR_MONTH").merge(rain_long, on="YEAR_MONTH")
icrisat_long.rename(columns={
    "MAXIMUM": "MAX",
    "MINIMUM": "MIN",
    "PERCIPITATION": "RAINFALL"
}, inplace=True)

# Round values
icrisat_long = icrisat_long.round(2)

# === Step 3: Merge ICRISAT data with your original monthly dataset ===
monthly_merged = pd.concat([monthly_agg_42492, icrisat_long], ignore_index=True)
monthly_merged.drop_duplicates(subset=["YEAR_MONTH"], keep="first", inplace=True)
monthly_merged = monthly_merged.sort_values("YEAR_MONTH").reset_index(drop=True)
monthly_merged.index = monthly_merged.index + 1

# Final dataset
monthly_merged



Unnamed: 0,YEAR_MONTH,MAX,MIN,RAINFALL
1,1970-01,22.7,10.09,20.8
2,1970-02,24.96,11.1,10.0
3,1970-03,31.18,16.61,32.48
4,1970-04,37.45,22.81,1.2
5,1970-05,38.49,25.83,111.0
6,1970-06,34.62,26.71,83.2
7,1970-07,32.98,26.7,353.3
8,1970-08,32.65,26.62,225.8
9,1970-09,31.75,25.76,225.1
10,1970-10,31.37,22.52,31.5


In [16]:
monthly_merged.to_csv("monthly_merged.csv")

In [18]:
import numpy as np
import pandas as pd
from pymannkendall import original_test 
from statsmodels.tsa.stattools import acf

def prewhiten(series):
    r1 = acf(series, nlags=1, fft=False)[1]
    if abs(r1) < 0.1:
        return series, r1
    series_pw = series[1:] - r1 * series[:-1].values
    return series_pw, r1

def mk_with_autocorr_check(series):
    series = series.dropna()
    if len(series) < 10:
        return None, np.nan, "Insufficient data"
    r1 = acf(series, nlags=1, fft=False)[1]
    threshold = 0.2
    if abs(r1) > threshold:
        series_pw, r1_pw = prewhiten(series)
        if len(series_pw) < 10:
            return None, r1, "Prewhitened series too short"
        result = original_test(series_pw)
        method = "Modified MK"
    else:
        result = original_test(series)
        method = "Original MK"
    return result, r1, method

def apply_mk_test_with_autocorr(group):
    data = group[['MAX', 'MIN', 'RAINFALL']]
    results = []
    month_num = group.name
    for var in data.columns:
        series = data[var]
        result, r1, method = mk_with_autocorr_check(series)
        if result is None:
            trend = "Insufficient data"
            p_value = np.nan
            slope = np.nan
        else:
            trend = result.trend
            p_value = result.p
            slope = result.slope
        
        results.append({
            'Month': month_num,
            'Month_Name': pd.Timestamp(month=month_num, day=1, year=2000).strftime('%B'),
            'Variable': var,
            'Trend': trend,
            'p_value': p_value,
            'Sen_slope': slope,
            'Autocorr_lag1': r1,
            'Method_used': method
        })
    return pd.DataFrame(results)

# Extract month number from YEAR_MONTH in monthly_merged
monthly_merged['Month'] = monthly_merged['YEAR_MONTH'].str.split('-').str[1].astype(int)

# Apply Mann-Kendall test with autocorrelation check
results_df = monthly_merged.groupby('Month').apply(apply_mk_test_with_autocorr).reset_index(drop=True)

print(results_df)


    Month Month_Name  Variable       Trend       p_value  Sen_slope  \
0       1    January       MAX  decreasing  1.619263e-02  -0.024400   
1       1    January       MIN    no trend  6.284406e-02   0.024000   
2       1    January  RAINFALL    no trend  8.773170e-01   0.014286   
3       2   February       MAX    no trend  2.588875e-01   0.017059   
4       2   February       MIN  increasing  1.328644e-02   0.035323   
5       2   February  RAINFALL    no trend  9.333358e-01   0.005836   
6       3      March       MAX    no trend  1.000000e+00   0.000000   
7       3      March       MIN  increasing  8.535412e-07   0.051852   
8       3      March  RAINFALL    no trend  8.962815e-01   0.000000   
9       4      April       MAX    no trend  5.469946e-01  -0.010556   
10      4      April       MIN  increasing  1.570665e-05   0.056808   
11      4      April  RAINFALL    no trend  4.351112e-01   0.031667   
12      5        May       MAX    no trend  8.187826e-02   0.029462   
13    

  results_df = monthly_merged.groupby('Month').apply(apply_mk_test_with_autocorr).reset_index(drop=True)


In [19]:
def format_significance(p):
    if pd.isna(p):
        return 'NS'
    if p < 0.001:
        return '< 0.001'
    elif p < 0.01:
        return '1%'
    elif p < 0.05:
        return '5%'
    elif p < 0.10:
        return '10%'
    else:
        return 'NS'

def summarize_mk_results(results_df, var):
    # Filter only rows for this variable
    df_var = results_df[results_df['Variable'] == var].copy()
    
    # Prepare dataframe with desired columns and renaming
    summary = pd.DataFrame({
        'Month_Name': df_var['Month_Name'],
        f'{var}_MKz': df_var['Sen_slope'] / (df_var['Sen_slope'].std() if df_var['Sen_slope'].std() != 0 else 1),  # rough z-value proxy
        f'{var}_sig': df_var['p_value'].apply(format_significance),
        f'{var}_slope': df_var['Sen_slope'].round(4),
        f'{var}_RC': df_var['Autocorr_lag1'].round(2)
    })
    
    # Sort by Month number ascending (January to December)
    month_order = pd.to_datetime(summary['Month_Name'], format='%B').dt.month
    summary['Month_Order'] = month_order
    summary = summary.sort_values('Month_Order').drop(columns=['Month_Order']).reset_index(drop=True)
    
    return summary

# Use MK test results from monthly_merged dataset stored in results_df
max_summary = summarize_mk_results(results_df, 'MAX')
min_summary = summarize_mk_results(results_df, 'MIN')
rainfall_summary = summarize_mk_results(results_df, 'RAINFALL')

print('--- MAX Trend ---')
print(max_summary.to_string(index=False))

print('\n--- MIN Trend ---')
print(min_summary.to_string(index=False))

print('\n--- RAINFALL Trend ---')
print(rainfall_summary.to_string(index=False))


--- MAX Trend ---
Month_Name   MAX_MKz MAX_sig  MAX_slope  MAX_RC
   January -1.580163      5%    -0.0244   -0.05
  February  1.104742      NS     0.0171    0.02
     March  0.000000      NS     0.0000   -0.08
     April -0.683586      NS    -0.0106   -0.17
       May  1.908002     10%     0.0295    0.35
      June  1.511084      NS     0.0233   -0.12
      July  0.422991      NS     0.0065    0.24
    August  0.219063      NS     0.0034    0.26
 September  0.505450      NS     0.0078    0.12
   October -0.283328      NS    -0.0044    0.05
  November -0.208160      NS    -0.0032    0.10
  December -0.809510     10%    -0.0125    0.12

--- MIN Trend ---
Month_Name  MIN_MKz MIN_sig  MIN_slope  MIN_RC
   January 1.661160     10%     0.0240    0.11
  February 2.444915      5%     0.0353    0.37
     March 3.588925 < 0.001     0.0519    0.16
     April 3.931995 < 0.001     0.0568    0.36
       May 2.533152      1%     0.0366    0.31
      June 1.845733      1%     0.0267    0.15
      July

In [25]:
# Export each summary to a CSV file
max_summary.to_csv("monthly_merged_max_trend.csv", index=False)
min_summary.to_csv("monthly_merged_min_trend.csv", index=False)
rainfall_summary.to_csv("monthly_merged_rainfall_trend.csv", index=False)
