# Rain Prediction in Australia Using Weather Data



In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
df = pd.read_csv('weatherAUS.csv')

# Drop rows with missing target values
df.dropna(subset=['RainTomorrow'], inplace=True)

# Extract year from date
df['Year'] = pd.to_datetime(df['Date']).dt.year

# Split dataset by year
train_df = df[df['Year'] < 2015]
val_df = df[df['Year'] == 2015]
test_df = df[df['Year'] > 2015]

# Explicitly exclude non-feature columns
excluded_cols = ['Date', 'Year', 'RainTomorrow']
input_cols = [col for col in df.columns if col not in excluded_cols]
target_col = 'RainTomorrow'

# Split features and target
X_train, y_train = train_df[input_cols], train_df[target_col]
X_val, y_val = val_df[input_cols], val_df[target_col]
X_test, y_test = test_df[input_cols], test_df[target_col]

# Identify numeric and categorical features
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

# Impute missing numeric values
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# Scale numeric features
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_cols])
X_val_cat = encoder.transform(X_val[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])

# Combine numeric and encoded categorical features
X_train_final = np.hstack([X_train[numeric_cols], X_train_cat])
X_val_final = np.hstack([X_val[numeric_cols], X_val_cat])
X_test_final = np.hstack([X_test[numeric_cols], X_test_cat])

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_final, y_train)

# Evaluate accuracy
val_pred = model.predict(X_val_final)
test_pred = model.predict(X_test_final)
val_accuracy = accuracy_score(y_val, val_pred)
test_accuracy = accuracy_score(y_test, test_pred)

print(f"Validation Accuracy: {val_accuracy:.2%}")
print(f"Test Accuracy: {test_accuracy:.2%}")

# Save model and preprocessing components
model_data = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'input_cols': input_cols,
    'categories': {
        'Location': df['Location'].dropna().unique().tolist(),
        'WindGustDir': df['WindGustDir'].dropna().unique().tolist(),
        'WindDir9am': df['WindDir9am'].dropna().unique().tolist(),
        'WindDir3pm': df['WindDir3pm'].dropna().unique().tolist(),
    }
}
joblib.dump(model_data, 'aussie_rain.joblib')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])
A value is t

Validation Accuracy: 85.61%
Test Accuracy: 84.50%


['aussie_rain.joblib']