In [15]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.preprocessing import LabelEncoder

In [16]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight 

In [17]:
data=pd.read_csv("weatherAUS.csv")

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [19]:
data = data.drop(columns=['Date'])
data=data.dropna()

In [20]:
data.isnull().sum()

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [21]:
label_encoder = LabelEncoder()
data['Location'] = label_encoder.fit_transform(data['Location'])
data['WindGustDir'] = label_encoder.fit_transform(data['WindGustDir'])
data['WindDir9am'] = label_encoder.fit_transform(data['WindDir9am'])
data['WindDir3pm'] = label_encoder.fit_transform(data['WindDir3pm'])
data['RainToday'] = label_encoder.fit_transform(data['RainToday'])
data['RainTomorrow'] = label_encoder.fit_transform(data['RainTomorrow'])
data['Evaporation'] = label_encoder.fit_transform(data['Evaporation'])
data['Sunshine'] = label_encoder.fit_transform(data['Sunshine'])
data['Cloud9am'] = label_encoder.fit_transform(data['Cloud9am'])
data['Cloud3pm'] = label_encoder.fit_transform(data['Cloud3pm'])

In [22]:
X = data.drop('RainTomorrow', axis=1)
y = data['RainTomorrow']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [24]:
from collections import Counter

In [25]:
# Random Under Sampling
print("Original class distribution:", Counter(y_train))
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train) 
print("Original class distribution:", Counter(y_rus))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 9942, 1: 9942})


In [26]:
# Random Over Sampling
from imblearn.over_sampling import RandomOverSampler
print("Original class distribution:", Counter(y_train))
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train) 
print("Original class distribution:", Counter(y_ros))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 35194, 1: 35194})


In [27]:
# SMOTE(Synthetic Minority Oversampling Technique)
print("Original class distribution:", Counter(y_train))
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_smote))

Original class distribution: Counter({0: 35194, 1: 9942})
Original class distribution: Counter({0: 35194, 1: 35194})


In [28]:
# Tomek Links
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_train, y_train) 

In [29]:
# Class Weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights)) 
class_weights

array([0.64124567, 2.2699658 ])

In [30]:
# Function to evaluate model performance
def evaluate_model(X_train, y_train, X_test, y_test, class_weights=None):
    model = RandomForestClassifier(class_weight=class_weights, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    print(classification_report(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob)) 

In [31]:
# Evaluate each sampling technique
print("Random Undersampling:")
evaluate_model(X_rus, y_rus, X_test, y_test)

print("\nRandom Oversampling:")
evaluate_model(X_ros, y_ros, X_test, y_test)

print("\nSMOTE:")
evaluate_model(X_smote, y_smote, X_test, y_test)

#print("\nTomek Links:")
#evaluate_model(X_tl, y_tl, X_test, y_test)

print("\nClass Weights:")
evaluate_model(X_train, y_train, X_test, y_test, class_weights_dict) 

Random Undersampling:
              precision    recall  f1-score   support

           0       0.94      0.80      0.86      8799
           1       0.54      0.81      0.64      2485

    accuracy                           0.80     11284
   macro avg       0.74      0.81      0.75     11284
weighted avg       0.85      0.80      0.82     11284

AUC: 0.8928300568269258

Random Oversampling:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      8799
           1       0.72      0.60      0.65      2485

    accuracy                           0.86     11284
   macro avg       0.80      0.77      0.78     11284
weighted avg       0.85      0.86      0.86     11284

AUC: 0.8967990006180967

SMOTE:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      8799
           1       0.64      0.67      0.66      2485

    accuracy                           0.85     11284
   macro avg       0.77     