In [77]:
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

data_list = ['mean_temp', 'mean_cloud','total_sun','wind_speed', 'total_rainfall', 'relative_humidity']

#Data Cleaning

df_list = []
for st in data_list:
    df_old = pd.read_csv(f"/workspaces/weather/data/hk_{st}.csv", index_col = False)
    #Dropping all rows of incomplete data
    df1 = df_old.drop(df_old[df_old['complete'] != "C"].index)
    #Combining columns to make 'date' column with datetime type
    df1['date'] = df1['Year'] + '-' + df1['Month'].astype('Int64').astype(str) + '-' + df1['Day'].astype('Int64').astype(str)
    df1['date'] = pd.to_datetime(df1['date'])
    df2 = df1.drop(["Year", 'Month', 'Day', 'complete'], axis = 1)
    df_list.append(df2)
#Combing all dataframes
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['date'], how='inner'), df_list)
#Reordering columns
df = df_merged[['date','mean_temp', 'mean_cloud','total_sun','wind_speed', 'total_rainfall', 'relative_humidity']]
    

df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10785 entries, 0 to 10784
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               10785 non-null  datetime64[ns]
 1   mean_temp          10785 non-null  object        
 2   mean_cloud         10785 non-null  float64       
 3   total_sun          10785 non-null  float64       
 4   wind_speed         10785 non-null  object        
 5   total_rainfall     10785 non-null  object        
 6   relative_humidity  10785 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 589.9+ KB


In [90]:
#Changing datatypes and normalizing features
df['mean_temp'] = df['mean_temp'].astype(float)
df['wind_speed'] = df['wind_speed'].astype(float)
df['relative_humidity'] = df['relative_humidity'].astype(float)
df['total_rainfall'] = df['total_rainfall'].astype(float)

df['wind_speed'] = df['wind_speed']/df['wind_speed'].mean()
df['mean_cloud'] = df['mean_cloud']/100
df['relative_humidity'] = df['relative_humidity']/100


In [91]:
#We want total_rainfall to be a categorial attribute to analyse the chance of rain. Let's also make a month column as they may be correlated.
df.loc[df['total_rainfall'] == 0, 'rain?'] = 0
df.loc[(df['total_rainfall'] > 0) & (df['total_rainfall'] < 10), 'rain?'] = 1
df.loc[(df['total_rainfall'] >= 10) & (df['total_rainfall'] < 30),'rain?'] = 2
df.loc[df['total_rainfall']>= 30, 'rain?'] = 3
df['month'] = df['date'].dt.month.astype(int)
df['rain?'].value_counts()

rain?
0.0    6813
1.0    2488
2.0     801
3.0     683
Name: count, dtype: int64

We see that the data is highly skewed towards 0, i.e. it doesn't rain on most days.

In [92]:
#Preparing data for models
from sklearn.model_selection import train_test_split
data = df.drop(['date', 'total_rainfall'],axis = 1)
X = data.drop(['rain?'], axis = 1 )
data_features = X.keys()
y = data['rain?']
X_train, X_test,y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = 18)

In [93]:
#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
parameters = { 'max_depth': [3,5,7,9,11], 'min_samples_split': [2,4,6,8,10]}
tree_clf = DecisionTreeClassifier()

g1 = GridSearchCV(tree_clf,parameters, cv = 5, n_jobs = -1)
g1.fit(X_train,y_train)

tree_best_param = g1.best_params_
print(tree_best_param)

tree_best_model = g1.best_estimator_
f1_tree = f1_score(y_test,tree_best_model.predict(X_test), average = 'micro')
print(f'F1 score for Decision Tree: {f1_tree}')


{'max_depth': 7, 'min_samples_split': 4}
F1 score for Decision Tree: 0.7209086694483078


In [94]:
#Training a Random Forest
from sklearn.ensemble import RandomForestClassifier
parameters = {'max_depth': [3,5,7,9,11], 'min_samples_split': [2,4,6,8,10]}
rand_clf = RandomForestClassifier()

g2 = GridSearchCV(rand_clf, parameters, cv = 5, n_jobs = -1)

g2.fit(X_train,y_train)

forest_best_param = g2.best_params_
print(forest_best_param)

forest_best_model = g2.best_estimator_
f1_forest = f1_score(y_test,forest_best_model.predict(X_test), average = 'micro')
print(f"F1 Score for Random Forest:{f1_forest}")

{'max_depth': 9, 'min_samples_split': 8}
F1 Score for Random Forest:0.7338896615669912


In [97]:
#Training kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle  =True)
parameters = {'n_neighbors': np.arange(2,30,1)}
knn = KNeighborsClassifier()
g3 = GridSearchCV(knn, parameters, cv =kf, n_jobs = -1)
g3.fit(X_train,y_train)

knn_best_param = g3.best_params_
print(knn_best_param)

knn_best_model = g3.best_estimator_
f1_knn = f1_score(y_test,knn_best_model.predict(X_test), average = 'micro')
print(f"F1 Score for kNN:{f1_knn}")

{'n_neighbors': np.int64(22)}
F1 Score for kNN:0.7005099675475197


In [96]:
#Training XGboost
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 5, shuffle = True)

parameters = {'min_child_weight': [1,3,5],'gamma': [0, 0.5, 1],'max_depth':[3,5,7,9],'n_estimators': [20,60,100]}
xgbc = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='multi:softmax')
g4 = GridSearchCV(xgbc, parameters,cv = skf, n_jobs = -1)
g4.fit(X_train, y_train)

xgbc_best_param = g4.best_params_
print(xgbc_best_param)

xgbc_best_model = g4.best_estimator_
f1_xgb = f1_score(y_test, xgbc_best_model.predict(X_test), average = 'micro')
print(f"F1 Score for XGBoost:{f1_xgb}")


{'gamma': 1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 100}
F1 Score for XGBoost:0.7255447380621233


In [98]:
from sklearn.inspection import permutation_importance
result = permutation_importance(forest_best_model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=-1)
perm_imp_df = pd.DataFrame({'Feature': data_features, 'Permutation Importance': result.importances_mean}).sort_values('Permutation Importance', ascending=False)
print(perm_imp_df)

             Feature  Permutation Importance
4  relative_humidity                0.077840
0          mean_temp                0.036300
1         mean_cloud                0.031896
5              month                0.031433
2          total_sun                0.022485
3         wind_speed                0.004033


Detailed Analysis:

All models have mediocre F1 scores, which is likely due to a highly imbalanced dataset, which may have affected model training and their precision.

Random Forest performed the best out of all the models, as they are easier to tune and more robust against imbalanced datasets.

XGBoost performed poorly due to poor hyperparameter tuning. Better tuning methods such as Bayesian Optimization could be implemented to further increase the model's F1 score.

Relative humidity was the most important feature in model training, but it's permutation importance score is still very low. This suggests insufficient feature extraction and /or lack of relavent features.


