In [5]:
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_list = ['mean_temp', 'mean_cloud','total_sun','wind_speed', 'total_rainfall', 'relative_humidity']

#Data Cleaning

df_list = []
for st in data_list:
    df_old = pd.read_csv(f"/workspaces/weather/data/hk_{st}.csv", index_col = False)
    #Dropping all rows of incomplete data
    df1 = df_old.drop(df_old[df_old['complete'] != "C"].index)
    #Combining columns to make 'date' column with datetime type
    df1['date'] = df1['Year'] + '-' + df1['Month'].astype('Int64').astype(str) + '-' + df1['Day'].astype('Int64').astype(str)
    df1['date'] = pd.to_datetime(df1['date'])
    df2 = df1.drop(["Year", 'Month', 'Day', 'complete'], axis = 1)
    df_list.append(df2)
#Combing all dataframes
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['date'], how='inner'), df_list)
#Reordering columns
df = df_merged[['date','mean_temp', 'mean_cloud','total_sun','wind_speed', 'total_rainfall', 'relative_humidity']]
    

df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10785 entries, 0 to 10784
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               10785 non-null  datetime64[ns]
 1   mean_temp          10785 non-null  object        
 2   mean_cloud         10785 non-null  float64       
 3   total_sun          10785 non-null  float64       
 4   wind_speed         10785 non-null  object        
 5   total_rainfall     10785 non-null  object        
 6   relative_humidity  10785 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 589.9+ KB


In [6]:
#Changing datatypes
df['mean_temp'] = df['mean_temp'].astype(float)
df['wind_speed'] = df['wind_speed'].astype(float)
df['relative_humidity'] = df['relative_humidity'].astype(float)
df['total_rainfall'] = df['total_rainfall'].astype(float)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10785 entries, 0 to 10784
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               10785 non-null  datetime64[ns]
 1   mean_temp          10785 non-null  float64       
 2   mean_cloud         10785 non-null  float64       
 3   total_sun          10785 non-null  float64       
 4   wind_speed         10785 non-null  float64       
 5   total_rainfall     10785 non-null  float64       
 6   relative_humidity  10785 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 589.9 KB


In [7]:
#We want total_rainfall to be a categorial attribute to analyse the chance of rain. Let's also make a month column as they may be correlated.
df.loc[df['total_rainfall'] == 0, 'rain?'] = 0
df.loc[(df['total_rainfall'] > 0) & (df['total_rainfall'] < 10), 'rain?'] = 1
df.loc[(df['total_rainfall'] >= 10) & (df['total_rainfall'] < 30),'rain?'] = 2
df.loc[df['total_rainfall']>= 30, 'rain?'] = 3
df['month'] = df['date'].dt.month.astype(int)
df['rain?'].value_counts()

rain?
0.0    6813
1.0    2488
2.0     801
3.0     683
Name: count, dtype: int64

We see that the data is highly skewed towards 0, i.e. it doesn't rain on most days.

We see there is a correlation between chance of rain. total sunlight, mean amount of cloud, and humdiity. 
We will then use different methods to predict the chance of rain, mainly: Decision Tree, random forest, kNN, and XGBoost.

In [8]:
#Preparing data for a Decision Tree Classifier
data = df.drop(['date', 'total_rainfall'],axis = 1)
X = data.drop(['rain?'], axis = 1 )
y = data['rain?']

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

tree_classifier = DecisionTreeClassifier()
X_train, X_test,y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = 18)

tree_classifier.fit(X_train, y_train)



We have two options for validating: k-fold cross validation and a validation set. k-fold cross validation can bettter reduce bias and overfitting compared to validation, leading to a more robust model; but at the cost of more computation time. As the dataset at hand is not huge, we can opt for cross_validation, and use the test set only when we have shortlisted models for prediction. 

We will evaluate using k-fold cross validation and the f1 score, which measures precision and recall,  as a metric. We use micro for the average parameter as the data is highly skewed.

In [31]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
y_prediction_tree = cross_val_predict(tree_classifier, X_train, y_train,cv =5)
f1_tree = f1_score(y_train, y_prediction_tree, average  ='micro')
print(f"F1 Score:{f1_tree}")

F1 Score:0.6476587853500232


In [32]:
#Training a Random Forest
from sklearn.ensemble import RandomForestClassifier

rand_clf = RandomForestClassifier()
rand_clf.fit(X_train,y_train)
#Following the same validation steps
y_prediction_forest = cross_val_predict(rand_clf, X_train, y_train, cv = 5)
f1_forest = f1_score(y_train, y_prediction_forest, average = 'micro')
print(f"F1 Score:{f1_forest}")

F1 Score:0.7251970329160872


This is better than just using a Decision Tree, but maybe other methods will produce a better accuracy score.

In [33]:
#Training kNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

y_prediction_knn = cross_val_predict(knn, X_train, y_train, cv = 5)
f1_knn = f1_score(y_train, y_prediction_knn, average = 'micro')
print(f"F1 Score:{f1_knn}")

F1 Score:0.7060732498840983


In [34]:
#Training XGboost
import xgboost as xgb

xgbc = xgb.XGBClassifier()
xgbc.fit(X_train, y_train)

y_prediction_xgb = cross_val_predict(xgbc, X_train, y_train, cv = 5)
f1_xgb = f1_score(y_train, y_prediction_xgb, average = 'micro')
print(f"F1 Score:{f1_xgb}")


F1 Score:0.7232267037552156


We see that Random Forests and XGboost perform similarly in the f1 score. We will tune their hyperparameters to find which one is better. We will use RSearchCV for both.