## Weather Station Data Classification

By excluding 5 of the weather stations from the raw data, the goal is to predict whether they fall in the Northern Third of the UK, Central Third of the UK or Southern Third of the UK.Latitude data for all the weather station is included to classification for each of the weather stations in the training set. To determine the latitude of the lines dividing the UK into three,the most northerly point has latitude 60.9 and the most southerly point has latitude 49.9.

In [1]:
#Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, roc_auc_score, r2_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing the data

df = pd.read_excel(r"C:\Users\USER\Desktop\DATA\Raw Station Data.xlsx")
df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,To be del,Latitude,Longitude
0,Aberporth,1941,1,---,---,---,74.7,---,,50.218,-4.57
1,Aberporth,1941,2,---,---,---,69.1,---,,50.218,-4.57
2,Aberporth,1941,3,---,---,---,76.2,---,,50.218,-4.57
3,Aberporth,1941,4,---,---,---,33.7,---,,50.218,-4.57
4,Aberporth,1941,5,---,---,---,51.3,---,,50.218,-4.57


In [3]:
#Summary Statistics
df.describe()

Unnamed: 0,yyyy,mm,Latitude,Longitude
count,39263.0,39263.0,39263.0,39263.0
mean,1971.125564,6.494588,53.951876,-2.734982
std,37.446362,3.455734,2.505778,2.217172
min,1853.0,1.0,50.218,-6.88
25%,1951.0,3.0,51.761,-4.43
50%,1978.0,6.0,53.381,-2.641
75%,2000.0,9.0,55.311,-1.262
max,2024.0,12.0,60.139,1.727


## Data Cleaning

In [4]:
#Removing duplicate data.
df.drop_duplicates()
df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,To be del,Latitude,Longitude
0,Aberporth,1941,1,---,---,---,74.7,---,,50.218,-4.57
1,Aberporth,1941,2,---,---,---,69.1,---,,50.218,-4.57
2,Aberporth,1941,3,---,---,---,76.2,---,,50.218,-4.57
3,Aberporth,1941,4,---,---,---,33.7,---,,50.218,-4.57
4,Aberporth,1941,5,---,---,---,51.3,---,,50.218,-4.57


In [5]:
#Getting rid of unwanted column
df.drop(columns = "To be del",inplace=True)
df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
0,Aberporth,1941,1,---,---,---,74.7,---,50.218,-4.57
1,Aberporth,1941,2,---,---,---,69.1,---,50.218,-4.57
2,Aberporth,1941,3,---,---,---,76.2,---,50.218,-4.57
3,Aberporth,1941,4,---,---,---,33.7,---,50.218,-4.57
4,Aberporth,1941,5,---,---,---,51.3,---,50.218,-4.57


In [6]:
#The next step will be to replace the missing values marked by --- with blanks.

df.replace("---","",inplace=True)

df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
0,Aberporth,1941,1,,,,74.7,,50.218,-4.57
1,Aberporth,1941,2,,,,69.1,,50.218,-4.57
2,Aberporth,1941,3,,,,76.2,,50.218,-4.57
3,Aberporth,1941,4,,,,33.7,,50.218,-4.57
4,Aberporth,1941,5,,,,51.3,,50.218,-4.57


In [7]:
#Converting blanks to NaN using regex

df.replace(r'^\s*$',np.nan,regex=True, inplace=True)
df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
0,Aberporth,1941,1,,,,74.7,,50.218,-4.57
1,Aberporth,1941,2,,,,69.1,,50.218,-4.57
2,Aberporth,1941,3,,,,76.2,,50.218,-4.57
3,Aberporth,1941,4,,,,33.7,,50.218,-4.57
4,Aberporth,1941,5,,,,51.3,,50.218,-4.57


In [8]:
df.tail() 
#The below shows that some values do have unwanted characters e.g 67.7#

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
39258,Yeovilton,2023,10,17.2,9.2,0,117.2,101.6#,60.139,-2.641
39259,Yeovilton,2023,11,11.9,5.2,4,92.8,67.7#,60.139,-2.641
39260,Yeovilton,2023,12,11.3,5.7,5,135.0,18.6#,60.139,-2.641
39261,Yeovilton,2024,1,8.5,1.4,10,62.0,77.3#,60.139,-2.641
39262,Yeovilton,2024,2,11.8,5.6,3,111.8,43.6#,60.139,-2.641


In [9]:
#From the initial review of the data, unwanted characters include #, $ and * on some of the row data.
#Will be using a regular expression that identifies characters that are not letters or numbers without removing - or . to eliminate this characters.

df.replace(r'[^\w\.\-]',"",regex=True,inplace=True)
df.tail() #Unwanted characters removed

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
39258,Yeovilton,2023,10,17.2,9.2,0,117.2,101.6,60.139,-2.641
39259,Yeovilton,2023,11,11.9,5.2,4,92.8,67.7,60.139,-2.641
39260,Yeovilton,2023,12,11.3,5.7,5,135.0,18.6,60.139,-2.641
39261,Yeovilton,2024,1,8.5,1.4,10,62.0,77.3,60.139,-2.641
39262,Yeovilton,2024,2,11.8,5.6,3,111.8,43.6,60.139,-2.641


In [10]:
#Checking for the number of blank values in each column

df.isna().sum()

Station         0
yyyy            0
mm              0
tmax degC     928
tmin degC     902
af days      2327
rain mm       873
sun hours    9068
Latitude        0
Longitude       0
dtype: int64

In [11]:
#Dropping the blanks would lead to huge data loss and therefore will fill the blanks with average values of each column.
#Will first convert some of the columns into appropriate data types
df.dtypes

Station       object
yyyy           int64
mm             int64
tmax degC     object
tmin degC     object
af days       object
rain mm       object
sun hours     object
Latitude     float64
Longitude    float64
dtype: object

In [12]:
#tmax degC,tmin degC,af days,rain mm,sun hours ----converted to float

df[["tmax degC","tmin degC","af days","rain mm","sun hours"]] = df[["tmax degC","tmin degC","af days","rain mm","sun hours"]].astype(float)

In [13]:
df.dtypes

Station       object
yyyy           int64
mm             int64
tmax degC    float64
tmin degC    float64
af days      float64
rain mm      float64
sun hours    float64
Latitude     float64
Longitude    float64
dtype: object

In [14]:
#Filling in the NAN values
df["tmax degC"].fillna(df["tmax degC"].mean().round(1), inplace=True)
df["tmin degC"].fillna(df["tmin degC"].mean().round(1), inplace=True)
df["af days"].fillna(df["af days"].mean().round(), inplace=True)
df["rain mm"].fillna(df["rain mm"].mean().round(1), inplace=True)
df["sun hours"].fillna(df["sun hours"].mean().round(1), inplace=True)

In [15]:
#Checking if all NaN values have been filled

df.isna().sum()

Station      0
yyyy         0
mm           0
tmax degC    0
tmin degC    0
af days      0
rain mm      0
sun hours    0
Latitude     0
Longitude    0
dtype: int64

In [16]:
df.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
0,Aberporth,1941,1,12.8,6.0,3.0,74.7,118.6,50.218,-4.57
1,Aberporth,1941,2,12.8,6.0,3.0,69.1,118.6,50.218,-4.57
2,Aberporth,1941,3,12.8,6.0,3.0,76.2,118.6,50.218,-4.57
3,Aberporth,1941,4,12.8,6.0,3.0,33.7,118.6,50.218,-4.57
4,Aberporth,1941,5,12.8,6.0,3.0,51.3,118.6,50.218,-4.57


#### Splitting the cleaned dataset into two:
    1. To hold the training/test data
    2. To hold the last five stations that will act as independent test data

In [17]:
#Variable to store the last 5 unique station names alphabetically
last_five_stations = sorted(df["Station"].unique())[-5:]

#Splitting into two datasets
df_last_five = df[df["Station"].isin(last_five_stations)]
df_main = df[~df["Station"].isin(last_five_stations)]

#Reset indices
df_last_five = df_last_five.reset_index(drop=True)
df_main= df_main.reset_index(drop=True)

In [18]:
#Verifying accuracy of the number of rows

print(len(df))
print(len(df_main))
print(len(df_last_five))

39263
34432
4831


### Modelling

In [19]:
df_main.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude
0,Aberporth,1941,1,12.8,6.0,3.0,74.7,118.6,50.218,-4.57
1,Aberporth,1941,2,12.8,6.0,3.0,69.1,118.6,50.218,-4.57
2,Aberporth,1941,3,12.8,6.0,3.0,76.2,118.6,50.218,-4.57
3,Aberporth,1941,4,12.8,6.0,3.0,33.7,118.6,50.218,-4.57
4,Aberporth,1941,5,12.8,6.0,3.0,51.3,118.6,50.218,-4.57


In [20]:
#Creating target categories for the data -- Northern Third of the UK, Central Third of the UK or Southern Third of the UK.
#The most northerly point has latitude 60.9 and the most southerly point has latitude 49.9.

In [21]:
df_main['Latitude'].unique()

array([50.218, 50.762, 50.779, 50.898, 51.006, 51.089, 51.346, 51.479,
       51.488, 51.761, 51.911, 52.139, 52.245, 52.358, 52.483, 52.794,
       52.833, 53.175, 53.252, 53.356, 53.381, 53.813, 54.352, 54.481,
       54.67 , 54.768, 55.181, 55.311, 55.846, 56.377, 56.451, 56.5  ,
       57.006, 57.593])

### Region Key

Northern Third UK = 0;
Central Third UK = 1;
Southern Third UK = 2

In [22]:
#Function to create categories based on latitude

def classify_latitude(lat):
    if lat > 56:
        return 0
    elif lat >= 53 and lat < 56:
        return 1
    else:
        return 2

In [23]:
#Appending the created regions to the dataset

df_main['Regions'] = df_main['Latitude'].apply(classify_latitude)
df_main.head()

Unnamed: 0,Station,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude,Regions
0,Aberporth,1941,1,12.8,6.0,3.0,74.7,118.6,50.218,-4.57,2
1,Aberporth,1941,2,12.8,6.0,3.0,69.1,118.6,50.218,-4.57,2
2,Aberporth,1941,3,12.8,6.0,3.0,76.2,118.6,50.218,-4.57,2
3,Aberporth,1941,4,12.8,6.0,3.0,33.7,118.6,50.218,-4.57,2
4,Aberporth,1941,5,12.8,6.0,3.0,51.3,118.6,50.218,-4.57,2


In [24]:
#For modelling the station column is not relevant
df_main.drop(columns='Station',inplace=True)
df_main.head()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Latitude,Longitude,Regions
0,1941,1,12.8,6.0,3.0,74.7,118.6,50.218,-4.57,2
1,1941,2,12.8,6.0,3.0,69.1,118.6,50.218,-4.57,2
2,1941,3,12.8,6.0,3.0,76.2,118.6,50.218,-4.57,2
3,1941,4,12.8,6.0,3.0,33.7,118.6,50.218,-4.57,2
4,1941,5,12.8,6.0,3.0,51.3,118.6,50.218,-4.57,2


In [25]:
#Value distribution accross regions

df_main['Regions'].value_counts()

2    16474
1    14047
0     3911
Name: Regions, dtype: int64

In [26]:
#Separating features and target values

features = df_main.drop(columns='Regions')
target =  df_main['Regions']

Including latitude/longitude introduces data leakage since the goal is to classify regions from weather conditions only.

In [27]:
features.drop(columns=['Latitude','Longitude'],inplace=True)

features.head()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
0,1941,1,12.8,6.0,3.0,74.7,118.6
1,1941,2,12.8,6.0,3.0,69.1,118.6
2,1941,3,12.8,6.0,3.0,76.2,118.6
3,1941,4,12.8,6.0,3.0,33.7,118.6
4,1941,5,12.8,6.0,3.0,51.3,118.6


In [29]:
target.head()

0    2
1    2
2    2
3    2
4    2
Name: Regions, dtype: int64

In [30]:
#Splitting the data for modelling

X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=0)

In [31]:
#Model Initiation

model = RandomForestClassifier(random_state=0)
model

In [32]:
#Model Fitting

model.fit(X_train,y_train)

In [33]:
#Model Prediction

pred = model.predict(X_test)
pred

array([2, 1, 1, ..., 2, 1, 1], dtype=int64)

### Model Evaluation

In [34]:
#Weighted average was chosen because of class imbalances
print(f'Accuracy: {accuracy_score(y_test,pred).round(2)}')
print(f'Recall: {recall_score(y_test,pred,average="weighted").round(2)}')
print(f'Precision: {precision_score(y_test,pred,average="weighted").round(2)}')
print(f'F1 Score: {f1_score(y_test,pred,average="weighted").round(2)}')

Accuracy: 0.62
Recall: 0.62
Precision: 0.63
F1 Score: 0.62


### Predictions for the last five stations

In [35]:
#Removing the unwanted columns so that the data can match the testing data (X_test)

df_last_five.drop(columns=['Station','Latitude','Longitude'],inplace=True)
df_last_five.head()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
0,1930,12,8.6,5.5,0.0,130.3,31.5
1,1931,1,8.0,4.2,0.0,66.2,63.8
2,1931,2,7.6,3.7,0.0,60.6,74.1
3,1931,3,8.5,2.4,6.0,32.5,130.3
4,1931,4,9.9,5.6,0.0,47.1,119.0


In [36]:
last_pred = model.predict(df_last_five)

In [37]:
#Updating the dataframe with the prediction results

df_last_five['Regions']=last_pred
df_last_five.head()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,Regions
0,1930,12,8.6,5.5,0.0,130.3,31.5,2
1,1931,1,8.0,4.2,0.0,66.2,63.8,1
2,1931,2,7.6,3.7,0.0,60.6,74.1,1
3,1931,3,8.5,2.4,6.0,32.5,130.3,1
4,1931,4,9.9,5.6,0.0,47.1,119.0,0
