In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# from sklearn.tree import plot_tree

In [30]:
# Load the weather classification dataset from a path relative to the notebook's
# working directory, then preview the first rows to check that the columns parsed.
data = pd.read_csv('data/weather_classification_data.csv')
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows maxima of 109 for Humidity and
# Precipitation (%), i.e. above 100 — possibly bad rows/outliers; confirm before modelling.
data.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [32]:
# Integer-encode each categorical column into a new "Encoded <col>" column.
# One LabelEncoder is kept per column: refitting a single shared encoder replaces
# its classes_ on every iteration, so only the last column's mapping would survive
# and the other columns could never be decoded back to their original labels.
encoders = {}
for col in ['Cloud Cover','Season','Location','Weather Type']:
    encoders[col] = LabelEncoder()
    data[f'Encoded {col}'] = encoders[col].fit_transform(data[col])
# Keep the original name available: as before, `encoder` ends up fitted on
# 'Weather Type', so encoder.inverse_transform(...) turns predicted codes into labels.
encoder = encoders['Weather Type']
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type,Encoded Cloud Cover,Encoded Season,Encoded Location,Encoded Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy,3,3,1,1
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy,3,1,1,0
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny,0,1,2,3
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny,0,1,0,3
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy,2,3,2,1


In [33]:
# Build the all-numeric frame by removing the original string-valued categorical
# columns; their integer "Encoded ..." counterparts stay in place.
numerical_data = data.drop(
    columns=["Cloud Cover", "Season", "Location", "Weather Type"],
)
numerical_data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km),Encoded Cloud Cover,Encoded Season,Encoded Location,Encoded Weather Type
0,14.0,73,9.5,82.0,1010.82,2,3.5,3,3,1,1
1,39.0,96,8.5,71.0,1011.43,7,10.0,3,1,1,0
2,30.0,64,7.0,16.0,1018.72,5,5.5,0,1,2,3
3,38.0,83,1.5,82.0,1026.25,7,1.0,0,1,0,3
4,27.0,74,17.0,66.0,990.67,1,2.5,2,3,2,1


In [34]:
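# De-duplication is currently disabled. Note that drop_duplicates() returns a new
# frame rather than modifying numerical_data, so the result must be assigned:
# numerical_data = numerical_data.drop_duplicates()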

In [35]:
# Separate the target (encoded weather type) from the feature columns, then keep
# 80% of the rows for training; the fixed seed makes the split repeatable.
y = numerical_data["Encoded Weather Type"]
X = numerical_data.drop(columns=["Encoded Weather Type"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)


In [36]:
# Plain-text preview of the first rows of the training and test feature frames,
# printed back to back in a single call.
print(*(split.head() for split in (X_train, X_test)))

      Temperature  Humidity  Wind Speed  Precipitation (%)  \
3958         32.0        66         6.0               92.0   
2239         15.0        96        11.0               57.0   
3608         28.0        58         6.5               11.0   
6848         47.0        84        44.5               85.0   
6119         18.0        66        14.0               10.0   

      Atmospheric Pressure  UV Index  Visibility (km)  Encoded Cloud Cover  \
3958               1010.21         2              1.5                    3   
2239               1016.98         0              4.0                    2   
3608               1025.37        10              9.0                    0   
6848                992.26         2              2.0                    2   
6119               1000.04         1              8.0                    3   

      Encoded Season  Encoded Location  
3958               1                 1  
2239               0                 2  
3608               2               

In [37]:
# (rows, columns) of the all-numeric frame; the column count includes the encoded target.
numerical_data.shape

(13200, 11)

In [38]:
# Shapes of the training and test feature frames, to confirm the split proportions.
X_train.shape, X_test.shape

((10560, 10), (2640, 10))

In [43]:
# Fit a random forest of 50 trees on the training split; random_state seeds the
# forest so repeated fits on the same data give the same model.
# NOTE(review): the seed here (41) differs from the one used for the split (42) —
# harmless, but confirm it is intentional.
classifier = RandomForestClassifier(n_estimators=50, random_state=41)
classifier.fit(X_train, y_train)


In [44]:
# Predict encoded weather-type labels for the held-out test features.
y_pred = classifier.predict(X_test)

In [45]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9102272727272728

In [42]:
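# Optional: visualise one tree of the fitted forest. plot_tree expects a single
# decision tree, so pass one member of classifier.estimators_ rather than the forest
# itself (requires `from sklearn.tree import plot_tree`).
# plt.figure(figsize=(10,8))
# plot_tree(classifier.estimators_[0])
# plt.show()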