In [23]:
import pickle
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

In [24]:
df = pd.read_csv("../datasets/weatherAUS.csv", sep=',')

In [25]:
round(df.isna().mean() * 100, 2)

Date              0.00
Location          0.00
MinTemp           1.02
MaxTemp           0.87
Rainfall          2.24
Evaporation      43.17
Sunshine         48.01
WindGustDir       7.10
WindGustSpeed     7.06
WindDir9am        7.26
WindDir3pm        2.91
WindSpeed9am      1.21
WindSpeed3pm      2.11
Humidity9am       1.82
Humidity3pm       3.10
Pressure9am      10.36
Pressure3pm      10.33
Cloud9am         38.42
Cloud3pm         40.81
Temp9am           1.21
Temp3pm           2.48
RainToday         2.24
RainTomorrow      2.25
dtype: float64

In [26]:
df = df.drop(columns=["Date", "Location", "Evaporation", "Sunshine", "Cloud9am", "Cloud3pm", "WindGustDir", "WindDir9am", "WindDir3pm"])

In [27]:
df.dropna(inplace=True)

In [28]:
round(df.isna().mean() * 100, 2)

MinTemp          0.0
MaxTemp          0.0
Rainfall         0.0
WindGustSpeed    0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Temp9am          0.0
Temp3pm          0.0
RainToday        0.0
RainTomorrow     0.0
dtype: float64

In [29]:
class_maps = {'No': 0, 'Yes':1}
df.RainToday = df.RainToday.map(class_maps)
df.RainTomorrow = df.RainTomorrow.map(class_maps)

In [30]:
df.head

<bound method NDFrame.head of         MinTemp  MaxTemp  Rainfall  WindGustSpeed  WindSpeed9am  WindSpeed3pm  \
0          13.4     22.9       0.6           44.0          20.0          24.0   
1           7.4     25.1       0.0           44.0           4.0          22.0   
2          12.9     25.7       0.0           46.0          19.0          26.0   
3           9.2     28.0       0.0           24.0          11.0           9.0   
4          17.5     32.3       1.0           41.0           7.0          20.0   
...         ...      ...       ...            ...           ...           ...   
145454      3.5     21.8       0.0           31.0          15.0          13.0   
145455      2.8     23.4       0.0           31.0          13.0          11.0   
145456      3.6     25.3       0.0           22.0          13.0           9.0   
145457      5.4     26.9       0.0           37.0           9.0           9.0   
145458      7.8     27.0       0.0           28.0          13.0           7.0  

In [31]:
train_data, test_data = train_test_split(df, test_size=0.2)

In [32]:
train_data.shape

(95672, 14)

In [33]:
test_data.shape

(23918, 14)

In [34]:
# train_data.to_csv('../datasets/weatherAUS-train.csv', index=False)
# test_data.to_csv('../datasets/weatherAUS-test.csv', index=False)

In [35]:
X_train = train_data.drop(columns="RainTomorrow")
y_train = train_data["RainTomorrow"]

X_test = test_data.drop(columns="RainTomorrow")
y_test = test_data["RainTomorrow"]

In [36]:
clf = LinearSVC()

In [37]:
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

In [38]:
clf = OneVsRestClassifier(clf).fit(X_train, y_train)

In [39]:
clf.score(X_test, y_test)

0.844844886696212

In [41]:
y_pred = clf.predict(X_test)

In [42]:
accuracy = loaded_model.score(X_test, y_test)
f1Score = f1_score(y_test, y_pred)
areaUnderCurve = roc_auc_score(y_test, y_pred)

In [43]:
print(f"************************************************************")
print(f"*       Accuracy: {accuracy*100:.2f}%")
print(f"*       F1Score:  {f1Score*100:.2f}%")
print(f"*       Area Under Curve:      {areaUnderCurve*100:.2f}%")
print(f"************************************************************")

************************************************************
*       Accuracy: 84.48%
*       F1Score:  53.85%
*       Area Under Curve:      69.03%
************************************************************
