In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
from sklearn.metrics import accuracy_score

data = pd.read_csv("weatherAUS.csv")
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


Завантажимо набір даних, що містить приблизно 10 років щоденних спостережень за погодою від численних погодних станцій Австралії

Далі видалимо колонку RISK_MM, яка є непотрібною для даної моделі

In [2]:
data.drop('RISK_MM', axis=1, inplace=True)

Виконаємо попередню обробку даних

In [3]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')

df_num_col = ["MinTemp","MaxTemp","Rainfall","Evaporation","Sunshine",
              "WindGustSpeed","WindSpeed9am","WindSpeed3pm","Humidity9am",
              "Humidity3pm","Pressure9am","Pressure3pm","Cloud9am","Cloud3pm",
              "Temp9am","Temp3pm"]
data_num = data[df_num_col]
imputer = imputer.fit(data_num)
data[df_num_col] = imputer.transform(data_num)

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
df_cat_col = ["WindGustDir","WindDir9am","WindDir3pm","RainToday","RainTomorrow","Date","Location"]

data_cat = data[df_cat_col].fillna('NA')

for i in range(len(data_cat.columns)):
    data_cat.iloc[:,i] = labelencoder.fit_transform(data_cat.iloc[:,i])
data[df_cat_col] = data_cat

# Виокремимо залежну та незалежну змінну від набору даних
x = data.iloc[:,0:22].values
y = data.iloc[:,22].values

Розіб'ємо дані на тренувальні та тестові

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

Виконаємо feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.fit_transform(x_test)

#### Застосуємо кілька моделей класифікації

##### Логістична регресія

In [7]:
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import accuracy_score

t0 = time.time()
logreg = LogisticRegression(random_state=0)
logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test)
score = accuracy_score(y_test,y_pred)
print('Logistic Regression Accuracy :', score)
print('Logistic Regression Time taken :' , time.time() - t0)
score

Logistic Regression Accuracy : 0.8446499525299764
Logistic Regression Time taken : 0.4360315799713135


0.8446499525299764

##### Дерево рішень

In [8]:
from sklearn.tree import DecisionTreeClassifier

t0 = time.time()
destree = DecisionTreeClassifier(random_state=0)
destree.fit(x_train, y_train)
y_pred = destree.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('Decision Tree Accuracy :', score)
print('Decision Tree Time taken :' , time.time() - t0)
score

Decision Tree Accuracy : 0.7891627694363375
Decision Tree Time taken : 1.7690181732177734


0.7891627694363375

##### Random forest алгоритм

In [9]:
from sklearn.ensemble import RandomForestClassifier

t0 = time.time()
rantree = RandomForestClassifier(random_state=0)
rantree.fit(x_train,y_train)
y_pred = rantree.predict(x_test)
score = accuracy_score(y_test,y_pred)
print('Random Forest Accuracy :', score)
print('Random Forest Time taken :' , time.time() - t0)
score

Random Forest Accuracy : 0.8592777523822919
Random Forest Time taken : 24.50630235671997


0.8592777523822919

Висновок: Алгоритм Random Forest показав найкращий результат за точністю, але зайняв набагато більше часу, ніж інші моделі. Логістична регресія показала на 1.5% гіршу точність, але найкращий час, тому вважаємо її оптимальною.