In [45]:
import numpy as np
import pandas as pd
import time
import copy
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
rain_data = pd.read_pickle('rain_data_aed.pkl')
rain_data.head()

Unnamed: 0,date,location,mintemp,rainfall,humidity9am,humidity3pm,cloud9am,cloud3pm,raintoday,raintomorrow,winddir3pm,winddir9am,windgustdir,windgustspeed,windspeed3pm,windspeed9am
0,2008-12-01,Albury,13.4,0.6,71.0,22.0,8.0,4.0,0.0,0,WNW,W,W,44.0,24.0,20.0
1,2008-12-02,Albury,7.4,0.0,44.0,25.0,5.636364,4.0,0.0,0,WSW,NNW,WNW,44.0,22.0,4.0
2,2008-12-03,Albury,12.9,0.0,38.0,30.0,5.636364,2.0,0.0,0,WSW,W,WSW,46.0,26.0,19.0
3,2008-12-04,Albury,9.2,0.0,45.0,16.0,5.636364,4.0,0.0,0,E,SE,NE,24.0,9.0,11.0
4,2008-12-05,Albury,17.5,1.0,82.0,33.0,7.0,8.0,0.0,0,NW,ENE,W,41.0,20.0,7.0


In [3]:
rain_data.columns

Index(['date', 'location', 'mintemp', 'rainfall', 'humidity9am', 'humidity3pm',
       'cloud9am', 'cloud3pm', 'raintoday', 'raintomorrow', 'winddir3pm',
       'winddir9am', 'windgustdir', 'windgustspeed', 'windspeed3pm',
       'windspeed9am'],
      dtype='object')

In [4]:
target_variable = rain_data['raintomorrow']

# Variáveis Categóricas

In [5]:
categorical=list(rain_data.columns[rain_data.dtypes == 'object'])
categorical

['location', 'winddir3pm', 'winddir9am', 'windgustdir']

In [6]:
sum([rain_data[col].nunique() for col in categorical[1:]])

48

### Não vou usar o método de Dummy Variables por que seriam criadas 48 novas colunas, que é um número grande para a nossa quantidade de amostras. 

### Em vez disso usarei Hashing, onde serão criadas 24 novas colunas

In [7]:
for col in categorical[1:]:
    fh = FeatureHasher(n_features=8, input_type='string')
    hashed_features = fh.fit_transform(rain_data[col])
    hashed_features = hashed_features.toarray()
    
    names = [col + '_' +str(i) for i in range(8)]
    
    rain_data=pd.concat([rain_data, pd.DataFrame(hashed_features,columns=names)], axis=1)

In [8]:
rain_data.head()

Unnamed: 0,date,location,mintemp,rainfall,humidity9am,humidity3pm,cloud9am,cloud3pm,raintoday,raintomorrow,...,winddir9am_6,winddir9am_7,windgustdir_0,windgustdir_1,windgustdir_2,windgustdir_3,windgustdir_4,windgustdir_5,windgustdir_6,windgustdir_7
0,2008-12-01,Albury,13.4,0.6,71.0,22.0,8.0,4.0,0.0,0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2008-12-02,Albury,7.4,0.0,44.0,25.0,5.636364,4.0,0.0,0,...,0.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
2,2008-12-03,Albury,12.9,0.0,38.0,30.0,5.636364,2.0,0.0,0,...,0.0,0.0,-2.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
3,2008-12-04,Albury,9.2,0.0,45.0,16.0,5.636364,4.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2008-12-05,Albury,17.5,1.0,82.0,33.0,7.0,8.0,0.0,0,...,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
rain_data.columns

Index(['date', 'location', 'mintemp', 'rainfall', 'humidity9am', 'humidity3pm',
       'cloud9am', 'cloud3pm', 'raintoday', 'raintomorrow', 'winddir3pm',
       'winddir9am', 'windgustdir', 'windgustspeed', 'windspeed3pm',
       'windspeed9am', 'winddir3pm_0', 'winddir3pm_1', 'winddir3pm_2',
       'winddir3pm_3', 'winddir3pm_4', 'winddir3pm_5', 'winddir3pm_6',
       'winddir3pm_7', 'winddir9am_0', 'winddir9am_1', 'winddir9am_2',
       'winddir9am_3', 'winddir9am_4', 'winddir9am_5', 'winddir9am_6',
       'winddir9am_7', 'windgustdir_0', 'windgustdir_1', 'windgustdir_2',
       'windgustdir_3', 'windgustdir_4', 'windgustdir_5', 'windgustdir_6',
       'windgustdir_7'],
      dtype='object')

In [10]:
rain_data.drop(['windgustdir','winddir9am','winddir3pm','raintomorrow'],axis=1,inplace=True)

In [11]:
rain_data.drop(['location','date'],axis=1,inplace=True)

In [13]:
rain_data.dtypes

mintemp          float64
rainfall         float64
humidity9am      float64
humidity3pm      float64
cloud9am         float64
cloud3pm         float64
raintoday        float64
windgustspeed    float64
windspeed3pm     float64
windspeed9am     float64
winddir3pm_0     float64
winddir3pm_1     float64
winddir3pm_2     float64
winddir3pm_3     float64
winddir3pm_4     float64
winddir3pm_5     float64
winddir3pm_6     float64
winddir3pm_7     float64
winddir9am_0     float64
winddir9am_1     float64
winddir9am_2     float64
winddir9am_3     float64
winddir9am_4     float64
winddir9am_5     float64
winddir9am_6     float64
winddir9am_7     float64
windgustdir_0    float64
windgustdir_1    float64
windgustdir_2    float64
windgustdir_3    float64
windgustdir_4    float64
windgustdir_5    float64
windgustdir_6    float64
windgustdir_7    float64
dtype: object

## Normalizando os dados

### Usarei o min-max scalar e deixarei todas as variáveis entre 0 e 1

In [14]:
scaler=MinMaxScaler(feature_range=(0, 1),copy=False)
normalized_data = scaler.fit_transform(rain_data.to_numpy())
rain_data.at[:] = normalized_data

In [15]:
rain_data.head()

Unnamed: 0,mintemp,rainfall,humidity9am,humidity3pm,cloud9am,cloud3pm,raintoday,windgustspeed,windspeed3pm,windspeed9am,...,winddir9am_6,winddir9am_7,windgustdir_0,windgustdir_1,windgustdir_2,windgustdir_3,windgustdir_4,windgustdir_5,windgustdir_6,windgustdir_7
0,0.516509,0.001617,0.71,0.22,0.888889,0.444444,0.0,0.294574,0.275862,0.153846,...,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.5
1,0.375,0.0,0.44,0.25,0.626263,0.444444,0.0,0.294574,0.252874,0.030769,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25
2,0.504717,0.0,0.38,0.3,0.626263,0.222222,0.0,0.310078,0.298851,0.146154,...,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5
3,0.417453,0.0,0.45,0.16,0.626263,0.444444,0.0,0.139535,0.103448,0.084615,...,0.0,0.75,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5
4,0.613208,0.002695,0.82,0.33,0.777778,0.888889,0.0,0.271318,0.229885,0.053846,...,0.0,0.75,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.5


# Modelos

In [16]:
target_variable.value_counts(normalize=True)

0    0.775819
1    0.224181
Name: raintomorrow, dtype: float64

### Aproximadamente 77% dos dados pertencem à classe 0 (não choveu).
### Podemos ver que a diferença entre a quantidade de cada classe é alta. Dado isso, usarei a técnica stratified k-fold para treinar os modelos

In [20]:
dict_classifiers = {"Floresta Aleatória": RandomForestClassifier(n_estimators=100),
    "Vizinhos mais próximos": KNeighborsClassifier(),
    "SVM": SVC(gamma = "scale"),
    "Árvore de Decisão": DecisionTreeClassifier(),
    "Rede Neural": MLPClassifier(max_iter = 1000),
    "Naive Bayes": GaussianNB(),
    "Regressão Logística": LogisticRegression(solver = "newton-cg", max_iter = 1000,multi_class="auto"),
    "Gradient Boosting": GradientBoostingClassifier(),
}

In [21]:
classifiers_names = list(dict_classifiers.keys())

In [22]:
classifiers_values=list(dict_classifiers.values())

In [23]:
def train_model(model,X_train,y_train,X_test,y_test):
    t0 = time.time()
    model.fit(X_train,y_train)
    tf = time.time()
    times=tf-t0
    print('Treinou em ',times,'segundos')
    prec = float("{0:.4f}".format(model.score(X_test,y_test)))
    print("Precisao = ",prec)
    
    return prec,times

### A célula abaixo demora aproximadamente 2 horas pra rodar. Se quiser diminuir consideravelmente o tempo, vá no dict_classifiers e comente a rede neural e o SVM.

In [None]:
skf = StratifiedKFold(n_splits=10)
mean_precs = []
i = 0
size = len(dict_classifiers)
folds_times = [[0]*size]*10
folds_precs = [[0]*size]*10

for train_index, test_index in skf.split(rain_data, target_variable):
    times,precs = [], []
    print('\nFOLD ',i+1,'\n')
    X_train, X_test = rain_data.loc[train_index], rain_data.loc[test_index]
    y_train, y_test = target_variable.loc[train_index], target_variable.loc[test_index]
    
    for key,value in zip(classifiers_names,classifiers_values):    
        print('Treinando',key)
        Prec,Time=train_model(value,X_train,y_train,X_test,y_test)
        times.append(Time)
        precs.append(Prec)
        
    meanT = np.mean(times)
    meanP = np.mean(precs)
    mean_precs.append(meanP)
    print("\nPrecisões dos classificadores no fold {} = {} ".format(i+1,precs))
    print("\nTempos dos classificadores no fold {} = {} ".format(i+1,times))
    print("\nPrecisao média dos classificadores no fold {} = {} ".format(i+1,meanP))
    #print("\nTempo médio dos classificadores no fold {} = {} ".format(i+1,meanT))
    
    folds_times[i] = times             #linha = fold,coluna = classificador,dado = tempo
    folds_precs[i] = precs
    i+=1

### Caso não queira executar a célula acima, execute a célula abaixo.

In [24]:
folds_precs = [[0.848 , 0.8196, 0.8421, 0.7876, 0.8497, 0.8162, 0.8459, 0.8459],
       [0.8259, 0.7797, 0.825 , 0.7245, 0.8212, 0.7932, 0.8146, 0.8255],
       [0.8429, 0.804 , 0.839 , 0.7511, 0.8391, 0.7956, 0.8322, 0.8409],
       [0.8202, 0.7949, 0.8284, 0.7251, 0.8205, 0.7759, 0.8189, 0.8195],
       [0.8364, 0.7932, 0.836 , 0.7566, 0.8335, 0.7787, 0.8341, 0.8354],
       [0.8439, 0.7968, 0.8455, 0.7406, 0.8414, 0.8002, 0.8448, 0.8465],
       [0.8331, 0.7996, 0.8293, 0.7534, 0.8351, 0.7948, 0.8325, 0.8336],
       [0.8463, 0.8205, 0.8396, 0.7844, 0.8459, 0.833 , 0.8414, 0.8474],
       [0.8416, 0.8167, 0.8413, 0.776 , 0.8465, 0.8256, 0.8446, 0.8444],
       [0.8484, 0.8166, 0.8446, 0.7671, 0.8536, 0.8285, 0.851 , 0.8545]]

folds_times = [[1.83669181e+01, 8.94497561e+00, 5.90386655e+02, 1.31317735e+00,
        2.74643789e+02, 1.19001627e-01, 2.18498802e+00, 2.24003823e+01],
       [1.80840991e+01, 1.07430053e+01, 5.59726035e+02, 9.92673874e-01,
        2.67255739e+02, 1.25016928e-01, 2.30605078e+00, 2.38303096e+01],
       [1.83022604e+01, 1.16543615e+01, 5.67408328e+02, 9.85673666e-01,
        2.55052559e+02, 1.15962267e-01, 1.79041123e+00, 1.90057604e+01],
       [1.51590223e+01, 1.92096934e+01, 5.33634631e+02, 1.09664559e+00,
        2.45488993e+02, 1.09009027e-01, 1.92912745e+00, 2.07467797e+01],
       [1.68769615e+01, 1.23524561e+01, 5.78788430e+02, 1.12067866e+00,
        2.67825186e+02, 1.14025593e-01, 1.98997474e+00, 2.14275129e+01],
       [1.71924546e+01, 1.06997895e+01, 5.77352443e+02, 1.10163617e+00,
        3.39242182e+02, 1.12964153e-01, 1.94036269e+00, 1.90557442e+01],
       [1.57103431e+01, 2.47100694e+01, 5.69047809e+02, 1.12905145e+00,
        2.47207261e+02, 1.06964827e-01, 1.74442720e+00, 1.96675427e+01],
       [1.53409631e+01, 1.00267062e+01, 5.30411877e+02, 1.11517572e+00,
        1.20664882e+02, 1.17008924e-01, 2.03301072e+00, 2.22778318e+01],
       [1.71458364e+01, 1.01467972e+01, 5.66068795e+02, 9.96672153e-01,
        2.32205997e+02, 1.16009474e-01, 2.20248985e+00, 2.21956556e+01],
       [1.76285684e+01, 1.09199631e+01, 5.81281899e+02, 1.11923933e+00,
        2.22472934e+02, 1.06965542e-01, 1.87138534e+00, 1.97745094e+01]]

In [31]:
means_time, means_prec = [],[]
folds_times = np.array(folds_times)
folds_precs = np.array(folds_precs)
size = len(dict_classifiers)
for i,j in zip(list(range(size)),classifiers_names):
    folds_mean_time = np.mean(folds_times[:,i])
    means_time.append(folds_mean_time)
    
    folds_mean_prec = np.mean(folds_precs[:,i])
    means_prec.append(folds_mean_prec)
    
    #print('\n Tempo médio do classificador {} nos folds = {} '.format(j,folds_mean_time))
    #print('\n Precisão média do classificador {} nos folds = {} '.format(j,folds_mean_prec))
    
means_time=np.round(means_time,3)
means_prec=np.round(means_prec,3)
data_times = pd.DataFrame(index=classifiers_names,data=np.transpose([means_time,means_prec]),columns=['Tempo Médio(seg)','Precisão Média'])
data_times

Unnamed: 0,Tempo Médio(seg),Precisão Média
Floresta Aleatória,16.981,0.839
Vizinhos mais próximos,12.941,0.804
SVM,565.411,0.837
Árvore de Decisão,1.097,0.757
Rede Neural,247.206,0.839
Naive Bayes,0.114,0.804
Regressão Logística,1.999,0.836
Gradient Boosting,21.038,0.839


In [74]:
fig = go.Figure([go.Bar(y=data_times["Tempo Médio(seg)"],x = classifiers_names,marker_color='blue',text=data_times["Tempo Médio(seg)"],textposition='auto')])
fig.update_layout(title_text='Médias de Tempos de Classificadores',xaxis_title="Classificador",yaxis_title="Segundo",xaxis_tickangle=-45)
fig.show()

### Sem SVM e Rede Neural

In [55]:
tempos = data_times["Tempo Médio(seg)"].values.copy()
tempos=np.delete(tempos,[2,4])
names = np.array(classifiers_names)
names=np.delete(names,[2,4])

In [71]:

fig = go.Figure([go.Bar(y=tempos,x = names,marker_color='blue',text=tempos,textposition='auto')])
fig.update_layout(title_text='Médias de Tempos de Classificadores',xaxis_title="Classificador",yaxis_title="Segundo",xaxis_tickangle=-45)
fig.show()

In [43]:
lista1 = [1,2,3,4]
lista2 = [9,9,9,9]

In [46]:
lista1=copy.copy(lista2)
del(lista1[3])
lista2

[9, 9, 9, 9]

In [70]:
fig = go.Figure([go.Bar(y=data_times["Precisão Média"],x = classifiers_names,marker_color='red',text=data_times["Precisão Média"],textposition='auto')]) 
fig.update_layout(title_text='Precisão média de Classificadores',xaxis_title="Classificador",yaxis_title="Precisão",xaxis_tickangle=-45)
fig.show()

### Sem SVM e Rede Neural

In [63]:
precisoes = data_times["Precisão Média"].values.copy()
precisoes=np.delete(precisoes,[2,4])

In [68]:
fig = go.Figure([go.Bar(y=precisoes,x = names,marker_color='red',text=precisoes,textposition='auto')]) 
fig.update_layout(title_text='Precisão média de Classificadores',xaxis_title="Classificador",yaxis_title="Precisão",xaxis_tickangle=-45)
fig.show()