In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score, roc_curve, auc
# HalvingRandomSearchCV is still experimental in scikit-learn: the enabling
# import below must run BEFORE importing it from sklearn.model_selection,
# otherwise the import raises ImportError on a fresh kernel.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV
# from skopt import BayesSearchCV  # Removed: it requires a different scikit-learn version
# from skopt import gp_minimize

In [2]:
# Read prep_data.csv (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="prep_data.csv")
# Print the column list with non-null counts and dtypes as a quick sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123710 entries, 0 to 123709
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       123710 non-null  int64  
 1   MinTemp        123710 non-null  float64
 2   MaxTemp        123710 non-null  float64
 3   Rainfall       123710 non-null  float64
 4   Evaporation    123710 non-null  float64
 5   Sunshine       123710 non-null  float64
 6   WindGustDir    123710 non-null  int64  
 7   WindGustSpeed  123710 non-null  float64
 8   WindDir9am     123710 non-null  int64  
 9   WindDir3pm     123710 non-null  int64  
 10  WindSpeed9am   123710 non-null  float64
 11  WindSpeed3pm   123710 non-null  float64
 12  Humidity9am    123710 non-null  float64
 13  Humidity3pm    123710 non-null  float64
 14  Pressure9am    123710 non-null  float64
 15  Pressure3pm    123710 non-null  float64
 16  Cloud9am       123710 non-null  float64
 17  Cloud3pm       123710 non-nul

## Primero, hay que intentar hacer las predicciones con cada uno de los modelos

Lo haremos para descartar los modelos con peores métricas finales y con mayor tiempo de ejecución (en cualquier caso guardamos SVC, porque lo necesitaremos al final).

In [3]:
# Target column to predict; every remaining column is used as a feature.
y = data["RainTomorrow"]
X = data.drop(columns="RainTomorrow")

# Keep 10% of the rows aside for validation; the fixed seed makes the split repeatable.
x_t, x_v, y_t, y_v = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=23,
)

MLPClassifier con valores por defecto:

In [4]:
# Baseline multi-layer perceptron: default hyper-parameters, only the seed is fixed.
# fit() returns the estimator itself, so construction and training are chained.
mlpc = MLPClassifier(random_state=23).fit(x_t, y_t)
predictions = mlpc.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.6337296935186737

Random Forest con valores por defecto:

In [5]:
# Baseline random forest: default hyper-parameters, fixed seed,
# n_jobs=-1 so training uses every available CPU core.
rfc = RandomForestClassifier(n_jobs=-1, random_state=23).fit(x_t, y_t)
predictions = rfc.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.6226959804574728

GradientBoostingClassifier con valores por defecto:

In [6]:
# Baseline gradient boosting: default hyper-parameters, only the seed is fixed.
gbc = GradientBoostingClassifier(random_state=23).fit(x_t, y_t)
predictions = gbc.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.6139004604253453

Decision Tree Classifier con valores por defecto:

In [7]:
# Baseline single decision tree: default hyper-parameters, only the seed is fixed.
dtc = DecisionTreeClassifier(random_state=23).fit(x_t, y_t)
predictions = dtc.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.5366293756766511

SGD Classifier con valores por defecto:

In [8]:
# Baseline linear model trained with SGD: default hyper-parameters, only the seed is fixed.
# NOTE(review): SGD-based estimators are sensitive to feature scaling and the
# features are passed here as loaded - confirm whether prep_data.csv is already
# standardized before reading too much into this score.
sgdc = SGDClassifier(random_state=23).fit(x_t, y_t)
predictions = sgdc.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.1916364833717484

Gaussian Naïve Bayes con valores por defecto:

In [9]:
# Baseline Gaussian naive Bayes with default settings (the constructor takes no seed here).
gnb = GaussianNB().fit(x_t, y_t)
predictions = gnb.predict(x_v)
# Last expression of the cell, so the notebook displays the validation F1 score.
f1_score(y_true=y_v, y_pred=predictions)

0.5844670225589806

Los resultados obtenidos hacen que descartemos el SGDClassifier. Los demás modelos muestran un f1_score mayor que 0.5. Además, el Gaussian NB y Decision Tree