In [None]:
# Install all the required libraries
%pip install pandas
%pip install scikit-learn
%pip install request
%pip install matplotlib
%pip install numpy
%pip install joblib

In [None]:
import pandas as pd

# Load the weather observations from disk into a DataFrame
weather_csv = '../datasets/weather.csv'
df = pd.read_csv(weather_csv)

In [None]:
# Count how many rows carry each weather label and print the tally
weather_counts = df['Weather'].value_counts()
print(weather_counts)

In [None]:
# Preview the first five rows of the DataFrame
df.head(n=5)

In [None]:
# Remove the 'Date/Time' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns=['Date/Time'], errors='ignore')


In [None]:
# Map the raw CSV headers (which embed units) to short, identifier-friendly names
column_names = {
    'Temp_C': 'Temp',
    'Dew Point Temp_C': 'DewPoint',
    'Rel Hum_%': 'Humidity',
    'Wind Speed_km/h': 'WindSpeed',
    'Visibility_km': 'Visibility',
    'Press_kPa': 'Pressure',
}
df = df.rename(columns=column_names)

# Display the resulting column index
df.columns


In [None]:
from pprint import pprint

# Collect the maximum and minimum of each numeric feature column.
# The result maps column name -> {'max': ..., 'min': ...}.
ls = ['Temp', 'DewPoint', 'Humidity', 'WindSpeed', 'Visibility', 'Pressure']
data = {}

for col in ls:
    # Compute the values inline rather than binding them to `max` / `min`:
    # in a notebook those names would shadow the Python builtins for every
    # later cell in the same kernel session.
    data[col] = {
        'max': df[col].max(),
        'min': df[col].min()
    }

pprint(data)

In [None]:
# Separate the feature matrix (every column except 'Weather') from the target labels
feature_frame = df.drop(columns=['Weather'])
X = feature_frame.values
Y = df['Weather'].values

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Baseline classifier: logistic regression with the lbfgs solver
baseline_clf = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=3, verbose=0)

# Preprocessing + classifier chain: mean-impute missing values, scale, then classify.
# NOTE(review): MinMaxScaler is applied right after StandardScaler, so the
# standardisation step looks redundant — confirm before removing it.
baseline_steps = [
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MinMaxScaler(),
    baseline_clf,
]
p = make_pipeline(*baseline_steps)

# Fit the whole pipeline on the training split
p.fit(X_train, Y_train)



In [None]:
# Predict the label of the first test sample (wrapped in a list to form a one-row batch)
first_sample = X_test[0]
Y_pred = p.predict([first_sample])

# Print the predicted label next to the true one
print(Y_pred, Y_test[0])

In [None]:
# Evaluate the fitted pipeline on the held-out test split
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score

# Predict once and reuse the result for every metric instead of re-running
# the full pipeline on X_test for each print.
Y_test_pred = p.predict(X_test)

print("Accuracy Score",accuracy_score(Y_test, Y_test_pred))

print("Confusion Matrix: ",confusion_matrix(Y_test, Y_test_pred))

# 'weighted' averages the per-class F1 scores weighted by class support
print("F1 Score:", f1_score(Y_test, Y_test_pred, average='weighted'))


In [None]:
# Grid search over the logistic-regression hyperparameters

from sklearn.model_selection import GridSearchCV


# Create a fresh pipeline to tune (same steps as the baseline)
p = make_pipeline(SimpleImputer(strategy='mean'),
                  StandardScaler(),
                  MinMaxScaler(), 
                  LogisticRegression(max_iter=1000, n_jobs=3, verbose=0, solver='lbfgs'))

# Hyperparameter grid, expressed as one sub-grid per solver.
# A single Cartesian grid would also try solver='lbfgs' with penalty='l1',
# which LogisticRegression rejects (lbfgs does not support the l1 penalty),
# so those fits would fail. Restricting each solver to the penalties it
# supports keeps every candidate valid.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {
        'logisticregression__solver': ['liblinear'],
        'logisticregression__penalty': ['l1', 'l2'],
        'logisticregression__C': c_values,
    },
    {
        'logisticregression__solver': ['lbfgs'],
        'logisticregression__penalty': ['l2'],
        'logisticregression__C': c_values,
    },
]

# Run the search with 5-fold cross-validation on the training split
cv = GridSearchCV(p, param_grid, n_jobs=3, verbose=0, cv=5)
cv.fit(X_train, Y_train)

# Show the best hyperparameters
pprint(cv.best_params_)

In [None]:
# Use the best model found by the grid search.
# Take it from `cv` instead of hard-coding the winning hyperparameters, so this
# cell stays correct if the data or the grid changes. With GridSearchCV's
# default refit=True, best_estimator_ is the winning pipeline already refit on
# the whole training split, so no extra fit is required here.
p = cv.best_estimator_

# Keep `model` pointing at the tuned classifier step of the pipeline
model = p.named_steps['logisticregression']

p


In [None]:
# Persist the fitted pipeline to disk so it can be reloaded without retraining
import joblib

model_path = 'weather.pkl'
joblib.dump(p, model_path)