In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print one path per line.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_file in input_files:
    print(input_file)

## Visualização dos dados 

In [None]:
# Path of the CSV file that is loaded into the working DataFrame.
dataset_path = "/kaggle/input/world-air-quality-index-by-city-and-coordinates/AQI and Lat Long of Countries.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first 10 rows of the loaded DataFrame.
df.head(10)

In [None]:
# Preview the last 10 rows of the loaded DataFrame.
df.tail(10)

In [None]:
# List the column labels available in the DataFrame.
df.columns

## Visualização em gráficos

In [None]:
# Scatter every row of df on a world map at its (lng, lat) position,
# using one color per country.
fig = px.scatter_mapbox(
    data_frame=df,
    lon='lng',
    lat='lat',
    color='Country',
    width=1000,
    height=700,
    zoom=1,
    title='Countries locations',
)
# 'open-street-map' needs no access token. The Stamen styles (such as
# 'stamen-terrain') are now served through Stadia Maps and require an
# account/token, so without one the base map does not render.
# The legend is hidden because there is one entry per country.
fig.update_layout(mapbox_style='open-street-map', showlegend=False)
fig.show()

In [None]:
# Scatter every row of df on a world map at its (lng, lat) position,
# colored by its AQI category.
# NOTE(review): the colors below are assigned to categories in the order
# plotly encounters them in the data, not by severity -- confirm the resulting
# color/category pairing is the intended one.
fig = px.scatter_mapbox(
    data_frame=df,
    lon='lng',
    lat='lat',
    color='AQI Category',
    width=900,
    height=650,
    zoom=1,
    color_discrete_sequence=["blue", "green", "yellow", "goldenrod", "orange", "red"],
    title='Air Quality of the regions of countries',
)

# 'open-street-map' needs no access token. The Stamen styles (such as
# 'stamen-terrain') are now served through Stadia Maps and require an
# account/token, so without one the base map does not render.
# The legend is laid out horizontally.
fig.update_layout(mapbox_style='open-street-map', legend=dict(orientation="h"))
fig.show()

## Tratamento dos dados

### Removendo valores nulos

In [None]:
# Count the missing values in each column before any cleaning.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Re-count missing values per column to confirm the rows with nulls are gone.
df.isna().sum()

### Detectando Outliers

In [None]:
# Box plot of the 'AQI Value' column, used to spot extreme values visually.
fig_box = px.box(data_frame=df, y='AQI Value')
fig_box.show()

In [None]:
# Rows whose 'AQI Value' is at or above 500, the cutoff used here for outliers.
outliers_AQIValue = df.loc[df['AQI Value'].ge(500)]
outliers_AQIValue

In [None]:
# Box plot of the 'Ozone AQI Value' column, used to spot extreme values visually.
fig_box = px.box(data_frame=df, y='Ozone AQI Value')
fig_box.show()

In [None]:
# Rows whose 'Ozone AQI Value' is at or above 210, the cutoff used here for outliers.
outliers_Ozone_AQIValue = df.loc[df['Ozone AQI Value'].ge(210)]
outliers_Ozone_AQIValue

In [None]:
# Remove the coordinate columns, which were only needed for the maps.
# errors='ignore' keeps this cell re-runnable: an in-place drop raises KeyError
# the second time it is executed because the columns are already gone.
df = df.drop(columns=['lat', 'lng'], errors='ignore')

In [None]:
# Display the DataFrame after the 'lat' and 'lng' columns were dropped.
df

## Separação dos previsores x classes

In [None]:
# Distinct values of the target column 'AQI Category'.
df['AQI Category'].unique()

In [None]:
# Predictors: every remaining column except the target, as a NumPy array.
# NOTE(review): 'AQI Value' stays among the predictors; if 'AQI Category' is
# derived from it the classification task becomes trivial -- confirm intended.
X = df.drop(columns=['AQI Category']).values

# Target: the 'AQI Category' column, kept as a pandas Series.
y = df['AQI Category']

In [None]:
# Inspect the predictor array.
X

In [None]:
# Dimensions of the predictor array.
X.shape

In [None]:
# Inspect the target Series.
y

In [None]:
# Dimensions of the target Series.
y.shape

## Pré-Processamento

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Positions in X of the columns that are replaced by integer codes.
categorical_columns = [0, 1, 4, 6, 8, 10]

# One encoder per column, kept in a dict keyed by column position. Re-fitting a
# single shared encoder would overwrite the mapping learned for the previous
# column, making it impossible to encode new data or decode the codes later.
encoders = {}
for column_index in categorical_columns:
    encoders[column_index] = LabelEncoder()
    X[:, column_index] = encoders[column_index].fit_transform(X[:, column_index])

# Backward compatibility: `label` still refers to the encoder fitted on the
# last encoded column (position 10).
label = encoders[categorical_columns[-1]]

In [None]:
# Inspect X after the selected columns were label-encoded in place.
X

## Separação treino teste

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Dimensions of the training and test predictor arrays.
X_train.shape, X_test.shape

In [None]:
# Dimensions of the training and test targets.
y_train.shape, y_test.shape

## Treinamento do classificador
- Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Buscando os melhores parâmetros

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid explored by the grid search: split criterion and forest size.
parametros = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[10, 20, 40, 100, 150],
)

In [None]:
# Exhaustive search over `parametros` using GridSearchCV's default
# cross-validation settings.
# The estimator gets a fixed random_state (the same seed used elsewhere in this
# notebook) so the scores and the selected parameters are reproducible between
# runs; an unseeded forest can pick a different "best" combination each time.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=parametros,
)
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Fit do classificador

In [None]:
# Train the evaluation model with the hyperparameters selected by the grid
# search (`melhores_parametros`) instead of hand-copied values, which had
# drifted from the recorded search result. The seed keeps the fit repeatable.
clf = RandomForestClassifier(**melhores_parametros, random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Predict the class of every test sample with the trained forest, then show the predictions.
predict = clf.predict(X_test)
predict

In [None]:
# True classes of the test samples, for comparison with the predictions.
y_test

In [None]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=predict)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
# Wrap the classifier in yellowbrick's ConfusionMatrix, fit it on the training
# split and score it on the test split.
cm = ConfusionMatrix(clf)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)

## Salvamento do classificador

In [None]:
from sklearn.base import clone

# Model to be persisted: a fresh, unfitted copy of the evaluated classifier
# `clf` (same hyperparameters), trained on the complete dataset. Cloning avoids
# repeating the hyperparameters by hand, so the saved model cannot drift from
# the one that was evaluated.
Classificador_AQI_RF = clone(clf)
Classificador_AQI_RF.fit(X, y)

In [None]:
import pickle

In [None]:
# Serialize the trained classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling fails.
with open("classificador_random_forest.sav", 'wb') as model_file:
    pickle.dump(Classificador_AQI_RF, model_file)