# Walmart Dataset
**Fonte dos dados:** [Kaggle](https://www.kaggle.com/datasets/yasserh/walmart-dataset/data)

In [2]:
# Import libraries and load the data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV via a relative path, so the file must sit in the working directory
dados = pd.read_csv("Walmart.csv")
# Bare last expression: the notebook renders the frame as the cell output
dados

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [None]:
# Print the frame summary (columns, non-null counts, dtypes)
dados.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns
dados.describe()

In [None]:
# Count missing values per column
dados.isna().sum()

In [None]:
# Count distinct values per column
dados.nunique()

In [None]:
# Convert the "Date" column from day-month-year strings to datetime values

dados = dados.assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format="%d-%m-%Y")
)
dados

In [None]:
# Print the frame summary again to inspect the dtypes after the "Date" conversion
dados.info()

In [None]:
# Add the calendar columns "Year" and "Month", derived from "Date"

dados = dados.assign(
    Year=dados['Date'].dt.year,
    Month=dados['Date'].dt.month,
)
dados

In [None]:
# Remove the "Date" column, then print the frame summary to confirm it is gone
dados = dados.drop(columns=['Date'])
dados.info()

In [None]:
# Count rows that are exact duplicates of an earlier row
dados.duplicated().sum()

In [None]:
# Box plot of every feature, one panel each (4 x 2 grid), to eyeball outliers.
cols = ['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month']
box_fig, box_axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 18))
for col, panel in zip(cols, box_axes.flat):
    # Explicit `data=` / `ax=` keywords: works across seaborn versions and
    # does not depend on matplotlib's "current axes" global state.
    sns.boxplot(data=dados, x=col, color='red', ax=panel)
plt.show()

In [None]:
# Summary statistics for the "Temperature" and "Unemployment" columns
for column_name in ('Temperature', 'Unemployment'):
    print(dados[column_name].describe())

In [13]:
# Remove outliers from the "Temperature" and "Unemployment" columns

# Rows flagged for removal; comparisons are False for NaN, so NaN rows are kept
too_cold = dados['Temperature'] < 7.5
unemployment_out_of_range = (dados['Unemployment'] < 4.4) | (dados['Unemployment'] > 10.9)

dados = dados.loc[~(too_cold | unemployment_out_of_range)]

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
sns.heatmap(
    data=dados.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)

In [None]:
# Correlation of every column with the "Weekly_Sales" column
dados.corr()['Weekly_Sales']

In [16]:
# Author's note - 5 best features: Store, Temperature, CPI, Unemployment, Month
# NOTE(review): the ranking criterion is not shown in the code (presumably the
# correlations above - confirm); the X/y split still uses every column.

In [17]:
# Separate the target (y) from the feature matrix (X)

y = dados['Weekly_Sales']
X = dados.drop(columns=['Weekly_Sales'])

In [18]:
# Train/test split: hold out 33% of the rows, with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
# Importação modelos

from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import HuberRegressor, LinearRegression

In [20]:
# Instantiate each regressor and fit it on the training split

reg_rlinear = linear_model.LinearRegression().fit(X_train, y_train)
reg_knn_3 = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
reg_knn_5 = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
# Seeded (same seed as the random forest below) so the fitted tree, and every
# metric computed from it, is reproducible on Restart & Run All.
reg_tree = tree.DecisionTreeRegressor(random_state=5).fit(X_train, y_train)
reg_RKNN = RadiusNeighborsRegressor(radius = 800).fit(X_train, y_train)
reg_rforest = RandomForestRegressor(n_estimators=800, random_state=5).fit(X_train,y_train)
reg_Hub = HuberRegressor(max_iter = 800).fit(X_train, y_train)

In [21]:
# Predict the test split with every fitted regressor (same order as fitting)

(
    y_pred_RL,
    y_pred_knn_3,
    y_pred_knn_5,
    y_pred_tree,
    y_pred_rknn,
    y_pred_rforest,
    y_pred_hub,
) = (
    regressor.predict(X_test)
    for regressor in (
        reg_rlinear,
        reg_knn_3,
        reg_knn_5,
        reg_tree,
        reg_RKNN,
        reg_rforest,
        reg_Hub,
    )
)

In [None]:
# Mean of the test-set target; the error percentages computed later divide by it
y_test.mean()

In [None]:
# Evaluate the mean absolute error of each model on the test split

from sklearn.metrics import mean_absolute_error

for label, predictions in (
    ('Média do erro absoluto - Reg. Linear:', y_pred_RL),
    ('Média do erro absoluto - KNN_3:', y_pred_knn_3),
    ('Média do erro absoluto - KNN_5:', y_pred_knn_5),
    ('Média do erro absoluto - Árvore:', y_pred_tree),
    ('Média do erro absoluto - KNN_radius:', y_pred_rknn),
    ('Média do erro absoluto - Random Forest:', y_pred_rforest),
    ('Média do erro absoluto - Huber Regressor:', y_pred_hub),
):
    print(label, mean_absolute_error(y_test, predictions))

In [None]:
# Error of each model as a percentage: MAE divided by the mean test target

mean_target = y_test.mean()
for label, predictions in (
    ("Percentagem Reg. Linear:", y_pred_RL),
    ("Percentagem KNN_3:", y_pred_knn_3),
    ("Percentagem KNN_5:", y_pred_knn_5),
    ("Percentagem Árvore:", y_pred_tree),
    ("Percentagem KNN_radius:", y_pred_rknn),
    ("Percentagem Random Forest:", y_pred_rforest),
    ("Percentagem Huber Regressor:", y_pred_hub),
):
    error_ratio = mean_absolute_error(y_test, predictions) / mean_target
    print(label, round((error_ratio * 100), 1), '%')

In [None]:
# Visual check: predicted (x axis) vs. actual (y axis) weekly sales, one panel per model

predictions_by_model = {
    'Reg Linear': y_pred_RL,
    'KNN_3': y_pred_knn_3,
    'KNN_5': y_pred_knn_5,
    'Tree': y_pred_tree,
    'KNN_radius': y_pred_rknn,
    'Random Forest': y_pred_rforest,
    'Huber': y_pred_hub,
}

fig, ax = plt.subplots(ncols=len(predictions_by_model), figsize=(20, 10))
for panel, (title, predictions) in zip(ax, predictions_by_model.items()):
    panel.scatter(predictions, y_test)
    # Perfect-prediction reference: the y = x diagonal across the plotted range.
    # (The previous line ran to (3e6, 4e6), i.e. slope 4/3, so points on it were
    # not perfect predictions.)
    upper = max(y_test.max(), predictions.max())
    panel.plot([0, upper], [0, upper], '--r')
    panel.set_title(title)
    panel.set_xlabel('Previsto')
ax[0].set_ylabel('Real')

plt.show()

In [None]:
# Fit a separate random forest to rank the features by importance.
# Seeded (same seed as reg_rforest) so the ranking is reproducible across runs;
# an unseeded forest gives slightly different importances on every execution.
modelo = RandomForestRegressor(random_state=5)
modelo.fit(X_train, y_train)

# Get the importance of each feature from the fitted forest
feature_importances = modelo.feature_importances_

# Build a DataFrame pairing each training column with its importance
feature_importances_data = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the features from most to least important
feature_importances_data = feature_importances_data.sort_values(by = 'Importance', ascending=False)

# Print the ranking
print(feature_importances_data)