In [None]:
import pandas as pd

# Read the raw sales export into a DataFrame and list its column labels.
AMAZON_CSV_PATH = "Amazon.csv"
df_amazon = pd.read_csv(AMAZON_CSV_PATH)
df_amazon.columns


In [None]:
# Peek at the last five values of the 'Unnamed: 22' column.
df_amazon.loc[:, 'Unnamed: 22'].tail(5)

In [None]:
# Distinct values found in 'Unnamed: 22' (NaN is reported as a value too).
df_amazon.loc[:, 'Unnamed: 22'].unique()

In [None]:
# Remove the 'Unnamed: 22' column inspected above.
df_amazon = df_amazon.drop('Unnamed: 22', axis=1)

In [None]:
# Count the distinct non-null values of 'Sales Channel ' (note the trailing space in the label).
df_amazon.loc[:, 'Sales Channel '].nunique(dropna=True)


In [None]:
# Remove 'Sales Channel ' (label includes a trailing space).
df_amazon = df_amazon.drop(labels=['Sales Channel '], axis='columns')

In [None]:
# Remove the 'index' data column (this is a regular column, not the DataFrame index).
df_amazon = df_amazon.drop('index', axis='columns')

In [None]:
# Preview the first five rows after the drops so far.
df_amazon.head(n=5)

In [None]:
# Print every remaining column label as a plain Python list.
print(list(df_amazon.columns))


In [None]:
# Distinct values found in 'currency'.
df_amazon.loc[:, 'currency'].unique()

In [None]:
# Remove the 'currency' column inspected above.
df_amazon = df_amazon.drop('currency', axis=1)

In [None]:
# Distinct values found in 'fulfilled-by'.
df_amazon.loc[:, 'fulfilled-by'].unique()

In [None]:
# Remove the 'ship-country' column.
df_amazon = df_amazon.drop(labels='ship-country', axis='columns')

In [None]:
# Remove the 'fulfilled-by' column inspected earlier.
df_amazon = df_amazon.drop('fulfilled-by', axis=1)

In [None]:
# Inspect the dtype pandas assigned to each remaining column.
df_amazon.dtypes

In [None]:
# (rows, columns) after the column drops above.
df_amazon.shape

In [None]:
# Missing-value count for every column.
df_amazon.isna().sum()

In [None]:
# Disabled at this point: the same dropna() is applied further down,
# after the 'promotion-ids' column has been removed.
#df_amazon = df_amazon.dropna()

In [None]:
# Missing-value count per column (the dropna above is commented out, so nothing was removed).
df_amazon.isna().sum()

In [None]:
# (rows, columns) of the frame at this point.
df_amazon.shape

In [None]:
# Boolean mask with the same shape as the frame: True where a value is missing.
df_amazon.isna()

In [None]:
# First five rows of the current frame.
df_amazon.head(n=5)

In [None]:
# Column labels still present in the frame.
df_amazon.columns

In [None]:
# Missing-value count per column, checked before 'promotion-ids' is dropped.
df_amazon.isna().sum()

In [None]:
# (rows, columns) before 'promotion-ids' is dropped.
df_amazon.shape

In [None]:
# Remove the 'promotion-ids' column before the row-wise dropna below.
df_amazon = df_amazon.drop(labels=['promotion-ids'], axis=1)

In [None]:
# (rows, columns) after 'promotion-ids' was dropped; only the column count changes.
df_amazon.shape

In [None]:
# Missing-value count per column for the remaining columns.
df_amazon.isna().sum()

In [None]:
# Discard every row that still has at least one missing value.
df_amazon = df_amazon.dropna(axis=0, how='any')

In [None]:
# Display the frame after dropna (the notebook shows a truncated preview when the frame is large).
df_amazon

In [None]:
# Parse 'Date' into datetime64; values that cannot be parsed become NaT rather than raising.
df_amazon = df_amazon.assign(Date=pd.to_datetime(df_amazon["Date"], errors="coerce"))

In [None]:
# Re-check dtypes to see the result of the 'Date' conversion above.
df_amazon.dtypes

In [None]:
# Show every row whose 'Order ID' occurs more than once (all occurrences are kept in the view).
repeated_order_mask = df_amazon.duplicated(subset=['Order ID'], keep=False)
df_amazon.loc[repeated_order_mask]

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Linear-regression model of the "next period" order amount per location.
#
# Produces (and leaves in the notebook namespace) df_amazon with the engineered
# columns, cat_features, num_features, label_encoders, scaler and model.
#
# Evaluation hygiene applied here:
#   * rows without an observed target are dropped instead of receiving a
#     median-imputed label;
#   * the hold-out split is chronological (the frame itself is sorted by
#     state/city, so a positional split on it would be alphabetical);
#   * imputation medians and the scaler are fitted on the training part only.
#
# NOTE(review): the "month" lags shift by a fixed number of ROWS inside each
# (state, city) group, not by calendar days. A true monthly lag would need the
# data aggregated per location and day/month first.
# ---------------------------------------------------------------------------
LAG_1_ROWS = 30       # rows back for the "1 month" lag
LAG_2_ROWS = 60       # rows back for the "2 month" lag
HORIZON_ROWS = 30     # rows ahead used as the prediction target
ROLLING_ROWS = 30     # window of the rolling mean
TRAIN_FRACTION = 0.8  # earliest share of rows used for fitting

# --- Feature engineering ----------------------------------------------------
df_amazon['Date'] = pd.to_datetime(df_amazon['Date'])
df_amazon = df_amazon.sort_values(['ship-state', 'ship-city', 'Date']).reset_index(drop=True)

# All group-wise series are computed from the same sorted frame before any
# column is added, so they align on the freshly reset index.
amount_by_location = df_amazon.groupby(['ship-state', 'ship-city'])['Amount']
qty_by_location = df_amazon.groupby(['ship-state', 'ship-city'])['Qty']
df_amazon = df_amazon.assign(
    year=df_amazon['Date'].dt.year,
    month=df_amazon['Date'].dt.month,
    amount_lag_1month=amount_by_location.shift(LAG_1_ROWS),
    amount_lag_2month=amount_by_location.shift(LAG_2_ROWS),
    qty_lag_1month=qty_by_location.shift(LAG_1_ROWS),
    amount_rolling_30d=amount_by_location.transform(
        lambda s: s.rolling(ROLLING_ROWS, min_periods=1).mean()
    ),
    next_month_amount=amount_by_location.shift(-HORIZON_ROWS),
)

cat_features = ['ship-state', 'ship-city', 'ship-postal-code']
num_features = ['year', 'month', 'amount_lag_1month', 'amount_lag_2month', 'qty_lag_1month', 'amount_rolling_30d']
target_col = 'next_month_amount'

# --- Modelling table ----------------------------------------------------------
# Keep only rows whose target was actually observed and order them by time so
# the positional split below is chronological (mergesort keeps ties stable).
df_model = (
    df_amazon
    .dropna(subset=[target_col])
    .sort_values('Date', kind='mergesort')
    .loc[:, cat_features + num_features + [target_col]]
    .copy()
)
if len(df_model) < 2:
    raise ValueError(
        "Not enough rows with an observed target to train and evaluate; "
        f"every location needs more than {HORIZON_ROWS} rows to contribute."
    )
for col in cat_features:
    df_model[col] = df_model[col].fillna('Unknown').astype(str)

print("Shape final:", df_model.shape)
X = df_model[cat_features + num_features].copy()
y = df_model[target_col]

# Encoders learn the vocabulary of the FULL frame (not just the modelling rows)
# so that later cells can encode any location present in df_amazon.
# NOTE(review): label codes are arbitrary integers; a linear model treats them
# as ordered magnitudes, so these coefficients are hard to interpret.
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    le.fit(df_amazon[col].fillna('Unknown').astype(str))
    X[col] = le.transform(X[col])
    label_encoders[col] = le

# Float dtype up front so the scaled values can be written back unchanged.
X[num_features] = X[num_features].astype(float)

# --- Chronological hold-out split --------------------------------------------
split_idx = int(len(X) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split_idx].copy(), X.iloc[split_idx:].copy()
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Impute with statistics from the training part only; a column that is
# entirely missing in training falls back to 0.
train_medians = X_train[num_features].median().fillna(0.0)
X_train[num_features] = X_train[num_features].fillna(train_medians)
X_test[num_features] = X_test[num_features].fillna(train_medians)

scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# --- Fit and evaluate ---------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# The figure printed below is the RMSE on the hold-out rows.
print(f" NEXT MONTH AVERAGE ERROR: ${rmse:.2f}")
print(f" NEXT MONTH R^2 (hold-out): {r2:.3f}")

# --- Coefficients -------------------------------------------------------------
# Rank by absolute value so strong negative predictors are not hidden.
# Numeric features are standardized; the label-encoded ones are not, so their
# magnitudes are not directly comparable with the numeric ones.
features_all = cat_features + num_features
coefs = pd.DataFrame({'feature': features_all, 'coef': model.coef_})
coefs = coefs.reindex(coefs['coef'].abs().sort_values(ascending=False).index)
print("\nTop 10 Features:")
print(coefs.head(10))

fig, ax = plt.subplots(figsize=(10, 6))
coefs.head(8).plot(kind='barh', x='feature', y='coef', ax=ax)
ax.set_title('Next Month Sales Amount - Key Predictors by Location')
ax.set_xlabel('Coefficient')
fig.tight_layout()
plt.show()

In [None]:
# amount_lag_2month (0.37): sales 2 months ago are the best predictor of next month. Trends persist!
# amount_rolling_30d (0.29): the 30-day rolling average is a strong signal.
# amount_lag_1month (0.25): recent history confirms the momentum.
# qty_lag_1month (0.11): historical quantity boosts future value.

In [None]:
# RMSE measures the typical forecast error; the lower it is, the more reliable the decisions based on the forecast.
# Better organization: stock allocation, logistics planning.

In [None]:
# ---------------------------------------------------------------------------
# Score the most recent orders with the fitted model and rank locations by
# predicted revenue for the forecast period.
#
# Relies on objects created by the modelling cell: df_amazon (with the lag
# columns), cat_features, num_features, label_encoders, scaler and model.
# ---------------------------------------------------------------------------
FORECAST_YEAR = 2026
FORECAST_MONTH = 2   # February
N_LATEST_ROWS = 1000
TOP_N = 10

# Fail early with a clear message instead of silently substituting rolling
# means for lag features the model was not trained on.
required_cols = cat_features + num_features + ['Date', 'Amount']
missing_cols = sorted(set(required_cols) - set(df_amazon.columns))
if missing_cols:
    raise KeyError(f"Run the modelling cell first; df_amazon lacks: {missing_cols}")

latest_data = df_amazon.sort_values('Date').tail(N_LATEST_ROWS).copy()
latest_data['forecast_year'] = FORECAST_YEAR
latest_data['forecast_month'] = FORECAST_MONTH

X_forecast = latest_data[cat_features + num_features].copy()
# Feed the forecast period to the model; without this the prediction is made
# for the historical year/month of each row.
# NOTE(review): a linear model extrapolates 'year'/'month' outside the range
# seen in training - sanity-check the magnitudes of the predictions.
X_forecast['year'] = FORECAST_YEAR
X_forecast['month'] = FORECAST_MONTH

for col in cat_features:
    X_forecast[col] = X_forecast[col].fillna('Unknown').astype(str)
    X_forecast[col] = label_encoders[col].transform(X_forecast[col])

# Fill gaps with medians of the full frame rather than of this 1000-row slice,
# so imputation does not depend on which rows happen to be the most recent.
X_forecast[num_features] = (
    X_forecast[num_features].astype(float).fillna(df_amazon[num_features].median())
)
X_forecast[num_features] = scaler.transform(X_forecast[num_features])
latest_data['feb2026_pred'] = model.predict(X_forecast)

top_states_fev = (
    latest_data.groupby('ship-state')['feb2026_pred'].sum()
    .sort_values(ascending=False)
    .head(TOP_N)
)
print("🏆 TOP 10 STATES - Fevereiro 2026 Predicted Revenue:")
print(top_states_fev.round(0).astype(int))

top_cities_fev = (
    latest_data.groupby('ship-city')['feb2026_pred'].sum()
    .sort_values(ascending=False)
    .head(TOP_N)
)
print("\n🏙️ TOP 10 CITIES - FEBRUARY 2026:")
print(top_cities_fev.round(0).astype(int))

# Growth compares like with like: predicted revenue vs. actual revenue summed
# over the same orders (a sum divided by a per-order mean is not a ratio).
# historical_avg is kept for any later cell that reads it.
historical_avg = latest_data.groupby('ship-state')['Amount'].mean()
historical_total = latest_data.groupby('ship-state')['Amount'].sum()
baseline_total = historical_total.reindex(top_states_fev.index)
baseline_total = baseline_total.where(baseline_total != 0)  # avoid division by zero -> NaN
growth_fev = (top_states_fev / baseline_total).sort_values(ascending=False).head()
print("\n📈 Highest Growth States Feb 2026:")
print(growth_fev.round(2))