In [5]:
# Import the packages used for the SQL Server connection, data handling and plotting
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from src.db.connections import SqlServerConnector

# Load and Pre-process Data

In [6]:
# Open the SQL Server connection. The class name has to match the one imported
# at the top of the notebook (`SqlServerConnector`); `SqlConnector` is never
# imported, so the old spelling raised NameError on a fresh kernel.
conn = SqlServerConnector(
    user="voliveira",
    # Read from the environment so the password never lands in the notebook.
    # NOTE(review): `.get` yields None when SQL_PASSWORD is unset — confirm the
    # connector reports that case clearly.
    password=os.environ.get("SQL_PASSWORD"),
    host="ROMPETECHOS",
    port="53373",
)

In [7]:
# Pull the header table (dbo.CABEALBV) and the line table (dbo.LINEALBA) from
# every database on the REPLICA instance, header first and lines second.

# --- Airzone ---
cabe_airzone = conn.query_data(
    database="Airzone",
    instance="REPLICA",
    query="SELECT * FROM dbo.CABEALBV",
)
lin_airzone = conn.query_data(
    database="Airzone",
    instance="REPLICA",
    query="SELECT * FROM dbo.LINEALBA",
)

# --- AirzoneFrance ---
cabe_airzone_france = conn.query_data(
    database="AirzoneFrance",
    instance="REPLICA",
    query="SELECT * FROM dbo.CABEALBV",
)
lin_airzone_france = conn.query_data(
    database="AirzoneFrance",
    instance="REPLICA",
    query="SELECT * FROM dbo.LINEALBA",
)

# --- AirzoneItalia ---
cabe_airzone_italia = conn.query_data(
    database="AirzoneItalia",
    instance="REPLICA",
    query="SELECT * FROM dbo.CABEALBV",
)
lin_airzone_italia = conn.query_data(
    database="AirzoneItalia",
    instance="REPLICA",
    query="SELECT * FROM dbo.LINEALBA",
)

# --- AirzoneUsa ---
cabe_airzone_usa = conn.query_data(
    database="AirzoneUsa",
    instance="REPLICA",
    query="SELECT * FROM dbo.CABEALBV",
)
lin_airzone_usa = conn.query_data(
    database="AirzoneUsa",
    instance="REPLICA",
    query="SELECT * FROM dbo.LINEALBA",
)

# --- Altra ---
cabe_airzone_altra = conn.query_data(
    database="Altra",
    instance="REPLICA",
    query="SELECT * FROM dbo.CABEALBV",
)
lin_airzone_altra = conn.query_data(
    database="Altra",
    instance="REPLICA",
    query="SELECT * FROM dbo.LINEALBA",
)

In [None]:
# Tag every header frame with a "Country" label identifying its source, so
# rows remain attributable after the frames are combined.
for _frame, _label in (
    (cabe_airzone, "Spain"),
    (cabe_airzone_france, "France"),
    (cabe_airzone_italia, "Italia"),
    (cabe_airzone_usa, "USA"),
    (cabe_airzone_altra, "Altra"),
):
    _frame["Country"] = _label
# Drop the loop names so no extra reference to the last frame lingers.
del _frame, _label

In [None]:
# Parse FECHA as a datetime on each header frame, then derive the Month and
# Year columns from it.
for _frame in (
    cabe_airzone,
    cabe_airzone_france,
    cabe_airzone_italia,
    cabe_airzone_usa,
    cabe_airzone_altra,
):
    _frame["FECHA"] = pd.to_datetime(_frame["FECHA"])
    _fecha = _frame["FECHA"].dt
    _frame["Month"] = _fecha.month
    _frame["Year"] = _fecha.year
# Drop the loop names so no extra reference to the last frame lingers.
del _frame, _fecha

In [100]:
# Check whether the header dataframes share the same columns.
# Each message names exactly the set difference printed beneath it
# (columns of the first frame minus columns of the second); several labels
# previously described the opposite direction of what was computed.
print("Columns in cabe_airzone that are not in cabe_airzone_france:")
print(set(cabe_airzone.columns) - set(cabe_airzone_france.columns))
print("Columns in cabe_airzone_france that are not in cabe_airzone_italia:")
print(set(cabe_airzone_france.columns) - set(cabe_airzone_italia.columns))
print("Columns in cabe_airzone_italia that are not in cabe_airzone_usa:")
print(set(cabe_airzone_italia.columns) - set(cabe_airzone_usa.columns))
print("Columns in cabe_airzone_usa that are not in cabe_airzone_altra:")
print(set(cabe_airzone_usa.columns) - set(cabe_airzone_altra.columns))
print("Columns in cabe_airzone_altra that are not in cabe_airzone_usa:")
print(set(cabe_airzone_altra.columns) - set(cabe_airzone_usa.columns))

In [103]:
# Stack the per-database frames into one header frame (`cabe`) and one line
# frame (`lineas`), in the order Airzone, France, Italia, USA, Altra.
cabe = pd.concat(
    objs=(
        cabe_airzone,
        cabe_airzone_france,
        cabe_airzone_italia,
        cabe_airzone_usa,
        cabe_airzone_altra,
    )
)
lineas = pd.concat(
    objs=(
        lin_airzone,
        lin_airzone_france,
        lin_airzone_italia,
        lin_airzone_usa,
        lin_airzone_altra,
    )
)

In [106]:
# Strip leading and trailing whitespace from the SERIE codes so they can be
# matched exactly against code lists.
cabe["SERIE"] = cabe["SERIE"].str.strip(to_strip=None)

In [104]:
# Summary statistics for every column of the combined header frame.
cabe.describe(include="all")

In [134]:
# Summary statistics for every column of the combined line frame.
lineas.describe(include="all")

In [107]:
# List the distinct SERIE values present in the combined header frame.
cabe["SERIE"].unique()

In [108]:
# SERIE codes used to select documents for the sales group.
SALES_CODES = [
    "1",
    "2",
    "C",
    "FA",
    "A",
]
# SERIE codes used to select documents for the after-sales group.
AFTER_SALES_CODE = [
    "3",
    "6",
    "FR",
    "FV",
]

In [109]:
# Check the variables in memory
import sys


def get_size(obj):
    """Return the size of ``obj`` in bytes, as reported by ``sys.getsizeof``."""
    size_in_bytes = sys.getsizeof(obj)
    return size_in_bytes


# Snapshot the namespace first: the loop binds new global names of its own,
# and iterating the live globals() dict while it grows would raise an error.
variables = dict(globals())
for name in variables:
    var = variables[name]
    print(f"{name}: {get_size(var)} bytes")

In [None]:
# Free memory: unbind the per-database header and line frames, one pair per
# source database.
del cabe_airzone, lin_airzone
del cabe_airzone_france, lin_airzone_france
del cabe_airzone_italia, lin_airzone_italia
del cabe_airzone_usa, lin_airzone_usa
del cabe_airzone_altra, lin_airzone_altra

# Cabecera's analysis

In [136]:
# Disabled on purpose: Month and Year are already derived from FECHA on each
# per-country header frame before concatenation, so `cabe` carries both
# columns. The snippet used to live in a bare string literal, which the cell
# evaluated and echoed back as output; it is kept here as a plain comment for
# reference only.
#
# cabe['FECHA'] = pd.to_datetime(cabe['FECHA'])
# cabe['Month'] = cabe['FECHA'].dt.month
# cabe['Year'] = cabe['FECHA'].dt.year

In [137]:
# Split the headers by SERIE code: rows whose code is in SALES_CODES go to the
# sales frame, rows whose code is in AFTER_SALES_CODE to the after-sales frame.
cabe_sales = cabe.loc[cabe["SERIE"].isin(SALES_CODES)]
cabe_after_sales = cabe.loc[cabe["SERIE"].isin(AFTER_SALES_CODE)]

In [175]:
# Plot number of sales by country.
# Count once and reuse: the same counts drive both the bar order and the
# labels, and were previously recomputed twice on every pass of the loop.
sales_per_country = cabe_sales["Country"].value_counts()
plt.figure(figsize=(20, 6))
sns.countplot(x="Country", data=cabe_sales, order=sales_per_country.index)
# Write each count just above its bar; the i-th entry of `order` is labelled
# at x position i.
for position, count in enumerate(sales_per_country.to_numpy()):
    plt.text(position, count, count, ha="center", va="bottom")
plt.title("Number of sales by country")
plt.show()

In [174]:
# Total BASE per country for the sales documents, largest first, flattened to
# a frame with "Country" and "BASE" columns.
sum_base = (
    cabe_sales.groupby("Country")["BASE"].sum().sort_values(ascending=False).reset_index()
)
plt.figure(figsize=(20, 6))
sns.barplot(data=sum_base, x="Country", y="BASE")
# Label each bar with its total, formatted with thousands separators and no
# decimals.
for i, total in enumerate(sum_base["BASE"]):
    plt.text(i, total, f"{total:,.0f}", ha="center", va="bottom")
plt.title("Sum of BASE by country")
plt.show()

In [173]:
# One row with two panels: share of documents per country for sales (left)
# and for after sales (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

sales_share = cabe_sales.groupby("Country").size().sort_values(ascending=False)
sales_share.plot.pie(ax=ax[0], autopct="%1.1f%%", startangle=90, legend=True)
ax[0].set_title("Count of Sales Albaranes by country")

after_sales_share = (
    cabe_after_sales.groupby("Country").size().sort_values(ascending=False)
)
after_sales_share.plot.pie(ax=ax[1], autopct="%1.1f%%", startangle=90, legend=True)
ax[1].set_title("Count of After sales Albaranes by country")

plt.show()

In [172]:
# Plot sum of BASE by month, with one bar colour per year.
sum_base_month = cabe_sales.groupby(["Year", "Month"])["BASE"].sum().to_frame()
plt.figure(figsize=(20, 6))
# The groupby leaves Year and Month in the index rather than in columns.
# reset_index() exposes them as ordinary columns so that x= and hue= resolve
# by name whether or not the installed seaborn looks into index levels.
sns.barplot(x="Month", y="BASE", hue="Year", data=sum_base_month.reset_index())
plt.title("Sum of BASE by month")
plt.show()

In [168]:
# Monthly BASE totals for sales, indexed by (Year, Month).
sum_base_month_sales = cabe_sales.groupby(["Year", "Month"])[["BASE"]].sum()
# Plot every row except the last one.
# NOTE(review): presumably the last (Year, Month) is skipped because that
# month is still incomplete — confirm.
sum_base_month_sales.iloc[:-1].plot(figsize=(20, 6))
plt.title("Sum of BASE for Sales by month")
plt.show()

In [169]:
# Monthly BASE totals for after sales, indexed by (Year, Month).
sum_base_month_after_sales = cabe_after_sales.groupby(["Year", "Month"])[["BASE"]].sum()
# Plot every row except the last one, drawn in red.
# NOTE(review): presumably the last (Year, Month) is skipped because that
# month is still incomplete — confirm.
sum_base_month_after_sales.iloc[:-1].plot(figsize=(20, 6), c="r")
plt.title("Sum of BASE for After Sales by month")
plt.show()

In [171]:
# Put the two monthly series side by side. The outer join keeps months that
# appear in only one of them; the shared BASE column is disambiguated with the
# "_sales" / "_after_sales" suffixes.
sum_base_month = pd.merge(
    sum_base_month_sales,
    sum_base_month_after_sales,
    on=["Year", "Month"],
    how="outer",
    suffixes=("_sales", "_after_sales"),
)
# Plot every row except the last one.
sum_base_month.iloc[:-1].plot(figsize=(20, 6))
plt.title("Sum of BASE for Sales and After Sales by month")
plt.show()

# Query list of errors

In [5]:
# Query the after-sales error table from the database.
# The query goes through the shared connector with the same keyword arguments
# as the other queries in this notebook; a bare `query_data` function is not
# defined or imported in the visible cells and raised NameError on a fresh
# kernel.
# NOTE(review): instance="REPLICA" mirrors the other queries — confirm it is
# the right instance for this table.
errors = conn.query_data(
    query="SELECT * FROM dbo.AZ_ERRORES_POSTVENTA",
    database="Altra",
    instance="REPLICA",
)
# Save the data to a csv file (semicolon-separated, without the index).
errors.to_csv("../DATA/errors.csv", index=False, sep=";")

In [7]:
# Summary statistics for every column of the errors frame.
errors.describe(include="all")

# Lineas's analysis

In [176]:
# Attach SERIE and Country to every line by looking up its header through
# IDALBV (left join, so lines without a matching header are kept).
# NOTE(review): the match is on IDALBV alone; if the source databases reuse
# IDALBV values, lines would pair with headers from other countries — confirm.
# NOTE(review): this rebinds `lineas`, so running the cell twice merges again.
lineas = pd.merge(
    lineas,
    cabe[["IDALBV", "SERIE", "Country"]],
    how="left",
    on="IDALBV",
)

# Split the lines by SERIE code: sales lines and after-sales lines.
lineas_sales = lineas.loc[lineas["SERIE"].isin(SALES_CODES)]
lineas_after_sales = lineas.loc[lineas["SERIE"].isin(AFTER_SALES_CODE)]