In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

The dataset is built from an initial dataset of 600,000 transactional records collected over 6 years (2014-2019), each recording the date and time of sale, the pharmaceutical drug brand name and the quantity sold, exported from the Point-of-Sale system of an individual pharmacy. A selected group of 57 drugs from the dataset is classified into the following Anatomical Therapeutic Chemical (ATC) Classification System categories:

M01AB - Anti-inflammatory and antirheumatic products, non-steroids, Acetic acid derivatives and related substances
M01AE - Anti-inflammatory and antirheumatic products, non-steroids, Propionic acid derivatives
N02BA - Other analgesics and antipyretics, Salicylic acid and derivatives
N02BE/B - Other analgesics and antipyretics, Pyrazolones and Anilides
N05B - Psycholeptic drugs, Anxiolytic drugs
N05C - Psycholeptic drugs, Hypnotics and sedatives drugs
R03 - Drugs for obstructive airway diseases
R06 - Antihistamines for systemic use
Sales data are resampled to hourly, daily, weekly and monthly periods. The data is already pre-processed; the pre-processing included outlier detection and treatment, and missing-data imputation.

https://www.kaggle.com/datasets/milanzdravkovic/pharma-sales-data 

In [3]:
# Read the daily-resolution sales file into a DataFrame.
# The path is relative, so it resolves against the current working directory.
daily_sales_path = 'Resources/salesdaily.csv'
daily_df = pd.read_csv(daily_sales_path)

# Preview the first rows to confirm the file loaded with the expected columns.
daily_df.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday Name
0,1/2/2014,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,Thursday
1,1/3/2014,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,Friday
2,1/4/2014,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,Saturday
3,1/5/2014,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,Sunday
4,1/6/2014,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,Monday


In [4]:
# Check data types: list the dtype pandas inferred for every column.
# Note that "datum" comes back as `object` (plain strings) rather than a
# datetime dtype, so it has to be converted before any date arithmetic or
# `.dt` accessor calls can be used on it.
daily_df.dtypes

datum            object
M01AB           float64
M01AE           float64
N02BA           float64
N02BE           float64
N05B            float64
N05C            float64
R03             float64
R06             float64
Year              int64
Month             int64
Hour              int64
Weekday Name     object
dtype: object

In [5]:
# Convert the "datum" column from strings to datetime64.
# The raw values are month/day/year strings (e.g. "1/2/2014" is the row whose
# "Weekday Name" is Thursday, i.e. 2 January 2014). Passing the format
# explicitly keeps an ambiguous value such as "1/2/2014" from being read
# day-first, and makes any row that does not match fail loudly instead of
# being parsed under a guessed layout.
daily_df["datum"] = pd.to_datetime(daily_df["datum"], format="%m/%d/%Y")

In [6]:
# Add a "Day" column holding the day-of-month taken from "datum".
# The `.dt` accessor only works once "datum" has a datetime dtype, so this
# cell must run after the datetime conversion of that column.
day_of_month = daily_df["datum"].dt.day
daily_df["Day"] = day_of_month

# Show the first rows so the new column can be checked against "datum".
daily_df.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday Name,Day
0,2014-01-02,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,Thursday,2
1,2014-01-03,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,Friday,3
2,2014-01-04,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,Saturday,4
3,2014-01-05,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,Sunday,5
4,2014-01-06,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,Monday,6


In [9]:
# Reshape from wide to long format.
# "Year" and "Month" are kept as identifier columns; the eight drug-category
# columns are stacked so that each row holds one category code ("Drug Type")
# and the value that was in that category's column ("Usage").
updated_df = daily_df.melt(
    id_vars=["Year", "Month"],
    value_vars=["M01AB", "M01AE", "N02BA", "N02BE", "N05B", "N05C", "R03", "R06"],
    var_name="Drug Type",
    value_name="Usage",
)

In [11]:
# Calculate total usage per calendar month for each drug type.
# Only "Usage" is aggregated: summing the whole grouped frame would also add
# up the remaining "Year" column, producing a meaningless "sum of years"
# value in the result.
month_df = (
    updated_df
    .groupby(["Drug Type", "Month"])["Usage"]
    .sum()
    .reset_index()
)

In [17]:
# Grouped bar chart of the monthly totals: one cluster of bars per month,
# one bar (colour) per drug type within each cluster.
fig_month = px.bar(
    month_df,
    x="Month",
    y="Usage",
    color="Drug Type",
    barmode="group",
    title="Total Usage per Month by Drug Type",
)
fig_month.show()


In [19]:
# Calculate total usage per year for each drug type.
# Only "Usage" is aggregated: summing the whole grouped frame would also add
# up the remaining "Month" column, producing a meaningless "sum of month
# numbers" value in the result.
year_df = (
    updated_df
    .groupby(["Drug Type", "Year"])["Usage"]
    .sum()
    .reset_index()
)

# Plot usage per year: one cluster of bars per year, one bar (colour) per
# drug type within each cluster.
fig_year = px.bar(
    year_df,
    x="Year",
    y="Usage",
    color="Drug Type",
    title="Total Usage per Year by Drug Type",
    barmode="group",
)
fig_year.show()