# Imports

In [1]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

from utils import get_dataframe

# Build dataframe

In [2]:
def get_presenze_visibili():
    """Load the visible (officially recorded) daily presences.

    Reads the ``movimento_turistico_molveno`` dataset through ``get_dataframe``
    and keeps only the date and presence-count columns.

    Returns:
        DataFrame with the columns ``data`` (renamed from ``date``) and
        ``presenze``.
    """
    movimento = get_dataframe("movimento_turistico_molveno")
    selezione = movimento[["date", "presenze"]]
    return selezione.rename(columns={"date": "data"})

def get_presenze_vodafone(loc_id="27", user_profile="TOURIST"):
    """Load the daily Vodafone attendance totals for one location and user profile.

    Args:
        loc_id: Value of the ``locId`` column to keep. Defaults to ``"27"``,
            the identifier this notebook uses for Molveno.
        user_profile: Value of the ``userProfile`` column to keep. Defaults to
            ``"TOURIST"``.

    Returns:
        DataFrame with one row per date and the columns ``data`` (converted
        with ``pd.to_datetime``) and ``presenze`` (sum of ``value`` over all
        matching rows of that date).
    """
    df = get_dataframe("vodafone_attendences")
    selected = (df["locId"] == loc_id) & (df["userProfile"] == user_profile)
    daily = (
        df.loc[selected, ["date", "value"]]
        .rename(columns={"date": "data", "value": "presenze"})
        .groupby(["data"])
        .sum()
        .reset_index()
    )
    # Convert after aggregating so the merge key is a real datetime column.
    daily["data"] = pd.to_datetime(daily["data"])
    return daily

def prepare_dataframe():
    """Join official and Vodafone presences by date and add calendar columns.

    The two sources are inner-joined on ``data``, so only dates present in
    both are kept. The Vodafone count is exposed as ``presenze_vodafone``.

    Returns:
        DataFrame with ``data``, ``presenze``, ``presenze_vodafone`` plus the
        derived columns ``mese`` (month), ``anno`` (year) and
        ``giorno_settimana`` (``dt.weekday`` of ``data``).
    """
    ufficiali = get_presenze_visibili()
    vodafone = get_presenze_vodafone().rename(columns={"presenze": "presenze_vodafone"})
    merged = ufficiali.merge(vodafone, on="data", how="inner")
    calendario = merged["data"].dt
    return merged.assign(
        mese=calendario.month,
        anno=calendario.year,
        giorno_settimana=calendario.weekday,
    )

In [4]:
# Build the merged official/Vodafone dataset once and display it.
df_hidden_tourism = prepare_dataframe()
df_hidden_tourism

Unnamed: 0,data,presenze,presenze_vodafone,mese,anno,giorno_settimana
0,2022-01-01,2102,4550,1,2022,5
1,2022-01-02,1783,5110,1,2022,6
2,2022-01-03,1787,3643,1,2022,0
3,2022-01-04,1748,3337,1,2022,1
4,2022-01-05,1649,3255,1,2022,2
...,...,...,...,...,...,...
725,2023-12-27,1494,2033,12,2023,2
726,2023-12-28,1802,2346,12,2023,3
727,2023-12-29,2138,2949,12,2023,4
728,2023-12-30,2371,3655,12,2023,5


# Plots

In [21]:
# One color per weekday, indexed by the `giorno_settimana` value (0-6).
day_colors = ["violet", "purple", "red", "orange", "yellow", "green", "blue"]

def plot_weekday(df, days=range(7)):
    """Scatter official vs. Vodafone presences per weekday, with a fitted line each.

    For every requested weekday the matching rows are drawn as markers, and a
    linear regression of ``presenze`` on ``presenze_vodafone`` fitted on those
    rows is drawn as a line from x=0 to the largest ``presenze_vodafone``
    value in ``df``.

    Args:
        df: DataFrame with the columns ``giorno_settimana``,
            ``presenze_vodafone`` and ``presenze``.
        days: Iterable of weekday numbers to plot; each is also used as an
            index into ``day_colors``. Weekdays with no rows in ``df`` are
            skipped.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    # Every regression line spans the same x range, so compute it once.
    x0, x1 = 0, df["presenze_vodafone"].max()
    for d in days:
        df_day = df[df["giorno_settimana"] == d]
        if df_day.empty:
            # LinearRegression.fit raises on zero samples; there is nothing to draw.
            continue
        color = day_colors[d]
        label = f"Day {d}"
        fig.add_trace(go.Scatter(
            x=df_day["presenze_vodafone"],
            y=df_day["presenze"],
            mode="markers",
            marker=dict(color=color),
            # Shared group: toggling the legend entry also toggles the fitted line.
            legendgroup=label,
            name=label
        ))
        X = df_day["presenze_vodafone"].values.reshape(-1,1)
        y = df_day["presenze"].values.reshape(-1,1)
        reg = LinearRegression().fit(X, y)
        ys = reg.predict([[x0],[x1]])
        y0, y1 = ys[0,0], ys[1,0]
        fig.add_trace(go.Scatter(
            x=[x0, x1],
            y=[y0, y1],
            mode="lines",
            marker=dict(color=color),
            line=dict(color=color, width=2),
            showlegend=False,
            legendgroup=label,
            name=label
        ))
    fig.update_xaxes(
        title="presenze_vodafone",
        range=[-800, df["presenze_vodafone"].max()+800]
    )
    fig.update_yaxes(
        title="presenze",
        range=[-800, df["presenze"].max()+800]
    )
    fig.show()

# Plot all seven weekdays (the default `days`).
plot_weekday(df_hidden_tourism)

In [22]:
# One color per month, indexed directly by month number (1-12).
# Index 0 ("black") is only a placeholder so that no offset is needed.
month_colors = [ "black",
    "#1f77b4", "#4e88bf", "#2ca02c", "#98df8a",
    "#d6e685", "#ffbb78", "#bcbd22", "#db9334",
    "#ff7f0e", "#d62728", "#8c564b", "#1f4b99"
]

def plot_month(df, months=range(1,13)):
    """Scatter official vs. Vodafone presences per month, with a fitted line each.

    For every requested month the matching rows are drawn as markers, and a
    linear regression of ``presenze`` on ``presenze_vodafone`` fitted on those
    rows is drawn as a line from x=0 to the largest ``presenze_vodafone``
    value in ``df``.

    Args:
        df: DataFrame with the columns ``mese``, ``presenze_vodafone`` and
            ``presenze``.
        months: Iterable of month numbers to plot; each is also used as an
            index into ``month_colors``. Months with no rows in ``df`` are
            skipped.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    # Every regression line spans the same x range, so compute it once.
    x0, x1 = 0, df["presenze_vodafone"].max()
    for m in months:
        df_month = df[df["mese"] == m]
        if df_month.empty:
            # LinearRegression.fit raises on zero samples; there is nothing to draw.
            continue
        color = month_colors[m]
        label = f"Month {m}"
        fig.add_trace(go.Scatter(
            x=df_month["presenze_vodafone"],
            y=df_month["presenze"],
            mode="markers",
            marker=dict(color=color),
            # Shared group: toggling the legend entry also toggles the fitted line.
            legendgroup=label,
            name=label
        ))
        X = df_month["presenze_vodafone"].values.reshape(-1,1)
        y = df_month["presenze"].values.reshape(-1,1)
        reg = LinearRegression().fit(X, y)
        ys = reg.predict([[x0],[x1]])
        y0, y1 = ys[0,0], ys[1,0]
        fig.add_trace(go.Scatter(
            x=[x0, x1],
            y=[y0, y1],
            mode="lines",
            marker=dict(color=color),
            line=dict(color=color, width=2),
            showlegend=False,
            legendgroup=label,
            name=label
        ))
    fig.update_xaxes(
        title="presenze_vodafone",
        range=[-800, df["presenze_vodafone"].max()+800]
    )
    fig.update_yaxes(
        title="presenze",
        range=[-800, df["presenze"].max()+800]
    )
    fig.show()

# All twelve months first (the default `months`), then two subsets on their own figures.
plot_month(df_hidden_tourism)
plot_month(df_hidden_tourism, months=[5,6,9])
plot_month(df_hidden_tourism, months=[7,8])

### Analysis of the plots
The analysis of the plots shows that we need to concentrate on:
- days: weekdays vs weekend, i.e. `giorno_settimana` 0-4 (Mon–Fri) vs 5-6 (Sat–Sun)
- months: 5,6,9 vs 7,8

# Model

We model only the month-based split (reference months vs target months), since this approach can be replicated for other zones.

In [31]:
# Months whose rows are used to fit the regression model.
ref_months = [5,6,9]
# Months over which the fitted model is applied to estimate hidden tourism.
target_months = [7,8]

In [32]:
def build_ht_model(df_hidden_tourism, ref_months):
    """Fit the presences regression on the reference months and cross-validate it.

    Args:
        df_hidden_tourism: DataFrame with the columns ``mese``,
            ``presenze_vodafone`` and ``presenze``.
        ref_months: Month numbers whose rows are used for fitting.

    Returns:
        Tuple ``(ht_model, summary)``: the ``LinearRegression`` fitted on all
        reference-month rows (``presenze`` as a function of
        ``presenze_vodafone``), and the ``describe()`` table of the 5-fold
        cross-validation scores (r2, negated MAE, negated RMSE) with the
        timing columns removed.
    """
    in_ref = df_hidden_tourism["mese"].isin(ref_months)
    training = df_hidden_tourism[in_ref]
    # Both arrays are column vectors, so the fitted model has 2-D coefficients.
    features = training["presenze_vodafone"].to_numpy().reshape(-1, 1)
    target = training["presenze"].to_numpy().reshape(-1, 1)

    ht_model = LinearRegression()
    ht_model.fit(features, target)

    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
    cv_results = cross_validate(ht_model, features, target, cv=5, scoring=metrics)
    summary = pd.DataFrame(cv_results).describe()
    return ht_model, summary.drop(columns=["fit_time", "score_time"])

# Fit on the reference months, then show the fitted line and the cross-validation summary.
ht_model, validation = build_ht_model(df_hidden_tourism, ref_months)
# intercept_ and coef_ are indexed as arrays because the model is fitted on a 2-D target.
print(f">> Intercept={ht_model.intercept_[0]:.3f}, slope={ht_model.coef_[0,0]:.3f}")
validation

>> Intercept=228.587, scale=0.660


Unnamed: 0,test_r2,test_neg_mean_absolute_error,test_neg_root_mean_squared_error
count,5.0,5.0,5.0
mean,0.41923,-302.793857,-381.733194
std,0.196156,93.000437,96.739749
min,0.266547,-418.438042,-510.713318
25%,0.302529,-350.053077,-413.344307
50%,0.373118,-330.520382,-408.816656
75%,0.396586,-212.512172,-314.865926
max,0.757369,-202.44561,-260.925762


### Validation outcomes
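Acceptable but not very robust: the mean R² over the 5 folds is about 0.42, and it varies widely between folds (from 0.27 to 0.76, std ≈ 0.20).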

# Application to target months

In [38]:
# Model estimate of presences from the Vodafone counts: slope * presenze_vodafone + intercept.
df_hidden_tourism["presenze_normalizzate"] = (
    df_hidden_tourism["presenze_vodafone"] * ht_model.coef_[0,0] + ht_model.intercept_[0]
)

# Hidden tourism = model estimate minus the official count.
df_hidden_tourism["turismo_nascosto"] = df_hidden_tourism["presenze_normalizzate"].sub(
    df_hidden_tourism["presenze"]
)

# Totals are reported over the target months only.
df_hidden_tourism_target_months = df_hidden_tourism.loc[
    df_hidden_tourism["mese"].isin(target_months)
]
print(f"Presenze ufficiali: {df_hidden_tourism_target_months['presenze'].sum()}")
print(f"Turismo nascosto: {df_hidden_tourism_target_months['turismo_nascosto'].sum():.0f}")

Presenze ufficiali: 382749
Turismo nascosto: 10290
