In [1]:
import requests
import json
import requests
import pandas as pd
from plotly import graph_objects as go
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
%run roadspending_gdp.ipynb
%run trips_duration.ipynb

Note: you may need to restart the kernel to use updated packages.


object




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [3]:
def merging_data(final_gdp_roadspending_df, final_1plus_df):

    tourist_num = final_1plus_df
    gdp = final_gdp_roadspending_df[final_gdp_roadspending_df['Year'] > 2011]

    min_trips = tourist_num['NumTrips'].min()
    max_trips = tourist_num['NumTrips'].max()

    min_gdp = gdp['GDP'].min()
    max_gdp = gdp['GDP'].max()

    tourist_num['Tourism St.'] = (tourist_num['NumTrips'] - min_trips) / (max_trips - min_trips)
    gdp['GDP St.'] = (gdp['GDP'] - min_gdp) / (max_gdp - min_gdp)

    tourist_num = pd.DataFrame(final_1plus_df, columns = ['Country', 'Year', 'Tourism St.'])
    gdp_actual = pd.DataFrame(gdp, columns = ["Country", "Year", "GDP St."])

    prediction_data = pd.merge(tourist_num, gdp, on = ['Country', 'Year'])
    prediction_data["Year"] = prediction_data['Year'].astype(int)


    return prediction_data




In [4]:
data = merging_data(final_gdp_roadspending_df, final_1plus_df)
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Country,Year,Tourism St.,RoadSpending,GDP,Spending/GDP%,GDP St.
0,Austria,2012,0.067236,,3.645983e+11,,0.088557
1,Austria,2013,0.068143,,3.828753e+11,,0.093103
2,Austria,2014,0.068527,,3.940021e+11,,0.095870
3,Austria,2015,0.066879,,3.399889e+11,,0.082436
4,Austria,2016,0.073633,,3.521979e+11,,0.085472
...,...,...,...,...,...,...,...
292,Sweden,2018,0.169462,2.497002e+09,4.906677e+11,0.508899,0.119913
293,Sweden,2019,0.169677,2.503424e+09,4.736304e+11,0.528561,0.115676
294,Sweden,2020,0.186782,2.941620e+09,4.851817e+11,0.606292,0.118549
295,Sweden,2021,0.198989,2.910555e+09,5.670964e+11,0.513238,0.138923


In [5]:
def get_country_prediction(data, country, num_years):
    data["Year"] = data['Year'].astype(int)
    country_data = data[(data['Country'] == country) & (~data['Year'].isin([2020, 2021]))]
    country_data = country_data.sort_values(by = 'Year').reset_index(drop = True)


    # creating lag and bias columns
    country_data['GDP_lag'] = country_data['GDP St.'].shift(1)
    country_data['Tourism_lag'] = country_data['Tourism St.'].shift(1)
    country_data['Bias'] = 1
    country_data = country_data.dropna().reset_index(drop = True)

    if len(country_data) < 3:
        print(f"⚠️ Not enough usable data for {country}. Skipping.")
        return pd.DataFrame()

    X = country_data[['Bias', 'GDP_lag', 'Tourism_lag']].values
    y_gdp = country_data['GDP St.'].values
    y_tourism = country_data['Tourism St.'].values

    Xt = X.T
    XtX_inv = np.linalg.pinv(np.matmul(Xt, X))
    beta_gdp = np.matmul(np.matmul(XtX_inv, Xt), y_gdp)
    beta_tourism = np.matmul(np.matmul(XtX_inv, Xt), y_tourism)

    last_year = int(country_data['Year'].iloc[-1])
    current_gdp = country_data['GDP St.'].iloc[-1]
    current_tourism = country_data['Tourism St.'].iloc[-1]

    predictions = []
    for _ in range(num_years):
        x_input = np.array([1, current_gdp, current_tourism])
        next_gdp = np.matmul(x_input, beta_gdp)
        next_tourism = np.matmul(x_input, beta_tourism)

        next_year = last_year + 1
        while next_year in [2020, 2021]:
            next_year += 1

        predictions.append({
            'Country': country,
            'Year': next_year,
            'Predicted_GDP': round(next_gdp, 2),
            'Predicted_Tourism': round(next_tourism, 2)
        })

        last_year = next_year
        current_gdp = next_gdp
        current_tourism = next_tourism
    
    prediction_df = pd.DataFrame(predictions)

    return prediction_df


In [6]:
# test run

forecast = get_country_prediction(data, country = 'Austria', num_years = 5)
forecast

⚠️ Not enough usable data for Austria. Skipping.


In [7]:
def summary_stat(data):

    data['GDP_lag'] = data['GDP St.'].shift(1)
    data['Tourism_lag'] = data['Tourism St.'].shift(1)
    data['Bias'] = 1

    country_data = data.dropna().reset_index(drop=True)
    country_data = data.dropna().reset_index(drop=True)
    
    X = country_data[['Bias', 'GDP_lag', 'Tourism_lag']].values
    y_gdp = country_data['GDP St.'].values
    y_tourism = country_data['Tourism St.'].values
    
    m_gdp = np.matmul(np.linalg.inv(np.matmul(X.T, X)), np.matmul(X.T, y_gdp))
    m_tourism = np.matmul(np.linalg.inv(np.matmul(X.T, X)), np.matmul(X.T, y_tourism))

    np.set_printoptions(suppress=True)

    ypreds_tourism= np.matmul(X, m_tourism)
    ypreds_gdp = np.matmul(X, m_gdp)
    r2_gdp = r2_score(y_gdp, ypreds_gdp)
    r2_tourism = r2_score(y_tourism, ypreds_tourism)
    
    print(f'The r^2 prediction score for GDP is: {round(r2_gdp, 4)}')
    print(f'The r^2 prediction score for tourism is: {round(r2_tourism, 4)}')

In [8]:
summary_stat(data)

The r^2 prediction score for GDP is: 0.8545
The r^2 prediction score for tourism is: 0.8294


In [9]:
def train_test(data): 
    y_GDP = pd.array(data['GDP St.'])
    y_tourism = pd.array(data['Tourism St.'])
    X = pd.array(data['Year'])

# Split the data into training and testing sets
    X_train_gdp, X_test_gdp, y_train_gdp, y_test_gdp = train_test_split(X, y_GDP, test_size=0.3, random_state=42)
    X_train_tourism, X_test_tourism, y_train_tourism, y_test_tourism = train_test_split(X, y_tourism, test_size=0.3, random_state=42)

    return {"gdp": (X_train_gdp, X_test_gdp, y_train_gdp, y_test_gdp),
        "tourism": (X_train_tourism, X_test_tourism, y_train_tourism, y_test_tourism)}


In [10]:
train_test(data)


{'gdp': (<NumpyExtensionArray>
  [2014, 2013, 2016, 2013, 2012, 2015, 2019, 2022, 2021, 2019,
   ...
   2015, 2022, 2013, 2022, 2020, 2020, 2023, 2022, 2018, 2018]
  Length: 207, dtype: int64,
  <NumpyExtensionArray>
  [2023, 2019, 2015, 2022, 2017, 2017, 2015, 2014, 2021, 2019, 2021, 2018, 2020,
   2017, 2018, 2018, 2019, 2021, 2023, 2016, 2017, 2021, 2012, 2022, 2014, 2021,
   2019, 2018, 2017, 2015, 2015, 2015, 2012, 2022, 2019, 2015, 2012, 2012, 2018,
   2013, 2020, 2018, 2017, 2023, 2015, 2016, 2013, 2017, 2020, 2016, 2021, 2020,
   2014, 2023, 2012, 2018, 2013, 2020, 2018, 2013, 2021, 2019, 2018, 2022, 2013,
   2020, 2013, 2018, 2012, 2022, 2016, 2017, 2013, 2019, 2016, 2018, 2020, 2015,
   2014, 2019, 2015, 2019, 2018, 2017, 2012, 2015, 2022, 2012, 2016, 2020]
  Length: 90, dtype: int64,
  <NumpyExtensionArray>
  [ 0.00883496299746336, 0.011063017984045222,  0.10322657337982327,
    0.03989336327390599,  0.04821342752335252, 0.007326792244996174,
    0.07433181040480649,   0.318