In [None]:
# Standard library
import datetime
import datetime as dt
import math

# Numerical Python
import numpy as np

# DataFrame
import pandas as pd

# Plotly
import plotly.express as px
from plotly.subplots import make_subplots

# Statistics
import scipy.stats as stats
from scipy.stats import ttest_ind

# Machine learning
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Render every DataFrame.plot() call in this notebook with plotly
pd.set_option("plotting.backend", "plotly")

In [None]:
# Load the inflation series and encode each date as its proleptic Gregorian
# ordinal (an integer day count) so it can be used as a numeric model feature.
# The encoding can be undone with dt.datetime.fromordinal.
gdp_df = pd.read_csv('inflation.csv')
parsed_dates = pd.to_datetime(gdp_df['Date'])
gdp_df['Date'] = parsed_dates.map(dt.datetime.toordinal)

In [None]:
# Separate the regression target from the predictor columns.
labels = np.array(gdp_df['Inflation'])
features = gdp_df.drop('Inflation', axis=1)
# Take the names from the predictors only (not from gdp_df, which still holds
# the target), so feature_list.index(name) is a valid column position in the
# `features` array built on the next line.
feature_list = list(features.columns)
features = np.array(features)

In [None]:
# Preview the first few target values instead of dumping the whole array
labels[:10]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): train_test_split shuffles by default, so held-out rows are
# interleaved in time with training rows -- confirm that is intended.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
# Sanity check: report the dimensions of each piece of the split
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, array.shape)

In [None]:
# Baseline: predict the historical average, i.e. the mean inflation of the
# training rows, for every test row. (The Date feature is an ordinal day
# number, so using it as the prediction would compare day counts against
# inflation values.)
baseline_preds = np.full(test_labels.shape, np.mean(train_labels), dtype=float)
# Absolute error of the baseline on each test row, then its average
baseline_errors = np.abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

In [None]:
# Random forest regressor with 1000 decision trees; the fixed seed keeps runs
# repeatable. RandomForestRegressor is imported with the other dependencies
# in the import cell at the top of the notebook.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit on the training split only (the trailing ';' suppresses the estimator repr)
rf.fit(train_features, train_labels);

In [None]:
# Predict the held-out test rows
predictions = rf.predict(test_features)
# Absolute error for each test row
errors = np.abs(predictions - test_labels)
# Mean absolute error (MAE), expressed in the units of the Inflation column
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
# Mean absolute percentage error (MAPE). The denominator is |actual| so that
# negative inflation readings do not yield negative percentages, and rows whose
# actual value is exactly 0 are skipped because their percentage error is
# undefined (division by zero).
nonzero_actual = test_labels != 0
mape = 100 * (errors[nonzero_actual] / np.abs(test_labels[nonzero_actual]))
# "Accuracy" is reported as 100 - MAPE; NaN when no test label is non-zero
accuracy = 100 - np.mean(mape) if mape.size else float('nan')
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Predict every row of the data set (rows used for training as well as the
# held-out rows) for the date-indexed frames built below. This overwrites the
# test-only `predictions` computed earlier.
predictions = rf.predict(features)

In [None]:
# Convert the ordinal Date feature back into datetime objects for plotting.
# int() is required because `features` is a NumPy array: if any other column
# is non-integer the whole array is upcast, and fromordinal rejects floats.
date_column = features[:, feature_list.index("Date")]
dates = [dt.datetime.fromordinal(int(day)) for day in date_column]

# `predictions` was computed for every row of `features`, so the predictions
# share exactly the same dates as the actual values.
test_dates = list(dates)

# Actual values, indexed by date in chronological order
true_data = (
    pd.DataFrame(data={'date': dates, 'actual': labels})
    .set_index('date')
    .sort_index()
)

# Model predictions, indexed by date in chronological order
predictions_data = (
    pd.DataFrame(data={'date': test_dates, 'prediction': predictions})
    .set_index('date')
    .sort_index()
)


In [None]:
# Show only the first rows; the full frame is too long to be a useful output
true_data.head()

In [None]:
# Show only the first rows; the full frame is too long to be a useful output
predictions_data.head()

In [None]:
# Align actual and predicted values on their shared date index
df = pd.merge(true_data, predictions_data, left_index=True, right_index=True)
# Plot both series over time: the two value columns go on the y-axis (plotly
# labels the wide-form values 'value') and the date index supplies the x-axis.
# The x label is mapped under both 'index' and the index's own name, 'date'.
df.plot(
    title='Inflation Predictions',
    y=['actual', 'prediction'],
    labels={'value': 'Inflation Rate', 'index': 'Date', 'date': 'Date'},
)

In [None]:
# Round the predictions to two decimals for the exported file
df["prediction"] = df["prediction"].round(2)
# Direction of the *predicted* series relative to the previous date. It is
# derived from the rounded column so the signal can be reproduced from the
# exported CSV. The first row (no previous value) and unchanged values fall
# into "DOWN", because only a strictly positive change counts as "UP".
df['Prediction Signal'] = np.where(df['prediction'].diff() > 0, "UP", "DOWN")
df.to_csv('inflation_predictions.csv')

In [None]:
# Significance testing: Welch's unequal-variance t-test comparing the mean of
# the actual series with the mean of the predicted series.
# math and scipy.stats are imported with the other dependencies in the import
# cell at the top of the notebook.
# NOTE(review): both samples are observed on the same dates, so a paired test
# (scipy.stats.ttest_rel) may be the more appropriate choice -- confirm.

actual = df["actual"]
prediction = df["prediction"]

# Sample means
actual_mean, prediction_mean = actual.mean(), prediction.mean()

# Sample standard deviations (ddof=1 -> unbiased estimator)
actual_sd, prediction_sd = actual.std(ddof=1), prediction.std(ddof=1)

# Standard error of each mean
actual_n, prediction_n = len(actual), len(prediction)
actual_se = actual_sd / math.sqrt(actual_n)
prediction_se = prediction_sd / math.sqrt(prediction_n)

# Standard error of the difference between the actual and predicted means
se_diff = math.sqrt(actual_se**2.0 + prediction_se**2.0)

# t statistic
t_stat = (actual_mean - prediction_mean) / se_diff

# Welch-Satterthwaite approximation of the degrees of freedom
degrees_freedom = (actual_se**2 + prediction_se**2)**2 / (
    actual_se**4 / (actual_n - 1) + prediction_se**4 / (prediction_n - 1)
)

# Two-sided critical value, so it is comparable with the two-sided p-value below
alpha = 0.05
cv = stats.t.ppf(1.0 - alpha / 2.0, degrees_freedom)

# Two-sided p-value; sf() avoids the precision loss of 1 - cdf() in the far tail
p = 2.0 * stats.t.sf(abs(t_stat), degrees_freedom)

print("ttest:", t_stat, "critical value:", cv, "p-value:", p)
print("actual mean:", actual_mean, "prediction mean:", prediction_mean)
print("actual sd:", actual_sd, "prediction sd:", prediction_sd)
print("actual se:", actual_se, "prediction se:", prediction_se)
print("actual n:", actual_n, "prediction n:", prediction_n)
print("se on diff:", se_diff)
print("degrees of freedom:", round(degrees_freedom))