In [None]:
import os
import re
import dotenv
import numpy as np
from numpy import inf
import pandas as pd
from pprint import pprint

from sklearn import tree
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import dtreeviz

In [None]:
# Load environment variables from the .env file that sits one directory above
# the current working directory (the directory the notebook is run from).
notebook_dir = os.path.abspath(os.curdir)
project_dir = os.path.join(notebook_dir, os.pardir)
dotenv_path = os.path.join(project_dir, '.env')

# Kept as the last expression so the cell still displays load_dotenv's result.
dotenv.load_dotenv(dotenv_path=dotenv_path)

In [1]:
# Read the combined train/test dataset from the directory named by the
# PROCESSED_DATA_FILES environment variable.
# os.getenv returns None when the variable is unset; passing that straight to
# os.path.join fails with an opaque TypeError, so check for it explicitly and
# report which variable is missing.
processed_dir = os.getenv('PROCESSED_DATA_FILES')
if processed_dir is None:
    raise RuntimeError(
        "Environment variable 'PROCESSED_DATA_FILES' is not set; "
        "define it in the project's .env file or in the shell before loading data."
    )
df = pd.read_csv(os.path.join(processed_dir, 'df_train_test.csv'), encoding='UTF-8')

NameError: name 'pd' is not defined

In [None]:
# --- Feature matrix -------------------------------------------------------
# Start from the raw predictor columns, indexed by post_id.
base_features = df[["post_id", "log_init_imps", "log_init_engs", "Dominant_Topic", "Keywords"]].set_index('post_id')

# Expand the comma-separated Keywords strings into one indicator column per keyword.
keyword_flags = base_features['Keywords'].str.get_dummies(sep=",")
with_keywords = pd.concat([base_features.drop(columns=['Keywords']), keyword_flags], axis=1)

# Expand Dominant_Topic (cast to str first) into one indicator column per topic value.
topic_flags = with_keywords['Dominant_Topic'].astype(str).str.get_dummies()
X = pd.concat([with_keywords.drop(columns=['Dominant_Topic']), topic_flags], axis=1)

# --- Target ---------------------------------------------------------------
# Single-column frame indexed by post_id, squeezed down to a Series.
y = df[["post_id", "log_all_engrate"]].set_index('post_id').squeeze()

# Quick look at the first rows of both.
print(X.head())
print(y.head())

In [None]:
# Hold out 20% of the rows as a test set with a fixed seed, stratifying the
# split on the class_weight column of df (one label per row, in row order).
strata = df.class_weight.values.tolist()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=strata,
)

In [None]:
# Fit a regression tree (depth capped at 6, fixed seed) on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
reg = tree.DecisionTreeRegressor(random_state=42, max_depth=6).fit(X_train, y_train)

# Wrap the fitted tree for visualisation with dtreeviz.
viz_model = dtreeviz.model(
    reg,
    X_train=X_train,
    y_train=y_train,
    feature_names=X_train.columns,
    target_name='log_all_engrate',
)

# Render the tree and keep a handle to the rendered object, then show it.
v = viz_model.view()
v.show()

In [None]:
# Score the fitted tree on both splits; the target here is still on the log scale.
train_score, test_score = reg.score(X_train, y_train), reg.score(X_test, y_test)

print(f"Train R-squared: {train_score:.2f}")
print(f"Test R-squared: {test_score:.2f}")

In [None]:
# Recompute R-squared after mapping targets and predictions back from the log
# scale with np.exp.
# NOTE(review): np.exp assumes the target was built with a natural log — confirm
# against the preprocessing step.

# Predictions from the fitted tree (log scale)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

# True values, back-transformed
y_train_normal = np.exp(y_train)
y_test_normal = np.exp(y_test)

# Predictions, back-transformed
y_train_pred_normal = np.exp(y_train_pred)
y_test_pred_normal = np.exp(y_test_pred)

# R-squared between the back-transformed true values and predictions
r_squared_train = r2_score(y_true=y_train_normal, y_pred=y_train_pred_normal)
r_squared_test = r2_score(y_true=y_test_normal, y_pred=y_test_pred_normal)

print(f"Train (normal) R-squared Score: {r_squared_train:.2f}")
print(f"Test (normal) R-squared Score: {r_squared_test:.2f}")