In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
data = pd.read_csv('/content/denoised_final_transactions_dataset.csv')
data['date_BUY_fix'] = pd.to_datetime(data['date_BUY_fix'])

In [28]:
### Feature Selection
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = data[['PE_ratio', 'EPS_ratio', 'PS_ratio', 'PB_ratio', 'NetProfitMargin_ratio',
          'current_ratio', 'roa_ratio', 'roe_ratio']]
y = data['price_BUY']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Evaluation model
predictions = model.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score

# Assuming you have a regression problem and predictions are continuous.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")


# Select the most important 5 features
feature_importances = model.feature_importances_
sorted_idx = np.argsort(feature_importances)[-5:]
top_features = X.columns[sorted_idx]

print("Top 5 features selected by XGBoost:", top_features)

Mean Squared Error: 157.75
R-squared: 0.99
Top 5 features selected by XGBoost: Index(['roa_ratio', 'current_ratio', 'PB_ratio', 'PE_ratio', 'EPS_ratio'], dtype='object')


In [29]:
### PCA reduction
from sklearn.decomposition import PCA

features = data[['roa_ratio', 'current_ratio', 'PB_ratio', 'PE_ratio', 'EPS_ratio']]


pca1 = PCA(n_components=2)
pca2 = PCA(n_components=3)

transformed_features1 = pca1.fit_transform(features)
transformed_features2 = pca2.fit_transform(features)

transformed_data1 = pd.DataFrame(data=transformed_features1, columns=['PCA4', 'PCA5'])
transformed_data2 = pd.DataFrame(data=transformed_features2, columns=['PCA1', 'PCA2', 'PCA3'])

# Obtain the proportion of variance explained by each principal component
explained_variance_ratio1 = pca1.explained_variance_ratio_
explained_variance_ratio2 = pca2.explained_variance_ratio_
# Calculate the total variance explained rate
total_variance_explained1 = np.sum(explained_variance_ratio1)
total_variance_explained2 = np.sum(explained_variance_ratio2)

print("Total variance explained by ['PCA4', 'PCA5']:", total_variance_explained1)
print("Total variance explained by ['PCA1', 'PCA2', 'PCA3']:", total_variance_explained2)


Total variance explained by ['PCA4', 'PCA5']: 0.9929001300040743
Total variance explained by ['PCA1', 'PCA2', 'PCA3']: 0.9971808419413767


In [30]:
transformed_data1

Unnamed: 0,PCA4,PCA5
0,-19.725539,10.810920
1,-19.725539,10.810920
2,-19.725539,10.810920
3,-19.725539,10.810920
4,-19.725539,10.810920
...,...,...
405253,24.896901,-3.427554
405254,24.896901,-3.427554
405255,24.896901,-3.427554
405256,24.896901,-3.427554


In [31]:
transformed_data2

Unnamed: 0,PCA1,PCA2,PCA3
0,-19.725539,10.810920,0.409340
1,-19.725539,10.810920,0.409340
2,-19.725539,10.810920,0.409340
3,-19.725539,10.810920,0.409340
4,-19.725539,10.810920,0.409340
...,...,...,...
405253,24.896901,-3.427554,-2.220212
405254,24.896901,-3.427554,-2.220212
405255,24.896901,-3.427554,-2.220212
405256,24.896901,-3.427554,-2.220212


In [35]:
# Merge the data after dimensionality reduction with the original data
merged_data = pd.concat([data, transformed_data1, transformed_data2], axis=1)

from google.colab import files
merged_data.to_csv('dimension_reduction.csv', index=False)
files.download('dimension_reduction.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>