In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import os


In [2]:
# Load the data
cpi_data = pd.read_csv('C:/Users/tayla/OneDrive/Masaüstü/arsiv/CPALTT01USM657N (1).csv')
gdp_data = pd.read_csv('C:/Users/tayla/OneDrive/Masaüstü/arsiv/GDP (2).csv')
stock_data = pd.read_csv('C:/Users/tayla/OneDrive/Masaüstü/arsiv/Stock Market Dataset.csv')
unemployment_data = pd.read_csv('C:/Users/tayla/OneDrive/Masaüstü/arsiv/UNRATE (2).csv')


In [3]:
# Ensure consistent date formats and set dates as index
cpi_data['DATE'] = pd.to_datetime(cpi_data['DATE'])
gdp_data['DATE'] = pd.to_datetime(gdp_data['DATE'])
unemployment_data['DATE'] = pd.to_datetime(unemployment_data['DATE'])
stock_data['Date'] = pd.to_datetime(stock_data['Date'], dayfirst=True)

cpi_data.set_index('DATE', inplace=True)
gdp_data.set_index('DATE', inplace=True)
unemployment_data.set_index('DATE', inplace=True)
stock_data.set_index('Date', inplace=True)

# Sort indexes to ensure proper slicing
cpi_data = cpi_data.sort_index()
gdp_data = gdp_data.sort_index()
unemployment_data = unemployment_data.sort_index()
stock_data = stock_data.sort_index()


In [4]:
# Print date ranges to debug
print("CPI Date Range:", cpi_data.index.min(), "to", cpi_data.index.max())
print("GDP Date Range:", gdp_data.index.min(), "to", gdp_data.index.max())
print("Unemployment Date Range:", unemployment_data.index.min(), "to", unemployment_data.index.max())
print("Stock Data Date Range:", stock_data.index.min(), "to", stock_data.index.max())


CPI Date Range: 1960-01-01 00:00:00 to 2024-03-01 00:00:00
GDP Date Range: 1947-01-01 00:00:00 to 2024-01-01 00:00:00
Unemployment Date Range: 1948-01-01 00:00:00 to 2024-02-01 00:00:00
Stock Data Date Range: 2019-02-04 00:00:00 to 2024-02-02 00:00:00


In [5]:
# Convert stock prices to numeric, handling errors
stock_data = stock_data.apply(pd.to_numeric, errors='coerce')

# Resample to monthly/quarterly for GDP
gdp_data = gdp_data.resample('QE-DEC').mean()
cpi_data = cpi_data.resample('ME').mean()
unemployment_data = unemployment_data.resample('ME').mean()
stock_data = stock_data.resample('ME').mean()


In [6]:
# Filter data for the date range 2020-2024
start_date = '2020-01-01'
end_date = '2024-01-01'
gdp_data = gdp_data[start_date:end_date]
cpi_data = cpi_data[start_date:end_date]
unemployment_data = unemployment_data[start_date:end_date]
stock_data = stock_data[start_date:end_date]


In [7]:
# Print shapes of datasets to debug
print("Filtered GDP Data Shape:", gdp_data.shape)
print("Filtered CPI Data Shape:", cpi_data.shape)
print("Filtered Unemployment Data Shape:", unemployment_data.shape)
print("Filtered Stock Data Shape:", stock_data.shape)


Filtered GDP Data Shape: (16, 1)
Filtered CPI Data Shape: (48, 1)
Filtered Unemployment Data Shape: (48, 1)
Filtered Stock Data Shape: (48, 38)


In [8]:
# Remove unnecessary columns
problematic_columns = ['Unnamed: 0', 'Bitcoin_Price', 'Platinum_Price', 'Ethereum_Price', 'S&P_500_Price', 'Nasdaq_100_Price', 'Berkshire_Price', 'Gold_Price']
stock_data = stock_data.drop(columns=problematic_columns, errors='ignore')


In [9]:
# Combine datasets
combined_data = gdp_data.join(cpi_data, how='inner').join(unemployment_data, how='inner')
combined_data.columns = ['GDP', 'CPI', 'Unemployment_Rate']
combined_data = combined_data.join(stock_data, how='inner')

# Drop rows with NaN values
combined_data = combined_data.dropna()

# Print intermediate steps to debug
print("Combined Data Shape after dropna:", combined_data.shape)
print("Combined Data Head:\n", combined_data.head())


Combined Data Shape after dropna: (13, 33)
Combined Data Head:
                   GDP       CPI  Unemployment_Rate  Natural_Gas_Price  \
2020-09-30  21647.640  0.139275                7.8           2.274250   
2020-12-31  22024.502  0.094148                6.7           2.584273   
2021-03-31  22600.185  0.708327                6.1           2.622043   
2021-06-30  23292.362  0.929066                5.9           3.280429   
2021-09-30  23828.973  0.271597                4.7           5.142200   

            Natural_Gas_Vol.  Crude_oil_Price  Crude_oil_Vol.  Copper_Price  \
2020-09-30     146865.500000        39.769000   301370.500000      3.026900   
2020-12-31     135084.090909        47.068182   282186.363636      3.538523   
2021-03-31      93239.565217        62.357391   425732.608696      4.080065   
2021-06-30     120756.190476        71.525714   343877.142857      4.381619   
2021-09-30     167342.000000        71.704500   363229.500000      4.265900   

             Copper_Vo

In [10]:
# Calculate correlations
correlation_results = {}
indicators = ['GDP', 'CPI', 'Unemployment_Rate']

for stock in stock_data.columns:
    for indicator in indicators:
        correlation = combined_data[indicator].corr(combined_data[stock])
        correlation_results[(stock, indicator)] = correlation


In [11]:
# Find the best correlation
best_corr_pair = max(correlation_results, key=correlation_results.get)
best_stock, best_indicator = best_corr_pair
best_corr = correlation_results[best_corr_pair]

print(f"The best correlation is between {best_stock} and {best_indicator} with a correlation of {best_corr}")


The best correlation is between Berkshire_Vol. and GDP with a correlation of 0.8768444159067754


In [12]:
# Filter data for the best stock and indicator
filtered_data = combined_data[[best_indicator, best_stock]].dropna()

# Create the binary target variable for the stock trend
filtered_data['Stock_Trend'] = (filtered_data[best_stock].diff() > 0).astype(int)
filtered_data = filtered_data.dropna()


In [13]:
# Features and target variable
X = filtered_data[[best_indicator, best_stock]]
y = filtered_data['Stock_Trend']

# Ensure there are enough samples for splitting
print(f"Number of samples for splitting: {len(y)}")
if len(y) < 10:
    raise ValueError("Not enough samples to split the dataset.")


Number of samples for splitting: 13


In [14]:
# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = cross_val_score(rf_model, X, y, cv=cv, scoring='accuracy')
print(f"Random Forest Classifier CV Accuracy: {rf_scores.mean()}")


Random Forest Classifier CV Accuracy: 0.7666666666666666


In [15]:
# Standardize the features for SVM
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# SVM Classifier
svm_model = SVC(kernel='rbf', random_state=42)
svm_scores = cross_val_score(svm_model, X_scaled, y, cv=cv, scoring='accuracy')
print(f"Support Vector Machine CV Accuracy: {svm_scores.mean()}")


Support Vector Machine CV Accuracy: 0.7666666666666666


In [16]:
# Save the results to a local path
local_path = 'C:/Users/tayla/OneDrive/Masaüstü/results/'

# Ensure the directory exists
os.makedirs(local_path, exist_ok=True)

filtered_data.to_csv(local_path + 'proj_3_filtered_data.csv')


In [17]:
# Save feature importance (only for Random Forest)
rf_model.fit(X, y)
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame(sorted(zip(importances, feature_names), reverse=True), columns=['Importance', 'Feature'])
feature_importance_df.to_csv(local_path + 'proj_3_rf_feature_importance.csv', index=False)
