In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from google.colab import drive
drive.mount('/content/drive')

# Read CSV files into DataFrames
folder_path = '/content/drive/MyDrive/CS418_Project/data/Data_wStocks/'
data_2020 = pd.read_csv(folder_path + 'rank_name_symbol_2020.csv')
data_2021 = pd.read_csv(folder_path + 'rank_name_symbol_2021.csv')
data_2022 = pd.read_csv(folder_path + 'rank_name_symbol_2022.csv')
data_2023 = pd.read_csv(folder_path + 'rank_name_symbol_2023.csv')

Mounted at /content/drive


In [2]:
# Clearn the data
data_2020.loc[:, 'm1-o':'gdp-12'] = data_2020.loc[:, 'm1-o':'gdp-12'].astype(float)
data_2021.loc[:, 'm1-o':'gdp-12'] = data_2021.loc[:, 'm1-o':'gdp-12'].astype(float)
data_2022.loc[:, 'm1-o':'gdp-12'] = data_2022.loc[:, 'm1-o':'gdp-12'].astype(float)
data_2023.loc[:, 'm1-o':'gdp-12'] = data_2023.loc[:, 'm1-o':'gdp-12'].astype(float)

# Remove commas from the 'Revenue' column
data_2020['Revenue'] = data_2020.loc[:,'Revenue'].str.replace(',', '')
data_2021['Revenue'] = data_2021.loc[:,'Revenue'].str.replace(',', '')
data_2022['Revenue'] = data_2022.loc[:,'Revenue'].str.replace(',', '')
data_2023['Revenue'] = data_2023.loc[:,'Revenue'].str.replace(',', '')

# Remove $ from the 'Revenue' column
data_2020['Revenue'] = data_2020.loc[:,'Revenue'].str.replace('$', '')
data_2021['Revenue'] = data_2021.loc[:,'Revenue'].str.replace('$', '')
data_2022['Revenue'] = data_2022.loc[:,'Revenue'].str.replace('$', '')
data_2023['Revenue'] = data_2023.loc[:,'Revenue'].str.replace('$', '')

# Convert the 'Revenue' column to float
data_2020['Revenue'] = data_2020.loc[:,'Revenue'].astype(float)
data_2021['Revenue'] = data_2021.loc[:,'Revenue'].astype(float)
data_2022['Revenue'] = data_2022.loc[:,'Revenue'].astype(float)
data_2023['Revenue'] = data_2023.loc[:,'Revenue'].astype(float)

# Remove commas from the 'Profit' column
data_2020['Profit'] = data_2020.loc[:,'Profit'].str.replace(',', '')
data_2021['Profit'] = data_2021.loc[:,'Profit'].str.replace(',', '')
data_2022['Profit'] = data_2022.loc[:,'Profit'].str.replace(',', '')
data_2023['Profit'] = data_2023.loc[:,'Profit'].str.replace(',', '')

# Remove $ from the 'Profit' column
data_2020['Profit'] = data_2020.loc[:,'Profit'].str.replace('$', '')
data_2021['Profit'] = data_2021.loc[:,'Profit'].str.replace('$', '')
data_2022['Profit'] = data_2022.loc[:,'Profit'].str.replace('$', '')
data_2023['Profit'] = data_2023.loc[:,'Profit'].str.replace('$', '')

# Remove () from the 'Profit' column
data_2020['Profit'] = data_2020.loc[:,'Profit'].str.replace(')', '')
data_2021['Profit'] = data_2021.loc[:,'Profit'].str.replace(')', '')
data_2022['Profit'] = data_2022.loc[:,'Profit'].str.replace(')', '')
data_2023['Profit'] = data_2023.loc[:,'Profit'].str.replace(')', '')
data_2020['Profit'] = data_2020.loc[:,'Profit'].str.replace('(', '')
data_2021['Profit'] = data_2021.loc[:,'Profit'].str.replace('(', '')
data_2022['Profit'] = data_2022.loc[:,'Profit'].str.replace('(', '')
data_2023['Profit'] = data_2023.loc[:,'Profit'].str.replace('(', '')

# Convert the 'Profit' column to float
data_2020['Profit'] = data_2020.loc[:,'Profit'].astype(float)
data_2021['Profit'] = data_2021.loc[:,'Profit'].astype(float)
data_2022['Profit'] = data_2022.loc[:,'Profit'].astype(float)
data_2023['Profit'] = data_2023.loc[:,'Profit'].astype(float)

In [3]:
# Add a 'Year' column to each DataFrame
data_2020['Year'] = 2020
data_2021['Year'] = 2021
data_2022['Year'] = 2022
data_2023['Year'] = 2023

# Remove the null values
data_2020_cleaned = data_2020.dropna()
data_2021_cleaned = data_2021.dropna()
data_2022_cleaned = data_2022.dropna()
data_2023_cleaned = data_2023.dropna()

In [4]:
df_combined = pd.concat([data_2020_cleaned, data_2021_cleaned, data_2022_cleaned, data_2023_cleaned], ignore_index=True)
df_temp = df_combined

In [5]:
grouped_by_sector_year = df_combined.groupby(['Sector', 'Year'])

In [6]:
# # Get the columns for stock price on monthly bases
monthly_columns = ['gdp-1', 'gdp-2', 'gdp-3', 'gdp-4', 'gdp-5', 'gdp-6', 'gdp-7', 'gdp-8', 'gdp-9', 'gdp-10', 'gdp-11', 'gdp-12']

In [7]:
df_combined['GDP'] = df_combined[monthly_columns].mean(axis=1).round(2)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error


# features for ranking prediction (e.g., Revenue, Profit, etc.)
features = ['Revenue', 'Profit', 'm1-o', 'm1-c', 'm2-o', 'm2-c', 'm3-o', 'm3-c', 'gdp-1', 'gdp-2', 'gdp-3', 'gdp-4']

# Prepare data for ranking prediction
X_ranking = df_combined[features]
y_ranking = df_combined['Rank']

# Split data into training and testing sets
X_train_ranking, X_test_ranking, y_train_ranking, y_test_ranking = train_test_split(X_ranking, y_ranking, test_size=0.2, random_state=42)

# Initialize and train the regression model
regression_model = LinearRegression()
regression_model.fit(X_train_ranking, y_train_ranking)

# Predict rankings for 2024
predicted_rankings_2024 = regression_model.predict(X_test_ranking)

# calculate Mean Absolute Error (MAE)
mae_ranking = mean_absolute_error(y_test_ranking, predicted_rankings_2024)
print("Mean Absolute Error (Ranking Prediction):", mae_ranking)


Mean Absolute Error (Ranking Prediction): 102.40256998319695


In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Feature Engineering: Add polynomial features
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_ranking)
X_test_poly = poly.transform(X_test_ranking)

# Model Selection: Ridge Regression with Polynomial Features
ridge_model = Ridge(alpha=0.1)  # You can adjust the alpha parameter for regularization
ridge_model.fit(X_train_poly, y_train_ranking)

# Predict rankings for 2024
predicted_rankings_2024 = ridge_model.predict(X_test_poly)

# Evaluate the model using Mean Absolute Error (MAE)
mae_ranking = mean_absolute_error(y_test_ranking, predicted_rankings_2024)
print("Mean Absolute Error (Ranking Prediction):", mae_ranking)


Mean Absolute Error (Ranking Prediction): 90.33970767728187


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd


# Select features for ranking prediction
features = ['Revenue', 'Profit', 'm1-o', 'm1-c', 'm2-o', 'm2-c', 'm3-o', 'm3-c', 'gdp-1', 'gdp-2', 'gdp-3', 'gdp-4']

# Prepare data for ranking prediction
X_ranking = df_combined[features]
y_ranking = df_combined['Rank']

# Split data into training and testing sets
X_train_ranking, X_test_ranking, y_train_ranking, y_test_ranking = train_test_split(X_ranking, y_ranking, test_size=0.2, random_state=42)

# Feature Selection: Select the top k features based on ANOVA F-value
selector = SelectKBest(score_func=f_regression, k=8)
X_train_selected = selector.fit_transform(X_train_ranking, y_train_ranking)
X_test_selected = selector.transform(X_test_ranking)

# Model Selection: Ensemble of RandomForestRegressor, GradientBoostingRegressor, XGBRegressor, and LGBMRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
lgbm_model = LGBMRegressor(n_estimators=100, random_state=42, verbosity = 0)

models = [
    ('rf', rf_model),
    ('gb', gb_model),
    ('xgb', xgb_model),
    ('lgbm', lgbm_model)
]

# Train and predict with each model
predicted_rankings_2024 = []
for name, model in models:
    model.fit(X_train_selected, y_train_ranking)
    predicted_rankings = model.predict(X_test_selected)
    predicted_rankings_2024.append(predicted_rankings)

# Ensemble predictions using VotingRegressor
voting_regressor = VotingRegressor(models)
voting_regressor.fit(X_train_selected, y_train_ranking)
ensemble_predictions = voting_regressor.predict(X_test_selected)

# Evaluate the ensemble model using Mean Absolute Error (MAE)
mae_ranking = mean_absolute_error(y_test_ranking, ensemble_predictions)
print("Mean Absolute Error (Ranking Prediction):", mae_ranking)


Mean Absolute Error (Ranking Prediction): 20.418432392811948


In this analysis, various machine learning techniques were employed to predict the rankings of Fortune 500 companies based on a diverse set of features including revenue, profit, market data (open and close prices for three months), and GDP data. Initially, a simple linear regression model was applied, resulting in a Mean Absolute Error (MAE) of 102.40. Subsequently, polynomial features were incorporated, and Ridge Regression was implemented, leading to a slight improvement in MAE to 90.34. Following this, a more advanced ensemble approach was adopted, combining RandomForestRegressor, GradientBoostingRegressor, XGBRegressor, and LGBMRegressor models. This ensemble model achieved a significantly lower MAE of 20.42, indicating a superior predictive accuracy. The MAE value signifies the average absolute difference between the predicted and actual rankings of Fortune 500 companies, suggesting that the ensemble model's predictions deviate from the actual rankings by approximately 20 positions on average. This level of accuracy underscores the effectiveness of the ensemble model in predicting the rankings of Fortune 500 companies, given the total number of companies in the list.