In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
colab_notebooks_path = '/content/drive/My Drive/Colab Notebooks/'
historical_file_path = colab_notebooks_path + 'historical_df_cleaned.csv'


In [4]:
historical_df_cleaned = pd.read_csv(historical_file_path)
five_yr_df_cleaned = pd.read_csv('/content/drive/My Drive/Colab Notebooks/potential_aristocrats_5_clean.csv')
ten_yr_df_cleaned = pd.read_csv('/content/drive/My Drive/Colab Notebooks/potential_aristocrats_10_clean.csv')
fifteen_yr_df_cleaned = pd.read_csv('/content/drive/My Drive/Colab Notebooks/potential_aristocrats_15_clean.csv')
twenty_yr_df_cleaned = pd.read_csv('/content/drive/My Drive/Colab Notebooks/potential_aristocrats_20_clean.csv')

In [5]:
# Add 'is_aristocrat' column to both datasets
historical_df_cleaned['is_aristocrat'] = 1  # All historical companies are aristocrats
five_yr_df_cleaned['is_aristocrat'] = 0  # None of the potential companies are aristocrats
ten_yr_df_cleaned['is_aristocrat'] = 0
fifteen_yr_df_cleaned['is_aristocrat'] = 0
twenty_yr_df_cleaned['is_aristocrat'] = 0

In [6]:
print(historical_df_cleaned.info())
print(five_yr_df_cleaned.info())
print(ten_yr_df_cleaned.info())
print(fifteen_yr_df_cleaned.info())
print(twenty_yr_df_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28920 entries, 0 to 28919
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   symbol                28920 non-null  object 
 1   date                  28920 non-null  object 
 2   close_price           28920 non-null  float64
 3   dividend_yield        28920 non-null  float64
 4   payout_ratio          28920 non-null  float64
 5   pe_ratio              28920 non-null  float64
 6   roe                   28920 non-null  float64
 7   free_cash_flow        28920 non-null  float64
 8   gross_margin          28920 non-null  float64
 9   operating_margin      28920 non-null  float64
 10  net_profit_margin     28920 non-null  float64
 11  price_to_book_ratio   28920 non-null  float64
 12  price_to_sales_ratio  28920 non-null  float64
 13  ev_ebitda             28920 non-null  float64
 14  revenue_growth_rate   28920 non-null  float64
 15  earnings_growth_rat

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Combine DataFrames
combined_df = pd.concat([historical_df_cleaned, twenty_yr_df_cleaned], ignore_index=True)

# Aggregate data by company
aggregated_df = combined_df.groupby('symbol').agg({
    'close_price': 'mean',
    'dividend_yield': 'mean',
    'payout_ratio': 'mean',
    'pe_ratio': 'mean',
    'roe': 'mean',
    'free_cash_flow': 'mean',
    'gross_margin': 'mean',
    'operating_margin': 'mean',
    'net_profit_margin': 'mean',
    'price_to_book_ratio': 'mean',
    'price_to_sales_ratio': 'mean',
    'ev_ebitda': 'mean',
    'revenue_growth_rate': 'mean',
    'earnings_growth_rate': 'mean',
    'market_cap': 'mean',
    'current_ratio': 'mean',
    'quick_ratio': 'mean',
    'employee_count': 'mean',
    'price_volatility': 'mean',
    'avg_volume': 'mean',
    'is_aristocrat': 'max'  # Assuming only historical_df_cleaned has 1s, so max will be 1 for those
}).reset_index()

# Feature and Target Variables
X = aggregated_df.drop(columns=['is_aristocrat', 'symbol'])
y = aggregated_df['is_aristocrat']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

# Evaluation
print("Random Forest Model Accuracy: ", rf.score(X_test, y_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score: ", roc_auc_score(y_test, y_prob))

# Get feature importances
feature_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature Importances:\n", feature_importances)

# Predict probabilities for potential aristocrats
potential_df = aggregated_df[aggregated_df['is_aristocrat'] == 0]
potential_X = potential_df.drop(columns=['is_aristocrat', 'symbol'])
potential_prob = rf.predict_proba(potential_X)[:, 1]
top_companies = potential_df.assign(probability=potential_prob).sort_values(by='probability', ascending=False).head(5)

print("\nTop 5 Most Likely Companies to Become Dividend Aristocrats:\n", top_companies[['symbol', 'probability']])


Random Forest Model Accuracy:  0.9090909090909091

Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95        40
           1       0.00      0.00      0.00         4

    accuracy                           0.91        44
   macro avg       0.45      0.50      0.48        44
weighted avg       0.83      0.91      0.87        44


ROC-AUC Score:  0.49687499999999996

Feature Importances:
 market_cap              0.081296
price_to_book_ratio     0.076846
dividend_yield          0.076142
employee_count          0.070092
free_cash_flow          0.067073
revenue_growth_rate     0.064068
operating_margin        0.054225
earnings_growth_rate    0.051515
ev_ebitda               0.045412
price_to_sales_ratio    0.044979
price_volatility        0.042715
gross_margin            0.040205
close_price             0.039772
quick_ratio             0.039478
payout_ratio            0.039321
roe                     0.038816
avg_volume

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
