In [36]:
import os
import random

import numpy as np
import pandas as pd
import shap
import yfinance as yf
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split



In [37]:
# Load the raw dataset from 'realone.csv' (path is relative to the notebook's
# working directory). Later cells read columns such as 'symbol' and
# 'price_diff_1mo' from this frame.
df = pd.read_csv('realone.csv')

In [38]:
# Label each row by the sign of its one-month price change: 'up' when the
# change is strictly positive, 'down' otherwise (a change of exactly 0 counts
# as 'down').
# A missing price_diff_1mo yields a missing label rather than 'down': the
# comparison NaN > 0 is False, so without the mask unknown outcomes would be
# trained on as losses. The dropna() applied to the modelling frame later
# discards those unlabeled rows.
df['gain/lost'] = (
    df['price_diff_1mo']
    .gt(0)
    .map({True: 'up', False: 'down'})
    .where(df['price_diff_1mo'].notna())
)

In [39]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call -- TODO confirm). errors='ignore' keeps this cell re-runnable:
# because it rebinds df, a second execution would otherwise raise KeyError
# once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [40]:
# Build the modelling frame: drop the ticker identifier, the price/ratio
# columns that are not used as features, and 'price_diff_1mo' (the label
# 'gain/lost' is derived from it, so keeping it would leak the target).
forest_df = df.drop(
    columns=[
        'symbol',
        'previous_price',
        'averageAnalystRating',
        'previousClose',
        'fiftyTwoWeekHighChangePercent',
        'fiftyTwoWeekLowChangePercent',
        'price_diff_1mo',
        'trailingPE',
        'forwardPE',
    ]
)

In [41]:
# Preview the modelling frame before cleaning (feature columns plus the
# 'gain/lost' label); missing values are still present at this point.
forest_df

Unnamed: 0,trailingEps,forwardEps,profitMargins,revenuePerShare,quickRatio,currentRatio,debtToEquity,gain/lost
0,-2.61,-1.90,0.00000,0.032,10.674,11.576,7.167,down
1,1.20,2.13,0.09531,13.211,2.188,2.789,0.000,down
2,-2.89,-3.46,-1.93691,1.494,0.896,1.155,16.869,down
3,-1.32,,-0.14883,2.429,1.037,1.101,6.626,down
4,-6525.00,0.00,-0.85970,74.726,0.069,0.247,1047.930,down
...,...,...,...,...,...,...,...,...
4831,0.02,0.00,0.05249,3.091,1.410,1.434,0.297,up
4832,-1.28,-0.71,0.00000,0.000,8.525,9.197,0.452,down
4833,0.00,0.06,-0.00170,2.633,1.158,1.949,43.982,up
4834,2.29,2.26,0.13206,18.932,7.133,7.847,137.005,up


In [42]:
# Treat +/-infinity as missing, then keep only rows in which every column is
# populated (this also removes the rows that were infinite).
forest_df = (
    forest_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
forest_df

Unnamed: 0,trailingEps,forwardEps,profitMargins,revenuePerShare,quickRatio,currentRatio,debtToEquity,gain/lost
0,-2.61,-1.90,0.00000,0.032,10.674,11.576,7.167,down
1,1.20,2.13,0.09531,13.211,2.188,2.789,0.000,down
2,-2.89,-3.46,-1.93691,1.494,0.896,1.155,16.869,down
4,-6525.00,0.00,-0.85970,74.726,0.069,0.247,1047.930,down
5,0.00,0.00,0.00000,0.000,0.000,0.000,0.000,up
...,...,...,...,...,...,...,...,...
4831,0.02,0.00,0.05249,3.091,1.410,1.434,0.297,up
4832,-1.28,-0.71,0.00000,0.000,8.525,9.197,0.452,down
4833,0.00,0.06,-0.00170,2.633,1.158,1.949,43.982,up
4834,2.29,2.26,0.13206,18.932,7.133,7.847,137.005,up


In [43]:
# Hold out 20% of the cleaned rows as the test set, then carve 30% of the
# remainder off as the validation set (56% / 24% / 20% overall). Both splits
# use the same fixed seed so the partitions are reproducible.
train, test = train_test_split(forest_df, test_size=0.2, random_state=123)
train, validate = train_test_split(train, test_size=0.3, random_state=123)

# Separate the feature matrix from the 'gain/lost' label in each partition.
x_train, y_train = train.drop(columns='gain/lost'), train['gain/lost']
x_val, y_val = validate.drop(columns='gain/lost'), validate['gain/lost']
x_test, y_test = test.drop(columns='gain/lost'), test['gain/lost']

In [45]:
# Random forest for the 'up'/'down' label. Trees are kept shallow
# (max_depth=6) with at least 10 samples per leaf, and
# class_weight='balanced' reweights the two classes inversely to their
# frequency in y_train. random_state pins the bootstrap/feature sampling.
rf = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=10,
    bootstrap=True,
    class_weight='balanced',
    random_state=123,
)
rf.fit(x_train, y_train)

# In-sample report: these predictions are for the same rows the forest was
# fitted on, so the scores are an optimistic baseline rather than an estimate
# of generalisation. Compare with the validation report.
y_pred = rf.predict(x_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

        down       0.55      0.71      0.62       888
          up       0.79      0.65      0.71      1461

    accuracy                           0.67      2349
   macro avg       0.67      0.68      0.67      2349
weighted avg       0.70      0.67      0.68      2349



In [46]:
# Re-display the cleaned modelling frame; no cell between the cleaning step
# and this one modifies forest_df.
forest_df

Unnamed: 0,trailingEps,forwardEps,profitMargins,revenuePerShare,quickRatio,currentRatio,debtToEquity,gain/lost
0,-2.61,-1.90,0.00000,0.032,10.674,11.576,7.167,down
1,1.20,2.13,0.09531,13.211,2.188,2.789,0.000,down
2,-2.89,-3.46,-1.93691,1.494,0.896,1.155,16.869,down
4,-6525.00,0.00,-0.85970,74.726,0.069,0.247,1047.930,down
5,0.00,0.00,0.00000,0.000,0.000,0.000,0.000,up
...,...,...,...,...,...,...,...,...
4831,0.02,0.00,0.05249,3.091,1.410,1.434,0.297,up
4832,-1.28,-0.71,0.00000,0.000,8.525,9.197,0.452,down
4833,0.00,0.06,-0.00170,2.633,1.158,1.949,43.982,up
4834,2.29,2.26,0.13206,18.932,7.133,7.847,137.005,up


In [47]:
# Predict labels for the validation features with the fitted forest.
y_val_pred = rf.predict(x_val)


In [35]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

        down       0.52      0.19      0.28       401
          up       0.62      0.88      0.73       606

    accuracy                           0.61      1007
   macro avg       0.57      0.54      0.50      1007
weighted avg       0.58      0.61      0.55      1007



In [48]:
# Oversample the minority class with SMOTE so both labels are equally
# represented. Only the training split is resampled; the validation and test
# splits are left untouched.
sm = SMOTE(random_state=123)
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

# Refit the existing forest on the balanced sample. This replaces the model
# previously fitted on the raw training split.
rf.fit(x_train_sm, y_train_sm)

In [50]:
# NOTE(review): despite its name, y_val_pred now holds predictions for the
# SMOTE-resampled *training* rows (x_train_sm) and overwrites the validation
# predictions computed earlier -- consider a name such as y_train_sm_pred.
y_val_pred = rf.predict(x_train_sm)

In [51]:
# In-sample report on the SMOTE-resampled training rows (at this point
# y_val_pred holds the predictions for x_train_sm, despite its name).
print(classification_report(y_train_sm, y_val_pred))

# Out-of-sample report: score the refitted forest on the untouched validation
# split, so the effect of oversampling can be compared with the validation
# report of the model fitted before resampling.
print(classification_report(y_val, rf.predict(x_val)))

              precision    recall  f1-score   support

        down       0.65      0.75      0.70      1461
          up       0.71      0.60      0.65      1461

    accuracy                           0.68      2922
   macro avg       0.68      0.68      0.67      2922
weighted avg       0.68      0.68      0.67      2922

