In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv('data.csv')
df.head()
len(df)

50560

Start preprocessing data.


In [4]:
#create column 'future mkvalt' that holds the market value of the next year
condition1 = df['tic'].shift(-1) == df['tic']
condition2 = df['fyear'].shift(-1) == df['fyear'] + 1
df['future_mkvalt'] = np.where(condition1 & condition2, df['mkvalt'].shift(-1), np.nan)

#drop nulls
df = df.dropna(subset=['mkvalt', 'future_mkvalt'], how='all')

#percent change is the percent change in market value from one year to the next
df['percent_change'] = (df['future_mkvalt']-df['mkvalt'])/df['mkvalt']

#if percent change is greater than 20%, classify as undervalued
df['undervalued'] = (df['percent_change'] > 0.10)

In [5]:
#fill null expenses with 0
df['xrd'] = df['xad'].fillna(0)
df['xad'] = df['xad'].fillna(0)
df['xsga'] = df['xsga'].fillna(0)
df['xint'] = df['xsga'].fillna(0)

#combine all expenses columns into one column
expenses_list = ['xrd', 'xad', 'xsga', 'xint', 'nopio', 'cogs']
df['expenses'] = df[expenses_list].sum(axis=1)

df = df.drop(columns=expenses_list)

#delete duplicate rows due to formatting
condition = df['indfmt'] != "FS"
df = df[condition]

#drop unneccessary columns
df = df.drop(columns=['indfmt', 'consol', 'popsrc', 'datafmt', 'curcd', 'gvkey', 'costat', 'dlcch'])

#remove rows with null values
condition = df.isnull().sum(axis=1) < 1
df = df[condition]


In [6]:
df.head(20)

Unnamed: 0,tic,fyear,act,ap,at,ceq,che,dlc,dltt,dp,...,sale,spi,txp,txt,xido,mkvalt,future_mkvalt,percent_change,undervalued,expenses
0,AIR,2009.0,863.429,114.906,1501.042,746.906,79.37,100.833,336.191,38.93,...,1352.151,-4.302,3.263,20.986,0.0,777.8348,1049.8206,0.34967,True,1368.672
1,AIR,2010.0,913.985,185.096,1703.727,835.845,57.433,114.075,329.802,59.296,...,1775.782,-1.536,0.0,35.364,-3.313,1049.8206,485.2897,-0.53774,False,1754.213
2,AIR,2011.0,1063.272,201.405,2195.653,864.649,67.72,122.865,669.489,80.333,...,2074.498,-13.864,0.0,25.48,0.0,485.2897,790.0029,0.6279,True,2042.744
3,AIR,2012.0,1033.7,149.3,2136.9,918.6,75.3,86.4,622.2,108.6,...,2167.1,-21.1,0.0,26.7,0.0,790.0029,961.308,0.216841,True,2136.1
4,AIR,2013.0,1116.9,171.1,2199.5,999.5,89.2,69.7,564.3,113.4,...,2035.0,0.0,0.0,32.1,0.0,961.308,1046.3954,0.088512,False,1979.7
5,AIR,2014.0,954.1,142.3,1515.0,845.1,54.7,69.0,85.0,92.3,...,1594.3,-48.4,0.0,-28.5,64.7,1046.3954,842.5112,-0.194844,False,1678.7
6,AIR,2015.0,873.1,163.4,1442.1,865.8,31.2,12.0,136.1,70.8,...,1662.6,-0.4,1.1,18.8,7.2,842.5112,1200.3288,0.424704,True,1696.2
7,AIR,2016.0,888.5,177.4,1504.1,914.2,10.3,2.0,155.3,71.0,...,1767.6,2.6,12.3,24.1,6.3,1200.3288,1551.458,0.292528,True,1815.9
8,AIR,2017.0,942.7,170.0,1524.7,936.3,41.6,0.0,177.2,40.5,...,1748.3,0.0,0.0,3.5,-58.1,1551.458,1046.7709,-0.325299,False,1829.5
11,AAL,2011.0,6757.0,1007.0,23848.0,-7111.0,4739.0,1518.0,6702.0,981.0,...,24022.0,-886.0,0.0,0.0,0.0,117.3438,266.5571,1.271591,True,26599.0


In [7]:
df.isnull().sum()

tic               0
fyear             0
act               0
ap                0
at                0
ceq               0
che               0
dlc               0
dltt              0
dp                0
dvc               0
ib                0
intan             0
invt              0
ivao              0
lct               0
lt                0
ppent             0
rect              0
sale              0
spi               0
txp               0
txt               0
xido              0
mkvalt            0
future_mkvalt     0
percent_change    0
undervalued       0
expenses          0
dtype: int64

In [8]:
len(df)

18538

In [9]:
corr_matrix = df.corr()
print(corr_matrix)

                   fyear       act        ap        at       ceq       che  \
fyear           1.000000  0.003210  0.000124  0.015891 -0.002615  0.008577   
act             0.003210  1.000000  0.657500  0.811638  0.768014  0.880383   
ap              0.000124  0.657500  1.000000  0.687887  0.602299  0.365547   
at              0.015891  0.811638  0.687887  1.000000  0.894086  0.617711   
ceq            -0.002615  0.768014  0.602299  0.894086  1.000000  0.661532   
che             0.008577  0.880383  0.365547  0.617711  0.661532  1.000000   
dlc             0.025826  0.588066  0.471521  0.651468  0.474496  0.407422   
dltt            0.051340  0.604433  0.525467  0.869566  0.615527  0.428107   
dp              0.011507  0.631257  0.602629  0.894548  0.808548  0.457925   
dvc             0.020075  0.714735  0.510826  0.817942  0.729712  0.588120   
ib             -0.014349  0.738977  0.577473  0.744924  0.752700  0.626707   
intan           0.028313  0.517736  0.384861  0.748441  0.618572

In [10]:
len(df)

18538

In [11]:
X = df.drop(columns=['tic', 'fyear', 'future_mkvalt', 'percent_change', 'undervalued'])
y = df['undervalued']

In [12]:
y.value_counts()

False    9801
True     8737
Name: undervalued, dtype: int64

In [29]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

X_train = X.iloc[3800:]
X_test = X.iloc[:3800]
y_train = y.iloc[3800:]
y_test = y.iloc[:3800]



X_train.head()
len(X_train)

14738

In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
rf_model = RandomForestClassifier()

params = [{'n_estimators': [100, 200, 400, 800],
        'max_depth': [5, 10, 20, 40, 80],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [8, 10, 12],
          }]

gs = GridSearchCV(rf_model,
                      param_grid=params,
                      scoring='precision',
                      cv=3, verbose=2, return_train_score=False)
gs.fit(X_train_scaled, y_train)
gs.best_params_

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=   6.2s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=   6.6s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=   6.5s
[CV] END max_depth=5, min_samples_lea

[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  12.6s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  12.4s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  12.4s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   3.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   3.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   3.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=400; total time= 

[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  10.3s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  10.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  10.3s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  20.7s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  20.9s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  20.8s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   2.5s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   2.5s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=12, n_estimators=200;

[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=200; total time=   7.2s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=200; total time=   7.6s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=200; total time=   7.3s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=400; total time=  14.7s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=400; total time=  15.3s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=400; total time=  14.9s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=800; total time=  29.5s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=800; total time=  31.0s
[CV] END max_depth=20, min_samples_leaf=3, min_samples_split=12, n_estimators=800; total time=  29.7s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=8, n_estimators=100; 

[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   3.7s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   4.0s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   3.9s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   7.7s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   8.2s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   7.8s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=  15.5s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=  16.8s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=  15.6s
[CV] END max_depth=40, min_samples_leaf=3, min_samples_split=8, n_estimators=800; total tim

[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  29.9s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  32.1s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=8, n_estimators=800; total time=  30.4s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   3.7s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   3.8s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   3.7s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   7.4s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   7.9s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   7.5s
[CV] END max_depth=40, min_samples_leaf=5, min_samples_split=10, n_estimators=400; to

[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  15.3s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  15.9s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  15.6s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  40.8s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  35.7s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  33.8s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   4.0s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   4.3s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   3.8s
[CV] END max_depth=80, min_samples_leaf=4, min_samples_split=12, n_estimators=200;

{'max_depth': 5,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 200}

In [31]:
#rf gs:
#{'max_depth': 5,
# 'min_samples_leaf': 5,
# 'min_samples_split': 8,
 #'n_estimators': 200}
    
rf_model = RandomForestClassifier(max_depth = 5, min_samples_leaf=5, min_samples_split=8, n_estimators=200)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)
precision_score(y_test, y_pred)

#precision is 0.54

0.5108695652173914

In [32]:
gains = 0
investment = 0
arr = y_test.index
len(arr)
for i in range(len(y_pred)):
    if(y_pred[i]):
        index = arr[i]
        investment = investment + 100
        change = 100  + 100*(df.loc[[index]]['percent_change'].values)
        gains = gains + change

print('Initial Investment: ' + str(investment))
print('Total Return: ' + str(gains))
print('Percent increase: ' + str((gains-investment)/investment))       


Initial Investment: 55200
Total Return: [79255.20736357]
Percent increase: [0.43578274]
