# Machine Learning in Fundamental Analysis
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
import tushare as ts

## Data parsing

In [None]:
# pro = ts.pro_api('c1811742dad5bc1457bbdc81d14ef9e05c4bd17abdf9591bd730ddc5')
# ts.set_token('c1811742dad5bc1457bbdc81d14ef9e05c4bd17abdf9591bd730ddc5')

In [None]:
year = 2021
quarter = 3
df_profit = ts.get_profit_data(year,quarter)
df_op = ts.get_operation_data(year,quarter)

## Data preprocessing

In [None]:
# filter only sh stocks and remove *ST stocks
df_profit_600 = df_profit[(df_profit['code'].str.startswith('600') & (df_profit['name'].str.contains('ST') == False))]
df_op_600 = df_op[(df_op['code'].str.startswith('600') & (df_op['name'].str.contains('ST') == False))]

In [None]:
# set stock code column as index and inner join two tables
df_profit_600.set_index('code',inplace=True)
df_op_600.set_index('code',inplace=True)

# drop name column to avoid duplication when merging 
df_op_600.drop('name',axis=1,inplace=True)
df_profit_op_600 = df_profit_600.join(df_op_600,how='inner')

In [None]:
# get the stock price, sh index as of Nov 30th and Sep 30th
old_date='2021-09-30'
new_date='2021-11-30'

# get_hist_data returns a 1 line df each time, but what we need is a value
# so we have to slice the correct column ['close'] which returns a series, then convert it to values which returns an array, and slice the first item 
# however if there is no data on that day, an empty series would be returned, and slicing the first item would throw an error, so we also need to make sure the series is not empty by checking its len
df_profit_op_600['old_price'] = [ts.get_hist_data(i,old_date,old_date)['close'].values[0] 
                                 if len(ts.get_hist_data(i,old_date,old_date)['close']) !=0 else np.nan for i in df_profit_op_600.index]

df_profit_op_600['new_price'] = [ts.get_hist_data(i,new_date,new_date)['close'].values[0] 
                                 if len(ts.get_hist_data(i,new_date,new_date)['close']) !=0 else np.nan for i in df_profit_op_600.index]


df_profit_op_600['old_index'] = ts.get_hist_data('sh',old_date,old_date)['close'].values[0]
df_profit_op_600['new_index'] = ts.get_hist_data('sh',new_date,new_date)['close'].values[0]

In [None]:
# calculate the pct change rate for each stock and sh index
df_profit_op_600['price_change'] = (df_profit_op_600['new_price'] - df_profit_op_600['old_price'])/df_profit_op_600['old_price']
df_profit_op_600['index_change'] = (df_profit_op_600['new_index'] - df_profit_op_600['old_index'])/df_profit_op_600['old_index']

# compare both rates, see if a stock beats the market or not, *1 turns true/false into 1/0
df_profit_op_600['beat'] = (df_profit_op_600['price_change'] >= df_profit_op_600['index_change']) * 1

In [None]:
# drop useless features
df_profit_op_600 = df_profit_op_600.drop(['net_profits','business_income',
                                          'arturndays','inventory_days','currentasset_days',
                                          'old_price','new_price','old_sh_index','new_sh_index','price_change','index_change'],axis=1)

# drop rows with missing values
df_profit_op_600.dropna(inplace=True)

In [None]:
# split data into features and lables
X = df_profit_op_600.drop(['name','beat'], axis=1)
y = df_profit_op_600['beat']
X = scale(X)

# split data into training and testing set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## Train and test our model

In [None]:
# choose a model svm
# svm takes the nearest training data points (support vectors) of any class, and draw the boundary that has the largest distance to those support vectors, since in general the larger the margin the lower the generalization error of the classifier. 
# but since problems are usually not always perfectly separable with a boundary, so we allow some samples to be misclassified
from sklearn import svm

clf_svm = svm.SVC()

# train the model
clf_svm.fit(X_train,y_train)

# test the model
clf_svm.score(X_test,y_test)

# compute a 5-fold cross validation score on the whole dataset
# it lets the model to train on a random part of the data and test it on the rest, repeat 5 times
cross_val_score(clf_svm,X,y)

In [None]:
# choose a model knn
# knn makes predictions by figuring out what are the labels of the most similar samples, and use the average of those labels as y_pred
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier(n_neighbors=3)

# train the model
clf_knn.fit(X_train,y_train)

# test the model
clf_knn.score(X_test,y_test)

cross_val_score(clf_knn,X,y)

In [None]:
# choose a model
# a random forest has many decision trees (default 100 trees), each decision tree is trained and then used to predict the result, the final result is derived by taking the average of all trees' predictions(or by the majority rule)
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()

# train the model
clf_rf.fit(X_train,y_train)

# test the model
clf_rf.score(X_test,y_test)

cross_val_score(clf_rf,X,y)

it is prominent that RandomForestClassifier has an overall higher accuracy (approaching 70%) than SVM and KNN. We can feed in Q4 data later when they are available to this model to predict which stocks could beat the market and construct an asset pool from which we can pick our targets and utilize our technical strategies to trade.

## Save, load and predict
Once we have trained our model, we want to save it so that we dont need to re-train it from scratch again. 

In [None]:
import pickle

# save a model
with open('beat_clf.pkl','wb') as f:
    pickle.dump(clf_rf,f)

In [None]:
# load a model
with open('beat_clf.pkl','rb') as f:
    clf_new = pickle.load(f)

In [None]:
clf_new.predict(X_test)

## *Get more data (more dimensions and more samples)

### Define data processing functions

In [None]:
def get_stats(year,quarter):
    '''
    This function returns the stats data for all stocks at given year-quarter combinations
    year = yyyy
    quarter = 1,2,3 or 4
    '''
    df_profit = ts.get_profit_data(year,quarter)
    df_op = ts.get_operation_data(year,quarter)
    df_growth = ts.get_growth_data(year,quarter)
    df_debt = ts.get_debtpaying_data(year,quarter)
    df_cash = ts.get_cashflow_data(year,quarter)
    
    return df_profit,df_op,df_growth,df_debt,df_cash

In [None]:
def process_stats(df):
    '''
    Filter only SH stocks that are not *ST
    '''
    new_df = df[(df['code'].str.startswith('600') & (df['name'].str.contains('ST') == False))]
    new_df.set_index('code',inplace=True)
    
    return new_df

In [None]:
def add_stats(df,old_date,new_date,index_name='sh'):
    '''
    Return the stock price and specific index on given dates, and judge whether a stock beats the market
    old_date/new_date = yyyy-mm-dd
    '''
    df['old_price'] = [ts.get_hist_data(i,old_date,old_date)['close'].values[0] if len(ts.get_hist_data(i,old_date,old_date)['close']) !=0 else np.nan for i in df.index]
    df['new_price'] = [ts.get_hist_data(i,new_date,new_date)['close'].values[0] if len(ts.get_hist_data(i,new_date,new_date)['close']) !=0 else np.nan for i in df.index]
    df['old_index'] = ts.get_hist_data(index_name,old_date,old_date)['close'].values[0]
    df['new_index'] = ts.get_hist_data(index_name,new_date,new_date)['close'].values[0]
    
    df['price_change'] = (df['new_price'] - df['old_price'])/df['old_price']
    df['index_change'] = (df['new_index'] - df['old_index'])/df['old_index']
    
    df['beat'] = (df['price_change'] >= df['index_change']) * 1
    
    return df

In [None]:
def drop_stats(df):
    '''
    Drop useless columns and na rows
    '''
    new_df = df.drop(['net_profits','business_income','arturndays','inventory_days','currentasset_days','old_price','new_price','old_index','new_index','price_change','index_change'],axis=1)
    new_df.dropna(inplace=True)
    
    return new_df

### Get 2021Q1 data

In [None]:
year = 2021
quarter = 1
stats_dfs = get_stats(year,quarter)

In [None]:
old_date='2021-03-31'
new_date='2021-05-31'

main_df = pd.DataFrame()

for each_df in stats_dfs:
    each_df_new = process_stats(each_df)
    if main_df.empty:
        main_df = each_df_new
    else:
        each_df_new.drop('name',axis=1,inplace=True)
        main_df = main_df.join(each_df_new,how='inner')
    
main_df = add_stats(main_df,old_date,new_date,'sh')
main_df = drop_stats(main_df)

In [None]:
main_df.to_csv('df_2021Q1.csv',encoding='gbk')

### Get 2021Q2 data

In [None]:
year = 2021
quarter = 2
stats_dfs = get_stats(year,quarter)

old_date='2021-06-30'
new_date='2021-09-30'

main_df = pd.DataFrame()

for each_df in stats_dfs:
    each_df_new = process_stats(each_df)
    if main_df.empty:
        main_df = each_df_new
    else:
        each_df_new.drop('name',axis=1,inplace=True)
        main_df = main_df.join(each_df_new,how='inner')
    
main_df = add_stats(main_df,old_date,new_date,'sh')
main_df = drop_stats(main_df)

main_df.to_csv('df_2021Q2.csv',encoding='gbk')

### Get 2021Q3 data

In [None]:
year = 2021
quarter = 3
stats_dfs = get_stats(year,quarter)

old_date='2021-09-30'
new_date='2021-11-30'

main_df = pd.DataFrame()

for each_df in stats_dfs:
    each_df_new = process_stats(each_df)
    if main_df.empty:
        main_df = each_df_new
    else:
        each_df_new.drop('name',axis=1,inplace=True)
        main_df = main_df.join(each_df_new,how='inner')
    
main_df = add_stats(main_df,old_date,new_date,'sh')
main_df = drop_stats(main_df)

main_df.to_csv('df_2021Q3.csv',encoding='gbk')

### Reading data and start training

In [24]:
Qs = ['Q1','Q2','Q3']
batches = [pd.read_csv('df_2021'+i+'.csv',index_col=0,encoding='gbk') for i in Qs]

In [28]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()

for each_batch in batches:
    X = each_batch.drop(['name','beat'], axis=1)
    y = each_batch['beat']
    X = scale(X)
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    
    clf_rf.fit(X_train,y_train)
    print(f'training score:{clf_rf.score(X_train,y_train):.2f}')
    print(f'testing score:{clf_rf.score(X_test,y_test):.2f}')
    print(f'cross val score:{cross_val_score(clf_rf,X,y).mean():.2f}')


training score:1.00
testing score:0.69
cross val score:0.64
training score:1.00
testing score:0.65
cross val score:0.65
training score:1.00
testing score:0.62
cross val score:0.69


In [22]:
from sklearn import svm
clf_svm = svm.SVC()

for each_batch in batches:
    X = each_batch.drop(['name','beat'], axis=1)
    y = each_batch['beat']
    X = scale(X)
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    
    clf_svm.fit(X_train,y_train)
    print(f'training score:{clf_rf.score(X_train,y_train):.2f}')
    print(f'testing score:{clf_rf.score(X_test,y_test):.2f}')
    print(f'cross val score:{cross_val_score(clf_rf,X,y).mean():.2f}')

import pickle
with open('beat_clf.pkl','wb') as f:
    pickle.dump(clf_svm,f)

training score:0.58
testing score:0.64
cross val score:0.65
training score:0.41
testing score:0.41
cross val score:0.67
training score:0.93
testing score:0.95
cross val score:0.68


In [34]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=3)

for each_batch in batches:
    X = each_batch.drop(['name','beat'], axis=1)
    y = each_batch['beat']
    X = scale(X)
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    
    clf_knn.fit(X_train,y_train)
    print(f'training score:{clf_rf.score(X_train,y_train):.2f}')
    print(f'testing score:{clf_rf.score(X_test,y_test):.2f}')
    print(f'cross val score:{cross_val_score(clf_rf,X,y).mean():.2f}')
    

training score:0.61
testing score:0.63
cross val score:0.64
training score:0.38
testing score:0.37
cross val score:0.66
training score:0.93
testing score:0.92
cross val score:0.68


In the above examples, we trained three batches of data(Q1,Q2,Q3) and tested them separately to evaluate which model performs better. On average, all models are having a similar level of prediction accuracy, so we can choose any one from them and save to local for our backtesting later.