In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

from module.file_list import df_files

In [2]:
def csv_files_to_df():
    """Read every CSV path returned by ``df_files()`` and stack the rows.

    Returns:
        One DataFrame built with ``pd.concat``. Each file keeps its own row
        index, so index labels can repeat when several files are loaded.
    """
    frames = [pd.read_csv(csv_path) for csv_path in df_files()]
    return pd.concat(frames)

In [3]:
# Load every CSV listed by df_files() into a single DataFrame.
df = csv_files_to_df()

In [4]:
# Columns kept from the raw data (English glosses are approximate).
# The last entry is the column later used as the target `y`.
label = [
    '馬名',   # horse name
    '性齢',   # sex and age
    '斤量',   # carried weight
    '厩舎',   # stable
    '距離',   # race distance
    '着 順',  # finishing order (the column name really contains a space)
]

In [5]:
# Keep only the columns named in `label`. The explicit .copy() makes `df` an
# independent frame, so the column assignments performed later by
# encode_categorical do not trigger pandas' SettingWithCopyWarning.
df = df[label].copy()
df

Unnamed: 0,馬名,性齢,斤量,厩舎,距離,着 順
0,シゲルヒラマサ,牡3,56.0,美浦水野,1200,0
1,タイトルリーフ,牝3,54.0,美浦武市,1200,0
2,ノボホウセイ,牝3,54.0,美浦尾関,1200,1
3,ビバラビダ,牝3,54.0,美浦石栗,1200,1
4,ブライトピスケス,牡3,56.0,美浦菅原,1200,1
...,...,...,...,...,...,...
47816,ショウナンバニラ,牝5,53.0,美浦武市,1200,4
47817,クルークヴァール,牡6,54.0,栗東浅見,1200,4
47818,キーダイヤ,牝4,52.0,栗東藤岡,1200,4
47819,ショウナンアリアナ,牝5,54.0,栗東高野,1200,5


In [6]:
def encode_categorical(df, cols, encoders=None):
    """Label-encode the given columns of ``df`` in place and return ``df``.

    For each column a fresh ``LabelEncoder`` is fitted on the non-null values
    only. The resulting integer codes are written back aligned on the row
    index, so rows that were null in that column end up as NaN.

    Args:
        df: DataFrame to modify. It is mutated in place; the returned object
            is this same DataFrame, not a copy.
        cols: Iterable of column names to encode.
        encoders: Optional dict. When given, the fitted encoder for each
            column is stored under the column name so that the same
            value-to-code mapping can be reused later (for example when
            preparing new rows for a saved model). When omitted, the fitted
            encoders are discarded.

    Returns:
        The same ``df`` object, with the listed columns replaced by codes.
    """
    for col in cols:
        le = LabelEncoder()
        column = df[col]
        present = column[column.notnull()]
        # Re-indexing on present.index leaves NaN wherever the value was null.
        df[col] = pd.Series(le.fit_transform(present), index=present.index)
        if encoders is not None:
            encoders[col] = le
    return df

In [7]:
# encode_categorical writes the integer codes into `df` itself and returns that
# same object, so `df_encoded` and `df` are two names for one DataFrame.
df_encoded = encode_categorical(df,cols=['馬名','性齢','厩舎'])
# In-place drop of rows containing NaN; because of the aliasing above, the rows
# disappear from `df` as well.
df_encoded.dropna(inplace=True)
df_encoded

Unnamed: 0,馬名,性齢,斤量,厩舎,距離,着 順
0,11314,23,56.0,401,1200,0
1,15305,13,54.0,397,1200,0
2,20301,13,54.0,360,1200,1
3,22294,13,54.0,429,1200,1
4,23772,23,56.0,436,1200,1
...,...,...,...,...,...,...
47816,11976,15,53.0,397,1200,4
47817,7471,26,54.0,209,1200,4
47818,6716,14,52.0,243,1200,4
47819,11895,15,54.0,281,1200,5


In [8]:
# Feature matrix: the three label-encoded columns plus the two left unencoded.
X = df_encoded[['馬名','性齢','斤量','厩舎','距離']]
# Take the target from the same frame as X so both are guaranteed to contain
# exactly the same rows, without relying on `df` and `df_encoded` being the
# same underlying object.
y = df_encoded['着 順']
X

Unnamed: 0,馬名,性齢,斤量,厩舎,距離
0,11314,23,56.0,401,1200
1,15305,13,54.0,397,1200
2,20301,13,54.0,360,1200
3,22294,13,54.0,429,1200
4,23772,23,56.0,436,1200
...,...,...,...,...,...
47816,11976,15,53.0,397,1200
47817,7471,26,54.0,209,1200
47818,6716,14,52.0,243,1200
47819,11895,15,54.0,281,1200


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=77,
)

In [10]:
# Fit a random forest with default hyper-parameters; random_state pins the result.
clf = RandomForestClassifier(random_state=77).fit(X_train, y_train)

# Score on the held-out split: fraction of predictions that match the true label.
pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

In [11]:
# Show accuracy as a percentage truncated (not rounded) to one decimal place:
# int() drops everything past the third decimal digit before dividing by 10.
f"{int(accuracy * 1000) / 10}%"

'23.3%'

In [12]:
# Persist the fitted classifier with pickle. The `with` block guarantees the
# file handle is flushed and closed once the dump finishes.
# The 'model' directory must already exist, otherwise open() raises FileNotFoundError.
# NOTE: only the classifier is saved here, not the label encodings used for the features.
filename = 'model/model_grade02.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)