In [1]:
is_colab = True
if is_colab:
    data_dir = 'https://raw.githubusercontent.com/utm529f/sukkiri-ml-codes-polars/main/datafiles'
else:
    data_dir = '../datafiles'

In [2]:
import polars as pl
from sklearn import tree
df = pl.read_csv(f'{data_dir}/KvsT.csv')

In [3]:
x = df['体重':'年代']
t = df['派閥']
model = tree.DecisionTreeClassifier(max_depth = 1, random_state = 0)
model.fit(x, t)
data = pl.DataFrame({'体重':65, '年代':20}) # 予測用未知データ
print(model.predict(data)) # 予測派閥
model.predict_proba(data) # 派閥の確率

['きのこ']


array([[0.6, 0.4]])

In [4]:
import polars as pl
from sklearn.model_selection import train_test_split

df = pl.read_csv(f'{data_dir}/iris.csv')
df.head(2)

がく片長さ,がく片幅,花弁長さ,花弁幅,種類
f64,f64,f64,f64,str
0.22,0.63,0.08,0.04,"""Iris-setosa"""
0.17,0.42,0.35,0.04,"""Iris-setosa"""


In [5]:
# 平均値による欠損値の穴埋め
train2 = df.fill_null(strategy='mean')

# 特徴量と正解データに分割
x = train2[:'花弁幅']
t = train2['種類']

# 特徴量の標準化
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
new = sc.fit_transform(x)

In [6]:
# 訓練データと検証用データに分割
x_train, x_val, y_train, y_val = train_test_split(new, t,
    test_size=0.2, random_state=0)

In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.1, random_state=0, solver='lbfgs')

In [8]:
model.fit(x_train, y_train)
print('学習データの精度:', model.score(x_train, y_train))
print('テストデータの精度:', model.score(x_val, y_val))

学習データの精度: 0.8666666666666667
テストデータの精度: 0.8333333333333334


In [9]:
model.coef_

array([[-0.53168668,  0.48573155, -0.52638431, -0.83213317],
       [ 0.09482289, -0.44707757, -0.00107471, -0.04407356],
       [ 0.4368638 , -0.03865398,  0.52745902,  0.87620673]])

In [10]:
x_new = [[1, 2, 3, 4]] # 新規データ

model.predict(x_new) # 新規データで予測

array(['Iris-virginica'], dtype='<U15')

In [11]:
model.predict_proba(x_new)

array([[4.02790964e-05, 3.02983301e-03, 9.96929888e-01]])

In [12]:
model.intercept_

array([-0.34443521,  0.48693635, -0.14250114])

In [13]:
# モジュールの読み込み
import polars as pl
from sklearn.model_selection import train_test_split
%matplotlib inline

In [14]:
df = pl.read_csv(f'{data_dir}/Survived.csv') # csvファイルの読み込み
# 確認する
df.head(2)

PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,f64,i64,i64,str,f64,str,str
1,0,3,"""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [15]:
jo1 = pl.col('Pclass') == 1
jo2 = pl.col('Survived') == 0
jo3 = pl.col('Age').is_null()
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(43)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

jo2 = pl.col('Survived') == 1
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(35)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

jo1 = pl.col('Pclass') == 2
jo2 = pl.col('Survived') == 0
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(26)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

jo2 = pl.col('Survived') == 1
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(20)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

jo1 = pl.col('Pclass') == 3
jo2 = pl.col('Survived') == 0
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(43)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

jo2 = pl.col('Survived') == 1
df = df.with_columns(
    pl.when((jo1) & (jo2) & (jo3))
    .then(35)
    .otherwise(pl.col('Age'))
    .alias('Age')
)

In [16]:
# 特徴量として利用する列のリスト
col = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

x = df[col]
t = df['Survived']

# Sex列は文字の列なのでダミー変数化
dummy = df[['Sex']].to_dummies(drop_first = True)
x = pl.concat([x, dummy], how='horizontal')
x.head(2)

Pclass,Age,SibSp,Parch,Fare,Sex_female
i64,f64,i64,i64,f64,u8
3,22.0,1,0,7.25,0
1,38.0,1,0,71.2833,1


In [17]:
# ランダムフォレストのインポート
from sklearn.ensemble import RandomForestClassifier
x_train, x_test, y_train, y_test = train_test_split(x, t,
    test_size=0.2, random_state=0)
model = RandomForestClassifier(n_estimators=200, random_state=0)

In [18]:
model.fit(x_train, y_train)

print('学習データの精度:', model.score(x_train, y_train))
print('テストデータの精度:', model.score(x_test, y_test))

学習データの精度: 0.9887640449438202
テストデータの精度: 0.8715083798882681


In [19]:
from sklearn import tree
model2 = tree.DecisionTreeClassifier(random_state = 0)
model2.fit(x_train, y_train)

print('学習データの精度:', model2.score(x_train, y_train))
print('テストデータの精度:', model2.score(x_test, y_test))

学習データの精度: 0.9887640449438202
テストデータの精度: 0.8044692737430168


In [20]:
importance = model.feature_importances_ # 特徴量重要度
pl.DataFrame({
    'column': x_train.columns,
    'importance': importance
})

column,importance
str,f64
"""Pclass""",0.079275
"""Age""",0.323185
"""SibSp""",0.046608
"""Parch""",0.033482
"""Fare""",0.262721
"""Sex_female""",0.254729


In [21]:
# アダブーストのインポート
from sklearn.ensemble import AdaBoostClassifier

# ベースとなるモデル
from sklearn.tree import DecisionTreeClassifier

x_train, x_test, y_train, y_test = train_test_split(x, t,
    test_size=0.2, random_state=0)
# 最大の深さ5の決定木を何個も作っていく
base_model = DecisionTreeClassifier(random_state=0, max_depth=5)

# 決定木を500個作成
model = AdaBoostClassifier(n_estimators = 500,
    random_state=0, estimator=base_model)
model.fit(x_train, y_train) # 学習

print('学習データの精度:', model.score(x_train, y_train)) # 訓練データの正解率
print('テストデータの精度:', model.score(x_test, y_test)) # テストデータの正解率

学習データの精度: 0.9887640449438202
テストデータの精度: 0.8603351955307262


In [22]:
# データの読み込み
df = pl.read_csv(f'{data_dir}/cinema.csv')
df = df.fill_null(strategy='mean')
x = df[:, 'SNS1':'original']
t = df['sales']
x_train, x_test, y_train, y_test = train_test_split(x, t,
    test_size=0.2, random_state=0)

# ランダムフォレスト回帰
from sklearn.ensemble import RandomForestRegressor
# 100個のモデルで並列学習
model = RandomForestRegressor(random_state=0,
    n_estimators=100)
model.fit(x_train, y_train)
model.score(x_test, y_test) # 決定係数

0.5563347234627347

In [23]:
# アダブースト回帰
from sklearn.ensemble import AdaBoostRegressor
 # ベースモデルとしての回帰木
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor(random_state=0,
    max_depth=3)

# 100個のモデルで逐次学習
model = AdaBoostRegressor(random_state=0,
    n_estimators=100, estimator=base)
model.fit(x_train, y_train)
model.score(x_test, y_test)# 決定係数

0.6748482902800903