In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

# Load the penguin data and discard rows with missing values. dropna() returns
# a new frame, so re-running this cell never depends on earlier mutation.
penguin_df = pd.read_csv('penguins.csv').dropna()
output = penguin_df['species']
features = penguin_df[['island', 'bill_length_mm', 'bill_depth_mm',
      'flipper_length_mm', 'body_mass_g', 'sex']]
# One-hot encode the string columns (island, sex) as 0/1 integer columns.
features = pd.get_dummies(features, dtype=int)

# Encode species as integer codes; `uniques` maps a code back to its name.
output, uniques = pd.factorize(output, sort=True)
# NOTE(review): test_size=.8 holds out 80% of the rows for testing, so the
# model is fitted on only 20% of the data - confirm this split is intended.
x_train, x_test, y_train, y_test = train_test_split(features, output, test_size=.8, random_state=15)

# Fit on plain arrays (.values) so list inputs can be passed to predict later.
rfc = RandomForestClassifier(random_state=15)
rfc.fit(x_train.values, y_train)
y_pred = rfc.predict(x_test.values)
# accuracy_score expects (y_true, y_pred).
score = accuracy_score(y_test, y_pred)
print(f'Our accuracy score for this model is {score}')

# Check a few additional hand-written predictions. Each row holds nine values,
# one per encoded feature column. Every row is predicted once and the result
# reused for both the code and the label lookup.
sample_rows = [
    [50, 10, 5, 5, 1, 0, 0, 1, 0],
    [10, 30, 15, 5, 0, 0, 0, 0, 1],
    [46, 13, 211, 5500, 0, 0, 1, 0, 1],
    [0, 0, 0, 0, 1, 0, 0, 1, 0],
]
for row in sample_rows:
    pred = rfc.predict([row])
    print(f'Return {pred} : {uniques[pred]}')
print(list(uniques))

In [None]:
import polars as pl
import polars.selectors as cs
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

# Load the penguin data with polars and discard rows containing nulls.
penguin_df = pl.read_csv('penguins.csv').drop_nulls()

# Two-column frame (species, rank): rank is the dense rank of the species
# name minus one, i.e. a zero-based integer code per species.
expr = (
    penguin_df
    .select(pl.col('species'))
    .with_columns(rank = pl.col('species').rank('dense')-1)
)

# 1-D array of species names ordered by code, so uniques[code] is the label.
# Going through to_series() avoids the (n, 1) 2-D array that
# DataFrame.to_numpy() would produce.
uniques = expr.unique().sort('rank').select('species').to_series().to_numpy()
output = expr.select('rank').to_series().to_numpy()

# Drop the target and the year column, one-hot encode the string columns and
# pin the column order that the sample rows below rely on.
features = (penguin_df
            .select(pl.all().exclude(['species', 'year']))
            .to_dummies(cs.string())
            .select(
                ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g',
                 'island_Biscoe', 'island_Dream', 'island_Torgersen', 'sex_female',
                 'sex_male']
            )
           )

# NOTE(review): test_size=.8 holds out 80% of the rows for testing, so the
# model is fitted on only 20% of the data - confirm this split is intended.
x_train, x_test, y_train, y_test = train_test_split(
    features.to_numpy(), output, test_size=.8, random_state=15)
rfc = RandomForestClassifier(random_state=15)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
# accuracy_score expects (y_true, y_pred).
score = accuracy_score(y_test, y_pred)
print(f'Our accuracy score for this model is {score}')

# Check a few additional hand-written predictions
print(f'UNIQUES: {uniques.tolist()}')

a = rfc.predict([[50,10,5,5,1,0,0,1,0]])
b = rfc.predict([[46,13,211,4500,1,0,0,1,0]])
c = rfc.predict([[47,18,192,3500,1,0,0,0,1]])

print(f'Return {a} : {uniques[a]}')
print(f'Return {b} : {uniques[b]}')
print(f'Return {c} : {uniques[c]}')

# Uncomment to persist the fitted model and the code-to-species mapping:
# rf_pickle = open('random_forest_penguin.pickle', 'wb')
# pickle.dump(rfc, rf_pickle)
# rf_pickle.close()

# output_pickle = open('output_penguin.pickle', 'wb')
# pickle.dump(uniques, output_pickle)
# output_pickle.close()

In [None]:
# Deduplicate the (species, rank) rows first, then sort by rank and keep only
# the species column, giving the species names ordered by their integer code.
expr.unique().sort('rank').select('species')

In [None]:
# Sort by rank, keep the species column, then deduplicate. unique() does not
# keep row order unless maintain_order=True, so it is requested explicitly to
# make the displayed order deterministic.
expr.sort('rank').select('species').unique(maintain_order=True)

In [None]:
# Lookup table of each distinct species with its zero-based dense rank,
# ordered by that rank.
species_rank = (pl.col('species').rank('dense') - 1).alias('rank')
(
    penguin_df
    .select('species')
    .with_columns(species_rank)
    .unique()
    .sort('rank')
)

In [None]:
# Report the runtime type of each modelling object currently in the kernel.
for obj in (x_train, output, uniques, y_pred, y_train):
    print(type(obj))

# Pandas

In [None]:
# Report the runtime type of each modelling object currently in the kernel.
for obj in (x_train, output, uniques, y_pred, y_train):
    print(type(obj))

In [None]:
# Show the held-out feature matrix as a plain array. x_test is a pandas
# DataFrame after the pandas cell but already a NumPy array after the polars
# cell, which has no .values attribute, so fall back to the object itself.
getattr(x_test, 'values', x_test)

In [None]:
# Display the integer-encoded species labels of the training split.
y_train

In [None]:
# Preview the first rows of the cleaned penguin data.
penguin_df.head()

In [None]:
# Print one example row for every species present in the polars frame.
species_names = penguin_df.select('species').unique().to_series().to_list()
for species_name in species_names:
    first_row = penguin_df.filter(pl.col('species') == species_name).head(1)
    print(first_row)

In [None]:
# Re-run the three hand-written sample predictions against the current model
# and print each predicted code next to its species label.
sample_rows = (
    [50, 10, 5, 5, 1, 0, 0, 1, 0],
    [46, 13, 211, 4500, 1, 0, 0, 1, 0],
    [47, 18, 192, 3500, 1, 0, 0, 0, 1],
)
a, b, c = (rfc.predict([row]) for row in sample_rows)

for prediction in (a, b, c):
    print(f'Return {prediction} : {uniques[prediction]}')


In [None]:
# Print the first sample prediction (code) together with its species label.
print(f'Return {a} : {uniques[a]}')