# Logistic regression model

In [2]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import PrecisionRecallDisplay, ConfusionMatrixDisplay, RocCurveDisplay, confusion_matrix, auc
from sklearn.metrics import f1_score, average_precision_score, precision_recall_curve, roc_curve, accuracy_score, roc_auc_score

In [3]:
def extract_duration(col):
    """Build a polars expression that converts a duration string column to minutes.

    The digits immediately preceding an ``H`` are read as hours and the digits
    immediately preceding an ``M`` as minutes. A component that is absent (or a
    null input value) contributes 0. Any other component of the string is ignored.
    Presumably the inputs look like ``PT1H30M`` -- TODO confirm against the data.

    Parameters
    ----------
    col : str
        Name of the string column to parse.

    Returns
    -------
    pl.Expr
        Integer expression ``hours * 60 + minutes``, aliased back to ``col`` so
        the parsed value replaces the original column name.
    """
    # Raw string literals: a backslash-d inside a normal literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    hours = pl.col(col).str.extract(r'(\d+)H').cast(pl.Int64).fill_null(0)
    minutes = pl.col(col).str.extract(r'(\d+)M').cast(pl.Int64).fill_null(0)
    return (hours * 60 + minutes).alias(col)

In [4]:
# Load the pickled cuisine table and keep its `Keywords` attribute/column.
# NOTE(review): `cuisine_path` is not defined in the visible cells -- presumably
# set in an earlier cell; confirm it exists on a fresh kernel.
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
cuisines = pd.read_pickle(cuisine_path+'cuisines.pkl').Keywords

In [5]:
# Feature matrix, built lazily from the recipes parquet file:
#   * word counts (split on single spaces) of Name and Description,
#   * element counts of the list columns,
#   * CookTime / PrepTime / TotalTime converted to minutes,
# all passed through log(1 + x), plus a boolean `cuisine` flag that is True when
# the recipe's Keywords share at least one entry with `cuisines`.
# RecipeCategory is not used as a feature.
X = (
    pl.scan_parquet(data_path+'recipes.parquet')
    .select(
        *[
            pl.col(text_col).str.split(' ').list.len().alias(f'{text_col}_len')
            for text_col in ('Name', 'Description')
        ],
        *[
            pl.col(list_col).list.len().alias(f'{list_col}_len')
            for list_col in (
                'Images',
                'Keywords',
                'RecipeIngredientParts',
                'RecipeInstructions',
            )
        ],
        *[
            extract_duration(time_col)
            for time_col in ('CookTime', 'PrepTime', 'TotalTime')
        ],
        pl.col('Keywords').alias('cuisine'),
    )
    .select(
        (pl.exclude('cuisine').cast(pl.Float64)+1).log(),
        pl.col('cuisine').list.set_intersection(cuisines.tolist()).list.len()>0,
    )
    .collect()
    .to_pandas()
)

In [6]:
# Display the feature names and their order as they will be fed to the model.
X.columns

Index(['Name_len', 'Description_len', 'Images_len', 'Keywords_len',
       'RecipeIngredientParts_len', 'RecipeInstructions_len', 'CookTime',
       'PrepTime', 'TotalTime', 'cuisine'],
      dtype='object')

In [7]:
# Distribution of the raw AggregatedRating values, including the null bucket.
(
    pl.scan_parquet(data_path+'recipes.parquet')
    .select('AggregatedRating')
    .collect()
    .get_column('AggregatedRating')
    .value_counts()
)

AggregatedRating,count
f64,u32
4.0,42829
3.0,9166
2.5,673
1.0,1677
3.5,3978
2.0,2049
5.0,174516
1.5,76
4.5,34330
,253223


In [8]:
# Binary target: True when AggregatedRating >= 4.
# Recipes with a missing rating are labelled False (negative class).
# The file is read separately from X, so X and y line up only because both
# reads return the rows in the same order.
y = (
    pl.scan_parquet(data_path + 'recipes.parquet')
    .select((pl.col('AggregatedRating') >= 4).fill_null(False))
    .collect()
    .to_pandas()
    .squeeze()
)

In [10]:
# Standardise the features, then fit a class-weight-balanced logistic regression.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=0, class_weight='balanced'),
)
# The parameter grid is empty, so there is a single candidate: the search
# only cross-validates `model` as configured above.
gs = GridSearchCV(estimator=model, param_grid={}, n_jobs=-1)

In [11]:
# Run the cross-validated search on the full feature matrix and target.
gs.fit(X, y)

In [12]:
# Tabulate the per-split timings and test scores recorded by the search.
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.751757,0.319737,0.136944,0.019952,{},0.685629,0.718987,0.741634,0.73462,0.678995,0.711973,0.025391,1


In [13]:
# Confusion matrix as fractions of all samples (normalize='all').
# NOTE(review): predictions are made on the same X, y that `gs` was fitted on,
# so this is not a held-out estimate.
confusion_matrix(y_true=y, y_pred=gs.predict(X), normalize='all').tolist()

[[0.45076810898018627, 0.06757292107242444],
 [0.22010767113797255, 0.26155129880941674]]