In [1]:
# General Python
import pickle
import string
import os.path

from datetime import datetime

# Data Science
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
def create_good_char_list(accepted_characters:str='') -> list:
    """Build the list of characters allowed in an ingredient tag.

    Parameters
    ----------
    accepted_characters : str
        Extra characters to allow; each character becomes one list entry.

    Returns
    -------
    list
        The characters of ``accepted_characters`` (in their given order),
        followed by the ASCII digits and then the lowercase ASCII letters.
        Duplicates are not removed.
    """
    return [*accepted_characters, *string.digits, *string.ascii_lowercase]

In [3]:
def create_df(off_filepath,nrows,chunk_size,sep, cols):
    """Load selected columns of a delimited file, dropping incomplete rows.

    The file is read with ``pandas.read_csv`` in chunks of ``chunk_size``
    rows. Rows containing any missing value are dropped from each chunk, and
    the surviving rows are combined with a single ``pandas.concat`` call
    (concatenating inside the loop would re-copy all accumulated rows on
    every iteration).

    Parameters
    ----------
    off_filepath : path or file-like object
        Passed straight to ``pandas.read_csv``.
    nrows : int
        Maximum number of rows to read from the file.
    chunk_size : int or None
        Number of rows per chunk. ``None`` reads the file in a single pass.
    sep : str
        Field delimiter.
    cols : dict
        Maps column name -> dtype. The keys select which columns are read and
        the mapping is passed as ``dtype``.

    Returns
    -------
    pandas.DataFrame
        The rows without missing values. ``concat`` is called without
        ``ignore_index``, so the index produced by ``read_csv`` is kept.
        An empty DataFrame is returned when the reader yields no chunk.
    """
    read_kwargs = dict(nrows=nrows, usecols=list(cols), sep=sep, dtype=cols)

    # Without a chunk size read_csv returns a DataFrame rather than an
    # iterator of chunks, so it has to be handled separately.
    if chunk_size is None:
        return pd.read_csv(off_filepath, **read_kwargs).dropna()

    clean_chunks = [
        chunk.dropna()
        for chunk in pd.read_csv(off_filepath, chunksize=chunk_size, **read_kwargs)
    ]

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not clean_chunks:
        return pd.DataFrame()
    return pd.concat(clean_chunks)

In [4]:
def _extract_english_ingredients(tags, allowed_chars):
    """Return the names of the usable English tags found in ``tags``.

    A tag is kept when it starts with ``en``, has a non-empty name after its
    three-character prefix, that name does not start with a digit, and every
    character of the whole tag is in ``allowed_chars``.
    """
    kept = []
    for tag in tags:
        name = tag[3:]
        if (tag[:2] == 'en'
                and name  # a bare "en:" tag has no name; indexing it would raise
                and name[0] not in string.digits
                and all(char in allowed_chars for char in tag)):
            kept.append(name)
    return kept


def train_model(nrows: int=10_000,
                chunk_size: int=None,
                off_filepath: str='../data/raw/en.openfoodfacts.org.products.csv.gz',
                accepted_characters: str="-:",
                model_folder: str="../models",
                model_filename: str="off_rf_model",
                coeffs_in_log_n: int=50,
                sep:str ='\t',
                ):
    """Train a random forest predicting ``energy-kcal_100g`` from ingredient tags.

    Steps: load the data with ``create_df``, keep the usable English
    ingredient tags, one-hot encode them per product, fit a
    ``RandomForestRegressor`` on a train split, pickle the model, and write a
    text log with the split sizes, the mean absolute error on the test split
    and the most important features.

    Parameters
    ----------
    nrows : int
        Maximum number of rows read from the data file.
    chunk_size : int or None
        Rows per chunk when reading. ``None`` uses ``nrows / 20`` rounded,
        with a minimum of 1.
    off_filepath : str
        Path of the delimited data file.
    accepted_characters : str
        Characters allowed in a tag in addition to digits and lowercase
        ASCII letters (see ``create_good_char_list``).
    model_folder : str
        Folder receiving the ``.pickle`` and ``.log`` files. It is created
        if it does not exist.
    model_filename : str
        Base name (without extension) of both output files.
    coeffs_in_log_n : int
        Number of top feature importances written to the log.
    sep : str
        Field delimiter of the data file.

    Returns
    -------
    None
        The results are the two files written to ``model_folder``.
    """
    if chunk_size is None:
        # round() yields 0 for small nrows, which read_csv rejects as a chunk size.
        chunk_size = max(1, round(nrows / 20))

    cols = {'product_name' : 'string',
            'energy-kcal_100g' : 'float32',
            'ingredients_tags': 'string',}

    df = create_df(off_filepath=off_filepath,
                   nrows=nrows,
                   chunk_size=chunk_size,
                   sep=sep,
                   cols=cols)

    # A set gives constant-time membership tests for the per-character check.
    good_chars = set(create_good_char_list(accepted_characters=accepted_characters))

    df['ingredients_tags'] = df['ingredients_tags'].str.split(',').map(
        lambda tags: _extract_english_ingredients(tags, good_chars))

    # One row per (product, ingredient) after explode; summing per original
    # index collapses the dummies back to one row per product.
    ingredient_dummies = (pd.get_dummies(df['ingredients_tags'].explode())
                          .groupby(level=0)
                          .sum())
    df = pd.concat([df, ingredient_dummies], axis=1)

    y = df['energy-kcal_100g']
    # Features are every column except the three that were loaded from the file.
    X = df[[col for col in df.columns if col not in cols]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    off_rf_model = RandomForestRegressor(random_state=42)
    off_rf_model.fit(X_train, y_train)

    # Make sure the output folder exists so a missing folder does not discard
    # the trained model. An empty folder name means the current directory.
    if model_folder:
        os.makedirs(model_folder, exist_ok=True)

    with open(os.path.join(model_folder, model_filename + ".pickle"), "wb") as model_file:
        pickle.dump(off_rf_model, model_file)

    y_pred = off_rf_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # [feature name, importance] pairs, most important first.
    importances = pd.DataFrame(zip(X.columns, off_rf_model.feature_importances_))
    coeffs_list = (importances
                   .sort_values(by=[1], ascending=False)
                   .head(coeffs_in_log_n)
                   .to_dict(orient='tight')['data'])

    with open(os.path.join(model_folder, model_filename + ".log"), 'w') as log_file:
        log_file.write(f"Model name : '{model_filename}'. Trained on the {datetime.now().strftime('%d/%m/%Y %H:%M:%S')} \n")
        log_file.write(f"X_train : {len(X_train)}, X_test : {len(X_test)} \n")
        log_file.write(f"Number of features: {len(X.columns)} \n")
        log_file.write(f"Mean Absolute Error: {mae} \n")
        for el in coeffs_list:
            log_file.write(f"{el} \n")

In [5]:
%%time
# Run the whole training pipeline with the default arguments of train_model:
# up to 10_000 rows are read from the default data file, and the fitted model
# (.pickle) and its training log (.log) are written under ../models.
train_model()

CPU times: total: 34.4 s
Wall time: 34.8 s
