In [1]:
import os
import re
import warnings

warnings.filterwarnings("ignore")

import nltk
import numpy as np
import pandas as pd
import psycopg2
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Connection settings for the PostgreSQL database. Each value can be overridden
# through an environment variable so real credentials never have to be written
# into the notebook; the fallbacks are the original local-development values.
db_config = {
    "dbname": os.environ.get("FOOD_DB_NAME", "food"),
    "user": os.environ.get("FOOD_DB_USER", "postgres"),
    "password": os.environ.get("FOOD_DB_PASSWORD", "postgres"),
    "host": os.environ.get("FOOD_DB_HOST", "localhost"),
    "port": os.environ.get("FOOD_DB_PORT", "5432"),
}


def load_table(table_name, connection):
    """Read every row of ``table_name`` into a pandas DataFrame.

    Args:
        table_name: name of the table to read. It is interpolated directly
            into the SQL text, so it must be a trusted literal.
        connection: open database connection handed to ``pd.read_sql_query``.

    Returns:
        DataFrame produced by ``SELECT * FROM <table_name>``.
    """
    return pd.read_sql_query(f"SELECT * FROM {table_name}", connection)


conn = None
try:
    conn = psycopg2.connect(**db_config)

    branded_food = load_table("branded_food", conn)
    food = load_table("food", conn)
    food_attribute = load_table("food_attribute", conn)
    food_attribute_type = load_table("food_attribute_type", conn)
    food_nutrient = load_table("food_nutrient", conn)
    food_update_log_entry = load_table("food_update_log_entry", conn)
    measure_unit = load_table("measure_unit", conn)
    microbe = load_table("microbe", conn)
    nutrient = load_table("nutrient", conn)
    nutrient_incoming_name = load_table("nutrient_incoming_name", conn)

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise: the cells below cannot run without these tables, and failing
    # here is clearer than a later error on a half-initialised name.
    raise

finally:
    # Close the connection whether or not every table was read successfully.
    if conn is not None:
        conn.close()

In [5]:
# Descriptive text per branded food: one row per fdc_id present in both tables.
data_food = pd.merge(food, branded_food, on='fdc_id').loc[
    :, ['fdc_id', 'description', 'ingredients', 'branded_food_category']
]

# Attach the nutrient name to every (food, nutrient) measurement row.
food_nutrients_combined = pd.merge(
    food_nutrient,
    nutrient,
    left_on='nutrient_id',
    right_on='id',
).loc[:, ['fdc_id', 'nutrient_id', 'name']]

# Long-format frame: one row per (food, nutrient) pair, carrying the food's text columns.
data = pd.merge(food_nutrients_combined, data_food, on='fdc_id')

In [6]:
def preprocess_data(data, data_count):
    """Collapse the long (food, nutrient) frame into one row per distinct food record.

    Only the first ``data_count`` rows of ``data`` are used. For every
    ``fdc_id`` the nutrient names are joined, split on single spaces,
    de-duplicated and sorted (``np.unique``), re-joined with spaces, and the
    characters ``,``, ``(``, ``)`` and ``+`` are replaced by spaces. The result
    is stored in a new ``nutrients`` column.

    Args:
        data: DataFrame with at least ``fdc_id``, ``nutrient_id`` and ``name``
            columns; ``name`` values must be strings.
        data_count: number of leading rows of ``data`` to keep.

    Returns:
        A new DataFrame without ``nutrient_id`` and ``name``, with the added
        ``nutrients`` column and exact duplicate rows removed. The original
        index labels of the surviving rows are kept. ``data`` is not modified.
    """
    # Explicit copy: adding a column to a bare slice is chained assignment.
    subset = data.iloc[:data_count].copy()

    def _nutrient_text(names):
        # Sorted unique whitespace-separated tokens across all names of one food.
        tokens = np.unique(' '.join(names).split(' '))
        text = ' '.join(tokens)
        for char in (',', '(', ')', '+'):
            text = text.replace(char, ' ')
        return text

    # Build the text once per food and broadcast it, instead of once per row.
    nutrients_by_food = subset.groupby('fdc_id')['name'].agg(_nutrient_text)
    subset['nutrients'] = subset['fdc_id'].map(nutrients_by_food)

    subset = subset.drop(['nutrient_id', 'name'], axis=1)
    return subset.drop_duplicates()

In [7]:
def text_processing(data):
    """Normalise and stem the text columns used as model input.

    ``description``, ``ingredients`` and ``nutrients`` are lower-cased,
    stripped of every character that is neither a word character nor
    whitespace, tokenised with ``word_tokenize`` and stemmed with the Porter
    stemmer. A ``food_category`` column is derived from
    ``branded_food_category`` with the same lower-casing and stripping, but
    without stemming.

    Args:
        data: DataFrame with ``description``, ``ingredients``, ``nutrients``
            and ``branded_food_category`` columns. Every value must be a
            string, because ``.lower()`` is called on each one.

    Returns:
        A new DataFrame with the processed columns and a fresh 0..n-1 index.
        The input frame is left untouched.
    """
    # Work on a copy so re-running the cell never double-processes the input.
    data = data.copy()
    data['food_category'] = data['branded_food_category']

    non_word = re.compile(r'[^\w\s]')
    stemmer = PorterStemmer()

    def normalise(text):
        return non_word.sub('', text.lower())

    def stem_string(text):
        # text is already lower-cased by normalise()
        return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

    #nltk.download('punkt')

    for column in ('description', 'ingredients', 'nutrients', 'food_category'):
        data[column] = data[column].map(normalise)

    # The category label is normalised but deliberately not stemmed.
    for column in ('description', 'ingredients', 'nutrients'):
        data[column] = data[column].map(stem_string)

    return data.reset_index(drop=True)

In [8]:
def feature_engineering(data):
    """Build the TF-IDF feature matrix and the encoded category label.

    A separate TF-IDF vocabulary (at most 100 terms, English stop words
    removed, ``min_df=0.01``, ``max_df=0.9``) is fitted for each of the
    ``description``, ``ingredients`` and ``nutrients`` columns. Feature
    columns are named ``<source column>_<term>`` so that a term occurring in
    more than one text column cannot produce duplicate column names, and so
    that no term can collide with the ``category`` label column.

    Side effect: a ``food_category_encoded`` column is added to ``data`` in
    place. ``insert_into_db`` reads that column to translate encoded labels
    back into category names.

    Args:
        data: DataFrame with string columns ``description``, ``ingredients``,
            ``nutrients`` and ``food_category``.

    Returns:
        DataFrame with one row per row of ``data``: the TF-IDF feature columns
        followed by an integer ``category`` column.
    """
    le = LabelEncoder()
    encoded = le.fit_transform(data['food_category'])
    data['food_category_encoded'] = encoded

    def tfidf_frame(column):
        # A fresh vectorizer per column: each text source gets its own vocabulary.
        vectorizer = TfidfVectorizer(stop_words='english', max_features=100, min_df=0.01, max_df=0.9)
        matrix = vectorizer.fit_transform(data[column])
        names = [f"{column}_{term}" for term in vectorizer.get_feature_names_out()]
        return pd.DataFrame(matrix.toarray(), columns=names)

    df = pd.concat(
        [tfidf_frame('description'), tfidf_frame('ingredients'), tfidf_frame('nutrients')],
        axis=1,
    )

    # Assign the raw array (positional) rather than the Series, so the labels
    # line up with the feature rows even if `data` has a non-default index.
    df['category'] = encoded
    return df

In [14]:
def insert_into_db(predictions, data, accuracy):
    """Write every (actual, predicted) category pair to the ``predictions`` table.

    Args:
        predictions: DataFrame with ``y_test`` and ``y_pred`` columns holding
            encoded category labels.
        data: DataFrame with ``food_category_encoded`` and ``food_category``
            columns, used to translate encoded labels back to category names.
        accuracy: value stored in the ``accuracy`` column of every inserted row.

    Raises:
        KeyError: if a label in ``predictions`` does not occur in
            ``data['food_category_encoded']``.
    """
    insert_query = """
        INSERT INTO predictions (model_name, accuracy, actual_value, predicted_value)
        VALUES (%s, %s, %s, %s);
    """

    # Build the code -> name lookup once instead of filtering `data` twice per
    # prediction. The first occurrence of each code wins.
    category_by_code = (
        data.drop_duplicates('food_category_encoded')
        .set_index('food_category_encoded')['food_category']
        .to_dict()
    )
    rows = [
        ("RandomForestClassifier", accuracy, category_by_code[actual], category_by_code[predicted])
        for actual, predicted in zip(predictions['y_test'], predictions['y_pred'])
    ]

    connection = psycopg2.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            cursor.executemany(insert_query, rows)
        connection.commit()
    finally:
        # Always release the connection; closing without commit discards
        # any partially inserted rows.
        connection.close()

In [16]:
def training_model_metrics(df, data):
    """Train a random forest on an 80/20 split and report test-set metrics.

    Side effect: the test-set predictions are written to the database through
    ``insert_into_db``.

    Args:
        df: feature DataFrame containing a ``category`` label column; every
            other column is used as a feature.
        data: DataFrame passed through to ``insert_into_db`` for translating
            encoded labels back to category names.

    Returns:
        dict with ``accuracy``, ``precision`` (macro-averaged) and
        ``f1_score`` (weighted-averaged) computed on the held-out 20 %.
    """
    X = df.drop('category', axis=1)
    y = df['category']

    # Seed the split as well as the model; otherwise every run evaluates on a
    # different test set and the stored metrics are not reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        # NOTE(review): precision is macro-averaged while F1 is weighted —
        # confirm the mixed averaging is intentional.
        'precision': precision_score(y_test, y_pred, average='macro'),
        'f1_score': f1_score(y_test, y_pred, average='weighted'),
    }

    predictions = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
    insert_into_db(predictions, data, metrics['accuracy'])
    return metrics

In [18]:
def insert_into_metrics(metrics, size):
    """Store accuracy, precision and F1 score in the ``performance_metrics`` table.

    Each metric is rounded to two decimals and inserted as its own row.

    Args:
        metrics: mapping with ``accuracy``, ``precision`` and ``f1_score`` keys.
        size: value written to the ``data_count`` column of every row.

    Raises:
        KeyError: if one of the three metric keys is missing. This is raised
            before any database connection is opened.
    """
    query = """
        INSERT INTO performance_metrics (model_name, metric_name, metric_value, data_count)
        VALUES (%s, %s, %s, %s)
    """

    # Build all rows up front so a bad `metrics` dict fails before connecting.
    rows = [
        ("RandomForestClassifier", metric_name, round(metrics[key], 2), size)
        for metric_name, key in (
            ("Accuracy", "accuracy"),
            ("Precision", "precision"),
            ("F1 Score", "f1_score"),
        )
    ]

    connection = psycopg2.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            cursor.executemany(query, rows)
        connection.commit()
    finally:
        # Always release the connection, even if an insert fails.
        connection.close()

In [20]:
def run_model(data, data_count):
    """Run the whole pipeline on the first ``data_count`` rows of ``data``.

    Steps, in order: ``preprocess_data`` -> ``text_processing`` ->
    ``feature_engineering`` -> ``training_model_metrics`` ->
    ``insert_into_metrics``. The number of feature rows is recorded alongside
    the metrics. Nothing is returned.
    """
    prepared = text_processing(preprocess_data(data, data_count))
    features = feature_engineering(prepared)
    scores = training_model_metrics(features, prepared)
    row_count = features.shape[0]
    insert_into_metrics(scores, row_count)

In [None]:
# Run the pipeline on (up to) the first 15 million rows of the merged frame.
run_model(data, data_count=15_000_000)