# Imports

In [None]:
import sqlite3
import csv
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer

# Mount Google Drive into the Colab filesystem at /content/drive.
# Later cells open the recipe database from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project's Code folder on the mounted Drive the working directory
%cd /content/drive/MyDrive/sjsu-aiml-project/Code

/content/drive/MyDrive/sjsu-aiml-project/Code


# Search

### Read in recipe database

In [None]:
# Load the whole `recipes` table from the project's SQLite file into a DataFrame.
conn = sqlite3.connect('/content/drive/MyDrive/sjsu-aiml-project/Code/database/recipes.sqlite')
try:
  df = pd.read_sql_query("SELECT * from recipes", conn)
finally:
  # Release the database handle even when the query raises,
  # so a failed run does not leave the file open in the kernel.
  conn.close()

# Verify that result of SQL query is stored in the dataframe
print(df.head())

   index                  title  \
0      0    No-Bake Nut Cookies   
1      1  Jewell Ball'S Chicken   
2      2            Creamy Corn   
3      3          Chicken Funny   
4      4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  
0  ["In a heavy 2-quart saucepan, mix brown sugar...  
1  ["Place chipped beef on bottom of baking dish....  
2  ["In a slow cooker, combine all ingredients. C...  
3  ["Boil and debone chicken.", "Put bite size pi...  
4  ["Combine first four ingredients and press in ...  


In [None]:
# Gather every recipe title, in table order, into a plain Python list.
# `cleanRecipeTitles` starts out empty; nothing in this cell fills it.
cleanRecipeTitles = []
recipeTitles = df['title'].tolist()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF model over the recipe titles, treating each title as one document.
# Any vectorizer options belong in this constructor call.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weight every title in a single pass
tfidf_vectors = tfidf_vectorizer.fit_transform(recipeTitles)

# Terms of the learned vocabulary; used below as the column labels
words_set = tfidf_vectorizer.get_feature_names_out()

# Expand the TF-IDF matrix into a regular (dense) array
tf_idf_array = tfidf_vectors.toarray()

# Table of scores: one row per title (document), one column per term
tfidf_df = pd.DataFrame(data=tf_idf_array, columns=words_set)

In [None]:
# Free-text query to look up among the recipe titles.
# Named `search_query` (not `input`) so the built-in input() stays usable
# in every cell that runs after this one.
search_query = 'burr'

# Split the query with the tokenizer built from the fitted vectorizer.
# The query is lower-cased by hand before tokenizing.
word_tokenizer = tfidf_vectorizer.build_tokenizer()
input_tokens = word_tokenizer(search_query.lower())

# Only tokens that exist as columns of tfidf_df can be scored; selecting an
# unknown token would raise KeyError, so drop those up front.
known_tokens = [token for token in input_tokens if token in tfidf_df.columns]

if known_tokens:
  # Get sum of DataFrame rows (which is a doc/recipe)
  # for input tokens that are represented as columns
  sumRows = tfidf_df[known_tokens].sum(axis=1)

  # sort scores to get closest match row/doc/recipe name
  topScore = sumRows.sort_values(ascending=False)

  print('TFIDF says this is the recipe: ', recipeTitles[topScore.index[0]])
else:
  # No query word occurs in any title: every score would be zero and the
  # "best" row would be arbitrary, so report the miss explicitly instead.
  print('TFIDF found no recipe matching: ', search_query)

TFIDF says this is the recipe:  Burr, It's Chili!


# Testing

### Loop on search string

In [None]:
# Evaluate the TF-IDF search: every (lower-cased) recipe title is used as a
# query, and the search counts as correct when its top-ranked result is that
# same title. Relies on `word_tokenizer` and `tfidf_df` from the cells above.
true_positives = 0
false_positives = 0
false_negatives = 0
precision = 0
recall = 0

# read in database for ground truth; the connection is closed even if the
# query raises so the SQLite file is not left open
conn = sqlite3.connect('/content/drive/MyDrive/sjsu-aiml-project/Code/database/recipes.sqlite')
try:
  recipeDatabase = pd.read_sql_query("SELECT * from recipes", conn)
finally:
  conn.close()

# build list of lower-cased recipe titles to serve as reference truth
# NOTE: this rebinds `recipeTitles`, which the Search section filled with the
# original-case titles.
recipeTitles = [title.lower() for title in recipeDatabase['title']]

# main loop to generate calculations
for search_string in recipeTitles:

  # tokenize the (already lower-cased) search string
  query_tokens = word_tokenizer(search_string)

  # Get sum of DataFrame rows (which is a doc/recipe)
  # for input tokens that are represented as columns
  sumRows = tfidf_df[query_tokens].sum(axis=1)

  # sort scores to get closest match row/doc/recipe name
  topScore = sumRows.sort_values(ascending=False)

  # look up the title of the highest-scoring row
  try:
    top_search_result_returned = recipeTitles[topScore.index[0]]
  except IndexError:
    # No usable top result for this query: count the miss and move on to the
    # next title, so the metrics still cover the whole table.
    false_negatives += 1
    continue

  # compare top result with query entry
  # if query is not top tf-idf result, false positive
  # and false negative are incremented
  if top_search_result_returned == search_string:
    true_positives += 1
  else:
    false_positives += 1
    false_negatives += 1

# Each ratio falls back to 0.0 when its denominator is zero (empty table or
# no true positives) instead of raising ZeroDivisionError.

# Precision = TruePositives / (TruePositives + FalsePositives)
predicted_positives = true_positives + false_positives
precision = true_positives / predicted_positives if predicted_positives else 0.0

# Recall = TruePositives / (TruePositives + FalseNegatives)
actual_positives = true_positives + false_negatives
recall = true_positives / actual_positives if actual_positives else 0.0

# F1 Score = 2 * (Precision * Recall) / (Precision + Recall)
if precision + recall:
  f1_score = 2 * ((precision * recall) / (precision + recall))
else:
  f1_score = 0.0

print('F1 Score: ', f1_score)

F1 Score:  0.8
