In [30]:
# Initialize Mistral client with API key
from mistralai import Mistral
from dotenv import load_dotenv
import os
load_dotenv()
api_key = os.environ["MISTRAL_KEY"] # Replace with your API key
client = Mistral(api_key=api_key)

# Import required libraries
from pathlib import Path
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
import json

# Verify PDF file exists
import base64
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
# Verify image exists
for i in os.listdir("/Users/eliot/Desktop/project_ML/dataset/receipts/"):
    time.sleep(1)
    image_file = Path(f"/Users/eliot/Desktop/project_ML/dataset/receipts/{i}")
    assert image_file.is_file()

    # Encode image as base64 for API
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    base64_data_url = f"data:image/jpeg;base64,{encoded}"

    image_response = client.ocr.process(
        document=ImageURLChunk(image_url=base64_data_url),
        model="mistral-ocr-latest"
    )

    # Convert response to JSON
    response_dict = json.loads(image_response.model_dump_json())
    json_string = json.dumps(response_dict, indent=4)

    image_ocr_markdown = response_dict["pages"][0]["markdown"]
    # Get structured response from model
    chat_response = client.chat.complete(
        model="pixtral-12b-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    ImageURLChunk(image_url=base64_data_url),
                    TextChunk(
                        text=(
                            f"This is image's OCR in markdown:\n\n{image_ocr_markdown}\n.\n"
                            "I wan't same structured json with only 'name' restaurant final 'total' and 'date', and the 'adress'"
                            "The output should be strictly be json with no extra commentary"
                            
                        )
                    ),
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    # Parse and return JSON response
    response_dict = json.loads(chat_response.choices[0].message.content)

    print(response_dict)
    # Remove rows with NaN in 'vendor' column from the DataFrame itself
    combined_df = combined_df.dropna(subset=["vendor"])
    col1  = response_dict["name"] +' '+ response_dict["address"]  # your target string
    # Then proceed with your code
    embeddings_col1 = col1
    embeddings_col2 = combined_df["vendor"].tolist()  # list of vendors (cleaned)

    # Create a combined list for TF-IDF (target first, then all vendors)
    all_texts = [embeddings_col1] + embeddings_col2

    # Vectorize
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Compute cosine similarity between restaurant_address and all vendors
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Put the results in a DataFrame
    similarity_df = pd.DataFrame({
        "vendor": embeddings_col2,
        "similarity": similarities,
        "amount":response_dict["total"]
    }).sort_values(by="similarity", ascending=False).head(5)

    print(combined_df.merge(similarity_df, on=["amount", "vendor"], how="inner"))

    # Sort by similarity

    # Display top matches




{'name': 'Tay Ho Oakland', 'address': 'Elegant Vietnamese Cuisine', 'total': 14.18, 'date': '2/19/19'}
         date  amount currency                                      vendor  \
0  2019-02-19   14.18      USD  Tay Ho Oakland, Elegant Vietnamese Cuisine   

   similarity  
0         1.0  
{'name': 'Pita Pita - Lombard', 'address': '211 E Roosevelt Rd Lombard, IL 60148', 'total': 228.92, 'date': '5/4/2017'}
         date  amount currency  \
0  2017-05-04  228.92      USD   

                                              vendor  similarity  
0  Pita Pita - Lombard, 211 E Roosevelt Rd, Lomba...         1.0  


SDKError: API error occurred: Status 429
{"message":"Requests rate limit exceeded"}

In [112]:
combined_df

Unnamed: 0,date,amount,currency,vendor
0,2016-09-01,7.61,USD,"7230 Pendleton Pike, Indianapolis, IN 46226"
1,2018-07-21,25.78,USD,"889 W. 190th Street, Los Angeles, CA 90248"
2,2014-12-21,25.94,USD,"Hialeah, FL 33010"
3,2017-05-12,45.58,USD,Ortega Pizzeria & Tavern
4,2016-11-12,117.00,USD,JBR
...,...,...,...,...
195,,24.47,USD,Dona Mercedes Restaurant
196,,9.58,USD,"11732 ARTESIA BLVD. ARTESIA, CA."
197,,40.30,USD,"18128 Pioneer Blvd, Artesia, CA 90701"
198,,10.23,USD,"Golden Coin, 94-450 Ukee St, Waipahu, HI 96797"


In [27]:

import os
import pandas as pd

folder_path = "/Users/eliot/Desktop/project_ML/dataset/bank_statements"
all_dfs = []

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):  # You can change this for other file types
        full_path = os.path.join(folder_path, filename)
        df = pd.read_csv(full_path)
        all_dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(all_dfs, ignore_index=True)





In [107]:
combined_df = combined_df.dropna(subset=["vendor"])
col1  = response_dict["name"] 
#+' '+ response_dict["address"] 

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Remove rows with NaN in 'vendor' column from the DataFrame itself
combined_df = combined_df.dropna(subset=["vendor"])
col1  = response_dict["name"] +' '+ response_dict["address"]  # your target string
# Then proceed with your code
embeddings_col1 = col1
embeddings_col2 = combined_df["vendor"].tolist()  # list of vendors (cleaned)

# Create a combined list for TF-IDF (target first, then all vendors)
all_texts = [embeddings_col1] + embeddings_col2

# Vectorize
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Compute cosine similarity between restaurant_address and all vendors
similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

# Put the results in a DataFrame
similarity_df = pd.DataFrame({
    "vendor": embeddings_col2,
    "similarity": similarities,
    "amount":response_dict["total"]
}).sort_values(by="similarity", ascending=False).head(5)

combined_df.merge(similarity_df, on=["amount", "vendor"], how="inner")

# Sort by similarity

# Display top matches


Unnamed: 0,date,amount,currency,vendor,similarity
0,2015-04-25,69.25,USD,"GRAND LUX CAFE, Roosevelt Field",1.0


Unnamed: 0,date,amount,currency,vendor,similarity
0,2016-05-26,56.58,USD,"5305 E PACIFIC COAST HWY, Long Beach, CA 90804",0.923419
