In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
src_path = '/content/drive/MyDrive/Data.json'  # Update with your source path

In [7]:
import json
import pandas as pd

# Load the JSON data from a file
file_path = src_path  # Replace with your actual file path

with open(file_path, 'r') as f:
    json_data = json.load(f)

# Convert to a pandas DataFrame
df = pd.DataFrame(json_data)

# Display the DataFrame
print("DataFrame:")
print(df)

# Convert 'expiresAt' to datetime for further analysis
df['expiresAt'] = pd.to_datetime(df['expiresAt'])

# Convert 'weight' to numeric (removing the 'g')
#df['weight'] = df['weight'].str.replace('g', '').astype(float)

# Descriptive statistics
print("\nSummary Statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Products with less than 50 available units
print("\nProducts with less than 50 available units:")
print(df[df['available'] < 50])

# Items sorted by expiration date
print("\nItems sorted by expiration date:")
print(df.sort_values(by='expiresAt'))

# Items sorted by price
print("\nItems sorted by price:")
print(df.sort_values(by='price'))


DataFrame:
          id                         name   expiresAt  price weight  \
0    8376291      Organic Cherry Tomatoes  2024-09-23   2.99   250g   
1    5728364               Sweet Potatoes  2024-10-15   1.79    1kg   
2    9126483             Broccoli Florets  2024-09-18   2.49   400g   
3    4537281             Red Bell Peppers  2024-12-02   1.29   500g   
4    1928374                 Baby Spinach  2024-09-27   3.99   150g   
..       ...                          ...         ...    ...    ...   
348  1728391       Mung Beans, Whole, 1kg  2025-01-14   4.99    1kg   
349  6283742      Fava Beans, Dried, 250g  2024-11-09   2.29   250g   
350  3847292  Adzuki Beans, Organic, 500g  2025-04-22   3.79   500g   
351  5928372        Soybeans, Yellow, 1kg  2024-09-29   3.29    1kg   
352  8374627    White Beans, Canned, 400g  2025-06-17   1.19   400g   

    packagingUnit  available  
0          punnet         48  
1             bag         23  
2             bag         17  
3           

In [11]:
from datetime import datetime, timedelta
import numpy as np

In [13]:

# Get today's date
today = pd.to_datetime(datetime.now().date())

# Calculate expiration window (items expiring in the next 10 days)
expiration_threshold = today + timedelta(days=15)

# Add a discount field initialized to 0
df['discount'] = 0

# Apply a random discount between 10 and 40 percent to items expiring in the next 10 days
df.loc[df['expiresAt'] <= expiration_threshold, 'discount'] = np.random.randint(10, 41, df[df['expiresAt'] <= expiration_threshold].shape[0])

# Display the updated DataFrame with discounts
print("DataFrame with Discount:")
print(df)

DataFrame with Discount:
          id                         name  expiresAt  price weight  \
0    8376291      Organic Cherry Tomatoes 2024-09-23   2.99   250g   
1    5728364               Sweet Potatoes 2024-10-15   1.79    1kg   
2    9126483             Broccoli Florets 2024-09-18   2.49   400g   
3    4537281             Red Bell Peppers 2024-12-02   1.29   500g   
4    1928374                 Baby Spinach 2024-09-27   3.99   150g   
..       ...                          ...        ...    ...    ...   
348  1728391       Mung Beans, Whole, 1kg 2025-01-14   4.99    1kg   
349  6283742      Fava Beans, Dried, 250g 2024-11-09   2.29   250g   
350  3847292  Adzuki Beans, Organic, 500g 2025-04-22   3.79   500g   
351  5928372        Soybeans, Yellow, 1kg 2024-09-29   3.29    1kg   
352  8374627    White Beans, Canned, 400g 2025-06-17   1.19   400g   

    packagingUnit  available  discount  
0          punnet         48         0  
1             bag         23         0  
2          

In [14]:
pip install transformers torch




In [16]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the pre-trained BERT model for sequence classification
model_name = 'bert-base-uncased'  # You can choose other models from Hugging Face like distilbert-base-uncased
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)  # Adjust num_labels based on your categories


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Example product names
product_names = df["name"].tolist()

# Tokenize the input text using BERT tokenizer
inputs = tokenizer(product_names, padding=True, truncation=True, return_tensors="pt")


In [27]:
# Perform inference to get logits (raw predictions)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Get predicted category by taking the index of the highest logit value
predicted_categories = torch.argmax(logits, dim=1)

In [25]:
# Define category mapping
category_mapping = {0: 'Vegetables', 1: 'Bakery', 2: 'Dairy', 3: 'Meat', 4: 'Beverages'}

# Map predicted indices to category labels
predicted_labels = [category_mapping[idx.item()] for idx in predicted_categories]

# Add predicted categories to the DataFrame
df['predicted_category'] = predicted_labels

# Display the DataFrame with predicted categories
print(df)


          id                         name  expiresAt  price weight  \
0    8376291      Organic Cherry Tomatoes 2024-09-23   2.99   250g   
1    5728364               Sweet Potatoes 2024-10-15   1.79    1kg   
2    9126483             Broccoli Florets 2024-09-18   2.49   400g   
3    4537281             Red Bell Peppers 2024-12-02   1.29   500g   
4    1928374                 Baby Spinach 2024-09-27   3.99   150g   
..       ...                          ...        ...    ...    ...   
348  1728391       Mung Beans, Whole, 1kg 2025-01-14   4.99    1kg   
349  6283742      Fava Beans, Dried, 250g 2024-11-09   2.29   250g   
350  3847292  Adzuki Beans, Organic, 500g 2025-04-22   3.79   500g   
351  5928372        Soybeans, Yellow, 1kg 2024-09-29   3.29    1kg   
352  8374627    White Beans, Canned, 400g 2025-06-17   1.19   400g   

    packagingUnit  available  discount predicted_category  
0          punnet         48         0          Beverages  
1             bag         23         0 

In [29]:
# Lowercase product names
df['name'] = df['name'].str.lower()

In [32]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenize product names
inputs = tokenizer(df['name'].tolist(), padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Get mean of all token embeddings

# Use embeddings for clustering
from sklearn.cluster import KMeans

num_clusters = 5  # Adjust this based on the number of categories you want
kmeans = KMeans(n_clusters=num_clusters)
df['predicted_category'] = kmeans.fit_predict(embeddings.numpy())


  super()._check_params_vs_input(X, default_n_init=10)


In [33]:
print(df)

          id                         name  expiresAt  price weight  \
0    8376291      organic cherry tomatoes 2024-09-23   2.99   250g   
1    5728364               sweet potatoes 2024-10-15   1.79    1kg   
2    9126483             broccoli florets 2024-09-18   2.49   400g   
3    4537281             red bell peppers 2024-12-02   1.29   500g   
4    1928374                 baby spinach 2024-09-27   3.99   150g   
..       ...                          ...        ...    ...    ...   
348  1728391       mung beans, whole, 1kg 2025-01-14   4.99    1kg   
349  6283742      fava beans, dried, 250g 2024-11-09   2.29   250g   
350  3847292  adzuki beans, organic, 500g 2025-04-22   3.79   500g   
351  5928372        soybeans, yellow, 1kg 2024-09-29   3.29    1kg   
352  8374627    white beans, canned, 400g 2025-06-17   1.19   400g   

    packagingUnit  available  discount  predicted_category  
0          punnet         48         0                   0  
1             bag         23         

In [8]:
import pandas as pd
import random
import numpy as np

# Define a list of cities with their geo-locations (latitude, longitude)
locations = [
    {"city": "New York", "lat": 40.7128, "lon": -74.0060},
    {"city": "Los Angeles", "lat": 34.0522, "lon": -118.2437},
    {"city": "London", "lat": 51.5074, "lon": -0.1278},
    {"city": "Paris", "lat": 48.8566, "lon": 2.3522},
    {"city": "Tokyo", "lat": 35.6895, "lon": 139.6917},
    {"city": "Sydney", "lat": -33.8688, "lon": 151.2093},
    {"city": "Berlin", "lat": 52.5200, "lon": 13.4050},
    {"city": "Toronto", "lat": 43.651070, "lon": -79.347015},
    {"city": "Chicago", "lat": 41.8781, "lon": -87.6298},
    {"city": "San Francisco", "lat": 37.7749, "lon": -122.4194}
]

# Define a list of sample products
products = [
    "Organic Cherry Tomatoes", "Whole Wheat Bread", "Cheddar Cheese", "Ground Beef",
    "Fresh Spinach", "Greek Yogurt", "Almond Milk", "Chicken Breast",
    "Orange Juice", "Bananas", "Avocados", "Basmati Rice", "Eggs", "Butter"
]

# Define a list of random expiration dates
expiration_dates = pd.date_range(start="2024-09-01", end="2024-12-01", periods=10).strftime('%Y-%m-%d')

# Generate sample data for multiple locations
def generate_sample_data(num_entries=100):
    data = []

    for _ in range(num_entries):
        # Randomly pick a product, location, and expiration date
        product = random.choice(products)
        location = random.choice(locations)
        expiresAt = random.choice(expiration_dates)

        # Random price between $1.00 and $20.00
        price = round(random.uniform(1.00, 20.00), 2)

        # Random discount between 0% and 50%
        discount = round(random.uniform(0, 0.5), 2)

        # Random availability between 0 and 100 units
        available = random.randint(0, 100)

        # Create the product entry
        entry = {
            'id': random.randint(1000000, 9999999),  # Random product ID
            'name': product,
            'price': price,
            'discount': discount,  # Discount percentage
            'expiresAt': expiresAt,
            'available': available,
            'store_city': location['city'],
            'store_lat': location['lat'],
            'store_lon': location['lon']
        }

        data.append(entry)

    return pd.DataFrame(data)

# Generate 100 sample entries for multiple store locations
df = generate_sample_data(100)

# Show the first 10 rows of the generated data
print(df.head(10))

# Save the DataFrame to a CSV file (optional)
df.to_csv('sample_products_with_locations.csv', index=False)


        id                     name  price  discount   expiresAt  available  \
0  5442269                 Avocados   9.07      0.17  2024-10-31         53   
1  7013351              Almond Milk  16.71      0.04  2024-10-21         54   
2  3100540                     Eggs  16.95      0.19  2024-09-01         14   
3  1657817  Organic Cherry Tomatoes  10.93      0.48  2024-10-01         91   
4  6717444             Greek Yogurt  18.41      0.25  2024-11-20         38   
5  2820971             Orange Juice  16.45      0.04  2024-11-20         60   
6  3655739             Orange Juice  18.29      0.21  2024-09-21         82   
7  8725581        Whole Wheat Bread  11.04      0.43  2024-10-11         28   
8  7122560             Orange Juice  10.48      0.09  2024-09-21         95   
9  4938535             Greek Yogurt   3.52      0.47  2024-10-21         73   

      store_city  store_lat  store_lon  
0         Sydney   -33.8688   151.2093  
1    Los Angeles    34.0522  -118.2437  
2      

In [9]:
# Create a DataFrame for products
#df = pd.DataFrame(data)

# Function to filter products with discounts and their geo-location
def get_discounted_products_by_location(lat, lon, radius_km=50):
    # Filter products that have a discount
    discounted_products = df[df['discount'] > 0]

    # Define a simple distance check (you can use haversine formula for more accurate distance)
    def is_within_radius(store_lat, store_lon, user_lat, user_lon, radius_km):
        # Calculate approximate distance using latitude and longitude
        distance = ((store_lat - user_lat)**2 + (store_lon - user_lon)**2)**0.5
        # Convert to kilometers (this is a basic distance calculation)
        return distance * 111 <= radius_km  # 111 km per degree approx

    # Filter products based on location (within the given radius)
    nearby_products = discounted_products[
        discounted_products.apply(lambda row: is_within_radius(row['store_lat'], row['store_lon'], lat, lon, radius_km), axis=1)
    ]

    return nearby_products

# Example: Publishing the data of products having discount based on the store's location
user_location = (40.730610, -73.935242)  # Example user geo-location (NYC coordinates)
discounted_products = get_discounted_products_by_location(user_location[0], user_location[1])

# Publish the discounted products in JSON format
def publish_discounted_products(products_df):
    result = products_df[['name', 'price', 'discount', 'expiresAt', 'store_lat', 'store_lon']].to_json(orient='records')
    print("Publishing discounted products data: ")
    print(result)

# Publish data for the discounted products nearby the user's location
publish_discounted_products(discounted_products)

Publishing discounted products data: 
[{"name":"Whole Wheat Bread","price":1.75,"discount":0.27,"expiresAt":"2024-10-21","store_lat":40.7128,"store_lon":-74.006},{"name":"Eggs","price":6.5,"discount":0.26,"expiresAt":"2024-10-31","store_lat":40.7128,"store_lon":-74.006},{"name":"Bananas","price":4.24,"discount":0.22,"expiresAt":"2024-09-11","store_lat":40.7128,"store_lon":-74.006},{"name":"Chicken Breast","price":15.58,"discount":0.25,"expiresAt":"2024-10-21","store_lat":40.7128,"store_lon":-74.006},{"name":"Whole Wheat Bread","price":7.72,"discount":0.1,"expiresAt":"2024-09-21","store_lat":40.7128,"store_lon":-74.006},{"name":"Fresh Spinach","price":1.8,"discount":0.34,"expiresAt":"2024-12-01","store_lat":40.7128,"store_lon":-74.006},{"name":"Chicken Breast","price":18.88,"discount":0.3,"expiresAt":"2024-09-21","store_lat":40.7128,"store_lon":-74.006},{"name":"Ground Beef","price":14.36,"discount":0.24,"expiresAt":"2024-09-01","store_lat":40.7128,"store_lon":-74.006},{"name":"Ground B

In [11]:
import random
import pandas as pd

# Sample fun facts related to some products
product_facts = {
    "Organic Cherry Tomatoes": "Tomatoes were once considered poisonous in Europe!",
    "Whole Wheat Bread": "Whole wheat bread contains all parts of the grain, making it more nutritious!",
    "Cheddar Cheese": "Cheddar cheese originated in the English village of Cheddar, Somerset.",
    "Ground Beef": "Ground beef is the most versatile meat used in dishes like hamburgers and meatballs.",
    "Fresh Spinach": "Spinach is known for its high iron content, made famous by Popeye!",
    "Greek Yogurt": "Greek yogurt is thicker and creamier than regular yogurt because of its straining process.",
    "Almond Milk": "Almond milk is one of the oldest plant-based milks, dating back to medieval times.",
    "Chicken Breast": "Chicken breast is one of the leanest sources of protein!",
    "Orange Juice": "Orange juice is packed with vitamin C and was once a luxury in many parts of the world.",
    "Bananas": "Bananas are berries, but strawberries are not!",
    "Avocados": "Avocados are rich in healthy fats and are often referred to as a superfood.",
    "Basmati Rice": "Basmati rice has been grown in the foothills of the Himalayas for thousands of years.",
    "Eggs": "Eggs are a complete protein source containing all nine essential amino acids.",
    "Butter": "Butter is one of the oldest dairy products and was once used as currency!"
}



# Function to get discounted products
def get_discounted_products():
    return df[df['discount'] > 0]

# Select a random product with a discount and generate a fun fact
def publish_fun_fact_about_discounted_product():
    # Get discounted products
    discounted_products = get_discounted_products()

    # Choose a random product from the discounted list
    if not discounted_products.empty:
        selected_product = discounted_products.sample(1).iloc[0]
        product_name = selected_product['name']

        # Get the fun fact for the selected product (if available)
        fun_fact = product_facts.get(product_name, "This product doesn't have a fun fact yet, but it's amazing!")

        # Prepare the message to publish
        message = f"🛒 **Discounted Product of the Day**: {product_name}\n" \
                  f"💸 Price: ${selected_product['price']} (Discount: {int(selected_product['discount'] * 100)}%)\n" \
                  f"🎉 Fun Fact: {fun_fact}"

        # Simulate publishing the message (you can replace this with an API call, website update, etc.)
        print(message)
    else:
        print("No discounted products available today.")

# Run the function to publish a fun fact about a discounted product
publish_fun_fact_about_discounted_product()


🛒 **Discounted Product of the Day**: Bananas
💸 Price: $9.2 (Discount: 39%)
🎉 Fun Fact: Bananas are berries, but strawberries are not!
