In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
import pickle

In [2]:
# Read the labeled phrases into a DataFrame.
# Later cells use its 'phrase' (text) and 'is_price' (label) columns.
DATASET_PATH = 'example_prices.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Ordered 80/20 hold-out: the first 80% of rows form the training set and the
# remaining rows form the test set. Rows are taken in file order (no shuffling).
train_size = int(len(df) * 0.8)
df_train, df_test = df.iloc[:train_size], df.iloc[train_size:]


In [4]:
# Bag-of-words features: learn the vocabulary from the training phrases only,
# then encode both splits with that same vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(df_train['phrase'])
X_train = vectorizer.transform(df_train['phrase'])
X_test = vectorizer.transform(df_test['phrase'])

In [5]:
# Load the trained Naive Bayes classifier from disk; if the file does not exist yet,
# train it on the training set and cache it so a fresh "Restart & Run All" still works.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
# NOTE(review): the vectorizer is refit from the CSV on every run and is not stored with
# the model, so a cached model may not match the current vocabulary if the CSV changed;
# delete the cached file to retrain -- confirm this is the intended workflow.
filename = "price_extract.sav"
try:
    # `with` guarantees the file handle is closed even if unpickling fails
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
except FileNotFoundError:
    y_train = df_train['is_price']
    model = MultinomialNB()
    model.fit(X_train, y_train)
    # Save the model to a file
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
model


In [6]:
# Evaluate the model on the test set.
# The labels come from the 'is_price' column of the held-out rows; `X_test` is the
# bag-of-words encoding of the same rows, so features and labels stay aligned.
y_test = df_test['is_price']
# For scikit-learn classifiers `score` reports mean accuracy on the given data
# (presumably the loaded model is the MultinomialNB trained above -- confirm).
accuracy = model.score(X_test, y_test)
# Printed rounded to two decimal places.
print(f'Test accuracy: {accuracy:.2f}')

Test accuracy: 0.95


In [7]:
# Preprocess the text to extract the phrases to classify
# Function to extract price-related phrases from text using regular expressions
def extract_prices(text):
    """Return the price-like phrases found in ``text``.

    A phrase is a currency marker or cue word (``rs``, ``$``, ``₹``, ``at``,
    ``for``, ``from``, ``@``, ``loot``) followed by an integer, e.g. ``'at 102'``.
    The patterns are lowercase and case-sensitive, so callers should lowercase
    the text first.

    Args:
        text: The (lowercased) text to scan.

    Returns:
        A list of matched substrings, grouped by pattern in the order the
        patterns are listed below (not by position in ``text``). Empty if
        nothing matches.
    """
    # Alphabetic cues must not be preceded by a letter; otherwise they match the
    # tail of ordinary words ("2 years 5" -> "rs 5", "flat 50" -> "at 50").
    not_in_word = r'(?<![a-z])'
    price_patterns = [
        not_in_word + r'rs\.?\s*\.?\d+',
        r'\$\d+',
        r'₹\d+',
        not_in_word + r'at\s*\d+',
        not_in_word + r'for\s*\d+',
        not_in_word + r'from\s*\d+',
        r'@\s*\d+',
        not_in_word + r'loot\s*\d+',
    ]
    prices = []
    for pattern in price_patterns:
        prices += re.findall(pattern, text)
    return prices

# Sample deal text to run through the pipeline
text = '''Talc Free Baby Powder + Moisturizing Cream at 102.'''
# Normalize the text (lowercase, strip commas), then pull out the candidate phrases
final_text = text.lower().replace(',', '')
phrases = extract_prices(final_text)
phrases

['at 102']

In [8]:
# Vectorize the extracted phrases using bag-of-words.
# Reuses the vectorizer fitted on the training phrases (transform only, no refit), so the
# feature columns are the same ones used for X_train / X_test.
X = vectorizer.transform(phrases)

In [9]:
# Use the model to classify the extracted phrases.
# scikit-learn estimators reject input with zero rows, so when no candidate phrase was
# extracted the predict call is skipped and an empty label array is used instead
# (sliced from the model's classes so it keeps the same array type and dtype).
predictions = model.predict(X) if X.shape[0] else model.classes_[:0]
predictions

array([1], dtype=int64)

In [10]:
# Extract and convert the prices
# Keep only the phrases the classifier labeled as a price (label 1), then take the
# first numeric token of each one.
number_pattern = re.compile(r'\d+(?:[.,]\d+)?')
price_phrases = [phrase for phrase, label in zip(phrases, predictions) if label == 1]
number_matches = [number_pattern.search(phrase) for phrase in price_phrases]
# TODO: convert each value to a common currency (e.g. USD) using a currency conversion API;
# for now the matched digits are kept as strings.
prices = [match.group() for match in number_matches if match]
print(f'Extracted and converted prices: {prices}')

Extracted and converted prices: ['102']
