In [32]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

In [85]:
buying_keywords = ["ဝယ်", "စျေး", "အော်ဒါ", "တင်", "ကြည့်", "ယူမယ်"]
not_buying_keywords = ["မဝယ်", "မလို", "မသေချာ", "စဉ်းစား"]

In [86]:
data = [
    ("ဒီဟာဝယ်မလား?", "buy"),
    ("ငါဈေးဝယ်ချင်တယ်", "buy"),
    ("ဒီမှာဘယ်လိုဝယ်ရမလဲ?", "buy"),
    ("ဈေးနှုန်းဘယ်လောက်လဲ?", "buy"),
    ("ဝယ်မယ်", "buy"),
    ("ငါဒီထုတ်ကုန်ဝယ်ချင်တယ်", "buy"),
    ("ငါဒီကို ဝယ်လိုက်ပြီ", "buy"),
    ("ငါအော်ဒါတင်လိုက်ပြီ", "buy"),
    
    ("မဝယ်သေးဘူး", "not buy"),
    ("စဉ်းစားမနေသေးဘူး", "not buy"),
    ("မလိုအပ်သေးဘူး", "not buy"),
    ("ဈေးကွက်ကြည့်နေတုန်းပဲ", "not buy"),
    ("မသေချာသေးဘူး", "not buy"),
    ("ဒီတစ်ခါမဝယ်သေးဘူး", "not buy"),
    ("ငါဝယ်ချင်တာမဟုတ်ဘူး", "not buy"),
    ("ဒီဟာမဝယ်ဘူး", "not buy"),
]

In [58]:
import re

In [87]:
def extract_keywords(text):
    words = re.findall(r'\w+', text)  # Split text into words
    return " ".join([word for word in words if word in buying_keywords or word in not_buying_keywords or len(word) > 2])  


In [88]:
texts, labels = zip(*data)

In [89]:
texts = [extract_keywords(text) for text in texts] 

In [90]:
print(texts)

['', '', 'ရမလ', '', '', '', '', '', 'မဝယ', '', '', '', '', 'မဝယ', '', 'မဝယ']


In [91]:
label_map = {"buy": 1, "not buy": 0}
y = np.array([label_map[label] for label in labels])

In [68]:
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(texts)

In [49]:
label_map = {"buy": 1, "not buy": 0}
y = np.array([label_map[label] for label in labels])

In [92]:
from sklearn.naive_bayes import MultinomialNB

In [99]:
# vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words=None)  # Remove stop_words=None
vectorizer = TfidfVectorizer(ngram_range=(1, 2) , strip_accents="unicode" , stop_words = None)  # Uses bigrams (two-word phrases)
X = vectorizer.fit_transform(texts)

classifier = MultinomialNB()
classifier.fit(X, y)

In [100]:
# Train a simple classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [101]:
def predict_intent(user_input):
    user_input_processed = extract_keywords(user_input)  # Keep only relevant words
    user_input_vectorized = vectorizer.transform([user_input_processed])
    prediction = classifier.predict(user_input_vectorized)[0]
    return "Interested in Buying 🛒" if prediction == 1 else "Not Interested ❌"

In [102]:
test_sentences = [
    "ငါဝယ်မယ်",  # Expected: Interested
    "ဒီပစ္စည်းစျေးဘယ်လောက်လဲ?",  # Expected: Interested
    "မလိုအပ်သေးဘူး",  # Expected: Not Interested
    "ငါစဉ်းစားမနေသေးဘူး",  # Expected: Not Interested
]

In [103]:
for sentence in test_sentences:
    print(f"User Input: {sentence} → Prediction: {predict_intent(sentence)}")

User Input: ငါဝယ်မယ် → Prediction: Not Interested ❌
User Input: ဒီပစ္စည်းစျေးဘယ်လောက်လဲ? → Prediction: Not Interested ❌
User Input: မလိုအပ်သေးဘူး → Prediction: Not Interested ❌
User Input: ငါစဉ်းစားမနေသေးဘူး → Prediction: Not Interested ❌


In [105]:
predictions = classifier.predict(X)
print("Predicted Labels:", predictions)
print("Actual Labels:", y)


Predicted Labels: [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
Actual Labels: [1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]


{'is', 'as', 'ours', 'all', "aren't", 's', 'shan', 'being', 'above', 'before', 'both', 'my', 'on', 'our', 'how', 'does', 'isn', 'were', "needn't", 'when', 'wouldn', 'most', 'm', 'or', "that'll", 'hasn', 'if', 'now', 'has', "mightn't", "they'd", 'until', 'so', 'why', 'off', 'just', 'be', 'whom', 'this', "isn't", 'here', 'was', 'at', 'o', 'she', 'in', "mustn't", 'under', 'can', 'over', "you're", "we've", 'very', "shan't", "wouldn't", 'between', 'have', 'against', 'after', 'up', 'me', 'into', 'to', 'an', 'you', 'those', "we'd", 'needn', 'didn', 'had', "we'll", 'do', 'out', 'down', "he'll", 'mustn', 'hers', 'won', 'doing', 'herself', "it's", 'i', 'for', 've', 'there', 'too', 'himself', "i'm", 'same', 'each', 'of', 'no', 'wasn', 'hadn', "i've", 'few', 'and', 'because', "doesn't", "they'll", 'yourself', "i'd", "didn't", 'they', 'from', 'doesn', 'only', 'by', "he'd", 'her', "he's", "it'd", 'him', "won't", 'the', 'its', 'ma', 'we', 'mightn', 'ain', 'about', 'ourselves', 'theirs', 'having', "th

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/yewinnaing/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/yewinnaing/nltk_data...


True

['queen', 'camilla', 'crown', 'alongsid', 'huge', 'parad', 'back', 'buckingham', 'palac']


['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [25]:
# tag each word with part of speech
# pos_tag(words)

In [26]:
"""
POS

CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

"""

'\nPOS\n\nCC: It is the conjunction of coordinating\nCD: It is a digit of cardinal\nDT: It is the determiner\nEX: Existential\nFW: It is a foreign word\nIN: Preposition and conjunction\nJJ: Adjective\nJJR and JJS: Adjective and superlative\nLS: List marker\nMD: Modal\nNN: Singular noun\nNNS, NNP, NNPS: Proper and plural noun\nPDT: Predeterminer\nWRB: Adverb of wh\nWP$: Possessive wh\nWP: Pronoun of wh\nWDT: Determiner of wp\nVBZ: Verb\nVBP, VBN, VBG, VBD, VB: Forms of verbs\nUH: Interjection\nTO: To go\nRP: Particle\nRBS, RB, RBR: Adverb\nPRP, PRP$: Pronoun personal and professional\n\n'