In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from parsivar import Normalizer, Tokenizer
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load

# Alternative dataset, kept for quick switching:
# data = pd.read_csv("LDPSA.csv")
data = pd.read_csv("Snappfood.csv")

# One stop word per line; surrounding whitespace and the newline are stripped.
with open('Pesian_Stop_Words_List.txt', 'r', encoding='utf-8') as file:
    stop_words = list(map(str.strip, file))


In [6]:
def preprocess(texts):
    """Normalize texts and remove stop words.

    Each text is normalized with parsivar's ``Normalizer`` (constructed with
    ``pinglish_conversion_needed=True``), split into sentences with
    ``Tokenizer.tokenize_sentences``, and every whitespace-separated token
    that appears in the module-level ``stop_words`` collection is dropped.
    The remaining tokens of each sentence, and then the sentences themselves,
    are re-joined with single spaces.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input text, in input order.
    """
    normalizer = Normalizer(pinglish_conversion_needed=True)
    tokenizer = Tokenizer()

    # Build the lookup once per call: membership tests against the original
    # list are a linear scan for every token, a frozenset makes them O(1).
    stop_word_set = frozenset(stop_words)

    processed_texts = []
    for text in texts:
        normalized_text = normalizer.normalize(text)
        sentences = tokenizer.tokenize_sentences(normalized_text)

        # Filter each sentence independently, then stitch them back together.
        filtered_sentences = [
            ' '.join(
                word for word in sentence.split() if word not in stop_word_set
            )
            for sentence in sentences
        ]
        processed_texts.append(' '.join(filtered_sentences))

    return processed_texts

In [None]:
# Clean the raw comments and pick up the matching labels.
comments = preprocess(data['comment'])
y = data['label_id']

# Bag-of-words token counts over the cleaned comments.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(comments)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Persist both artifacts: the vectorizer is needed to encode text for the model.
dump(vectorizer, 'vectorizer.joblib')
dump(model, 'model.joblib')

['vectorizer.joblib']

In [33]:
# Accuracy on the held-out split.
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy:.4f}')

# Score one hand-written comment through the preprocess -> vectorize -> model path.
new_text = ['پیشنهاد نمیکنم']
new_text_processed = preprocess(new_text)
new_text_vector = vectorizer.transform(new_text_processed)
predict = model.predict_proba(new_text_vector)

# First probability column is reported as positive, second as negative.
# NOTE(review): assumes label_id 0 is the positive class -- confirm against the dataset.
positive, negative = (round(probability, 2) for probability in predict[0][:2])

print(f'Prediction for "{new_text[0]}": \n Negative: {negative} \n Positive: {positive}')

Accuracy: 0.8325
Prediction for "پیشنهاد نمیکنم": 
 Negative: 0.73 
 Positive: 0.27
