In [6]:
# Install non-existing libraries
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# Import necessary libraries
from google.colab import files
import pandas as pd
from sklearn.model_selection import train_test_split
from hazm import Normalizer, word_tokenize
import numpy as np
import math
from collections import Counter
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score
from itertools import combinations

In [8]:
# Upload kaggle api from local machine to Google Colab
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"tohidabdi98","key":"f1a9f6b27a958d9cd3ea2b7476464cca"}'}

In [9]:
# Authentication credentials for accessing the Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [10]:
# Download the dataset
!kaggle datasets download -d soheiltehranipour/snappfood-persian-sentiment-analysis
!unzip /content/snappfood-persian-sentiment-analysis.zip

Downloading snappfood-persian-sentiment-analysis.zip to /content
  0% 0.00/3.16M [00:00<?, ?B/s]
100% 3.16M/3.16M [00:00<00:00, 135MB/s]
Archive:  /content/snappfood-persian-sentiment-analysis.zip
  inflating: Snappfood - Sentiment Analysis.csv  


In [None]:
# Read csv file
df = pd.read_csv('/content/Snappfood - Sentiment Analysis.csv', sep=None, engine='python')

In [12]:
# Calculate the required number of rows for each class
num_rows_class_sad = int(0.2 * df['label_id'].value_counts()[1])
num_rows_class_happy = int(0.2 * df['label_id'].value_counts()[0])

# Select the required number of rows from each class
df_sampled = df.groupby('label_id').apply(lambda x: x.sample(n=num_rows_class_sad if x['label_id'].iloc[0] == 1 else num_rows_class_happy))

# Concatenate the selected rows into a new dataframe
df_selected = pd.concat([df_sampled.loc[1], df_sampled.loc[0]])

In [13]:
# Create a normalizer object from the hazm library
normalizer = Normalizer()

# Define a function to normalize and tokenize Persian strings
def normalize_and_tokenize(text):
    # Normalize the text
    normalized_text = normalizer.normalize(text)
    
    # Tokenize the normalized text
    tokens = word_tokenize(normalized_text)
    
    # Return the tokens as a string
    return tokens

# Apply the function to the 'text' column of the dataframe
df_selected['normalized_tokens'] = df_selected['comment'].apply(normalize_and_tokenize)

# Display the results
df_selected.head()


Unnamed: 0.1,Unnamed: 0,comment,label,label_id,normalized_tokens
25097,,متاسفانه در ساندویچ کاهوی خراب پیدا شد. لطفا ک...,SAD,1.0,"[متاسفانه, در, ساندویچ, کاهوی, خراب, پیدا, شد,..."
17956,,دیدم یه نفر نوشته بود که شیرینی‌های مونده می‌ف...,SAD,1.0,"[دیدم, یه, نفر, نوشته_بود, که, شیرینی‌های, مون..."
59404,,درسته که غذاتون کیفیتش خوبه اما متاسفانه سرد ب...,SAD,1.0,"[درسته, که, غذاتون, کیفیتش, خوبه, اما, متاسفان..."
52589,,خوشمزه بودش ولی سس خیلی زیادش باعث می‌شه مزه‌ی...,SAD,1.0,"[خوشمزه, بودش, ولی, سس, خیلی, زیادش, باعث, می‌..."
54736,,دوبار قبلا این غذا رو مادرم گرفته بود که خیلی ...,SAD,1.0,"[دوبار, قبلا, این, غذا, رو, مادرم, گرفته_بود, ..."


In [14]:
# Split the DataFrame into train and test sets, using 10% for testing
train_df, test_df = train_test_split(df_selected, test_size=0.1)

In [15]:
def compute_tf_idf(docs):
    N = len(docs)
    vocab = set()
    for row in train_df['normalized_tokens']:
        vocab.update(row)
    vocab = list(vocab)
    tf_idf = np.zeros((N, len(vocab)))
    for doc_id, doc in enumerate(docs):
        word_count = Counter(doc)
        total_words = len(doc)
        for word, count in word_count.items():
            # Compute term frequency (TF)
            tf = count / total_words
            
            # Compute inverse document frequency (IDF)
            doc_count = sum(1 for d in docs if word in d)
            idf = math.log(N / doc_count)
            
            # Compute TF-IDF score
            score = tf * idf

            # Store TF-IDF score in matrice
            try:
                index = vocab.index(word)
                tf_idf[doc_id][index] = score
            except:
                pass
    
    return tf_idf

In [16]:
tf_idf_train = compute_tf_idf(train_df['normalized_tokens'])

In [17]:
tf_idf_test = compute_tf_idf(test_df['normalized_tokens'])

In [18]:
# Fit Gaussian naive bayes classifier
classifier_GNB = GaussianNB()
classifier_GNB.fit(tf_idf_train, train_df['label_id'])

# Predict test labels with model
y_pred_GNB = classifier_GNB.predict(tf_idf_test)

# Evaluation
recall_GNB = recall_score(test_df['label_id'], y_pred_GNB)
precision_GNB = precision_score(test_df['label_id'], y_pred_GNB)
f1_GNB = f1_score(test_df['label_id'], y_pred_GNB)
print('Recall = ' + str(recall_GNB) + '\nPrecision = ' + str(precision_GNB) + '\nF1 = ' + str(f1_GNB))

Recall = 0.3805436337625179
Precision = 0.6683417085427136
F1 = 0.48495897903372825


In [19]:
# Fit Multinomial naive bayes classifier
MNBclassifier = MultinomialNB()
MNBclassifier.fit(tf_idf_train, train_df['label_id'])

# Predict test labels with model
y_pred_MNB = MNBclassifier.predict(tf_idf_test)

# Evaluation
recall_MNB = recall_score(test_df['label_id'], y_pred_MNB)
precision_MNB = precision_score(test_df['label_id'], y_pred_MNB)
f1_MNB = f1_score(test_df['label_id'], y_pred_MNB)
print('Recall = ' + str(recall_MNB) + '\nPrecision = ' + str(precision_MNB) + '\nF1 = ' + str(f1_MNB))

Recall = 0.894134477825465
Precision = 0.7783312577833126
F1 = 0.8322237017310252


In [20]:
def compute_ppmi(docs):

    # Step 1: Compute vocabulary
    vocab = set()
    for row in train_df['normalized_tokens']:
        vocab.update(row)
    
    # Step 1: Count word occurrences
    word_counts = Counter()
    for row in train_df['normalized_tokens']:
        word_counts.update(row)

    # Step 2: Create co-occurrence matrix
    co_occurrence = np.zeros((len(vocab), len(vocab)))
    for row in docs:
        for word1, word2 in combinations(row, 2):
            try:
                idx1 = list(vocab).index(word1)
                idx2 = list(vocab).index(word2)
                co_occurrence[idx1, idx2] += 1
                co_occurrence[idx2, idx1] += 1
            except:
                pass

    # Step 3: Calculate PPMI
    n = np.sum(co_occurrence)
    ppmi = np.zeros((len(vocab), len(vocab)))
    for i in range(len(vocab)):
        for j in range(i+1, len(vocab)):
            if co_occurrence[i,j] == 0:
                continue
            pmi = np.log2((co_occurrence[i,j] * n) / (word_counts[list(vocab)[i]] * word_counts[list(vocab)[j]]))
            ppmi[i,j] = max(pmi, 0)
            ppmi[j,i] = max(pmi, 0)
    
    return ppmi, vocab

In [21]:
ppmi_coocurrence_matrix, vocab = compute_ppmi(train_df['normalized_tokens'])

In [22]:
# Transform the training data
X_train_transformed = []
for sentence in train_df['comment']:
    words = sentence.split()
    vector = np.zeros(len(vocab))
    for i, word in enumerate(words):
        if word in vocab:
            vector[list(vocab).index(word)] += 1
    X_train_transformed.append(ppmi_coocurrence_matrix.dot(vector))
ppmi_train = np.array(X_train_transformed)

# Transform the testing data
X_test_transformed = []
for sentence in test_df['comment']:
    words = sentence.split()
    vector = np.zeros(len(vocab))
    for i, word in enumerate(words):
        if word in vocab:
            vector[list(vocab).index(word)] += 1
    X_test_transformed.append(ppmi_coocurrence_matrix.dot(vector))
ppmi_test = np.array(X_test_transformed)


In [23]:
# Fit Gaussian naive bayes classifier
classifier_GNB = GaussianNB()
classifier_GNB.fit(ppmi_train, train_df['label_id'])

# Predict test labels with model
y_pred_GNB = classifier_GNB.predict(ppmi_test)

# Evaluation
recall_GNB = recall_score(test_df['label_id'], y_pred_GNB)
precision_GNB = precision_score(test_df['label_id'], y_pred_GNB)
f1_GNB = f1_score(test_df['label_id'], y_pred_GNB)
print('Recall = ' + str(recall_GNB) + '\nPrecision = ' + str(precision_GNB) + '\nF1 = ' + str(f1_GNB))

Recall = 0.2832618025751073
Precision = 0.5981873111782477
F1 = 0.3844660194174757


In [24]:
# Fit Multinomial naive bayes classifier
MNBclassifier = MultinomialNB()
MNBclassifier.fit(ppmi_train, train_df['label_id'])

# Predict test labels with model
y_pred_MNB = MNBclassifier.predict(ppmi_test)

# Evaluation
recall_MNB = recall_score(test_df['label_id'], y_pred_MNB)
precision_MNB = precision_score(test_df['label_id'], y_pred_MNB)
f1_MNB = f1_score(test_df['label_id'], y_pred_MNB)
print('Recall = ' + str(recall_MNB) + '\nPrecision = ' + str(precision_MNB) + '\nF1 = ' + str(f1_MNB))

Recall = 0.7253218884120172
Precision = 0.7201704545454546
F1 = 0.7227369921596578
