In [1]:
import pandas
import string
from hazm import word_tokenize, stopwords_list
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from warnings import filterwarnings

filterwarnings('ignore')

# Import dataset and print its 5 first rows:

In [2]:
data = pandas.read_csv('divar_posts_dataset.csv', index_col=0)
data.head()

Unnamed: 0,archive_by_user,brand,cat1,cat2,cat3,city,created_at,desc,id,image_count,mileage,platform,price,title,type,year
0,False,,for-the-home,furniture-and-home-decore,sofa-armchair,Tehran,Tuesday 07PM,کلاسیک و شیک و استثنایی\nچرم مالزی\nچوب راش\nف...,54761640000000.0,5.0,,mobile,3850000.0,ست مبلمان و نهارخوری ٩ نفره,,
1,False,,for-the-home,furniture-and-home-decore,antiques-and-art,Mashhad,Tuesday 07PM,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",42727630000000.0,1.0,,mobile,30000.0,گلدون مصنوعی نخل,,
2,False,,vehicles,cars,heavy,Mashhad,Tuesday 07PM,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,63194440000000.0,4.0,,mobile,-1.0,لودر کاتر پیلار 950,,
3,False,,for-the-home,furniture-and-home-decore,sofa-armchair,Tehran,Tuesday 07PM,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,19133030000000.0,4.0,,mobile,600000.0,مبل راحتی هفت نفره بامیز جلو مبلی,,
4,False,,personal,baby-and-toys,personal-toys,Karaj,Tuesday 08PM,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,58999000000000.0,2.0,,mobile,450000.0,ماشین شارژی,,


# Concatenate "cat" columns for using in classifier algorithm

In [3]:
data['cat1'] = data['cat1'].fillna('na')
data['cat2'] = data['cat2'].fillna('na')
data['cat3'] = data['cat3'].fillna('na')
data['desc'] = data['desc'].fillna('')
data['title'] = data['title'].fillna('')
data['cats'] = data.cat1 + '_' + data.cat2 + '_' + data.cat3

# A function for preprocessing persian text
Usually people won't insert spaces between a word and a digit, or between a Persian word and a English one.
This function fixes this cases in our text.

In [4]:
def separator(elem, start=1):
    latin = string.ascii_lowercase + string.digits + '$'
    for ind in range(start, len(elem)):
        if elem[ind] == ' ':
            continue
        a = elem[ind-1] in latin
        b = elem[ind] in latin
        if a != b:
            return separator(elem[:ind] + ' ' + elem[ind:], ind + 2)
    return elem

# Vectorizing the text
Now, we vectorize the text with count vectorizer described in slides. We set the size of the vocabulary to 110K (a manually tuned number).

In [5]:
vectorizer = CountVectorizer(preprocessor=separator,
                             tokenizer=word_tokenize, stop_words=stopwords_list(),
                             max_features=110000)
counts = vectorizer.fit_transform(data.title.values)

# Classifying using naive Bayes classifier
We'll use cross validation to measure the accuracy of our model. Lidstone/Laplace smoothing parameter set to 0.06 (should be manually tuned :D).

In [6]:
classifier = MultinomialNB(alpha=0.06, fit_prior=False)
targets = data.cats.values
scores = cross_val_score(classifier, counts, targets, cv=5, n_jobs=-1)
print('Average accuracy of the model using 5-fold cross validaion: %f' % scores.mean())

Average accuracy of the model using 5-fold cross validaion: 0.808474
