In [1]:
import numpy as np
import pandas as pd
from __future__ import unicode_literals
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold 
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from hazm import *

In [2]:
train = pd.read_csv('datasets/train.csv')

print("train shape: {}".format(train.shape))
train.head(10)

train shape: (3048, 3)


Unnamed: 0,id,query,label
0,0,شرایط حذف ترم چیه؟,1
1,1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2
2,2,بوفه برداران تا ساعت چند باز است؟,2
3,3,کمترین تعداد واحد چند عدد است؟,1
4,4,سنگ جامد است,5
5,5,سرورای دانشکده مشکل دارن؟,3
6,6,کلاس آزمایشگاه فیزیک در دانشکده خودمان برگزار ...,2
7,7,شرایط حذف پزشکی چیه؟,1
8,8,در شرایطی ساعت و روز کلاسی جابجا می شود؟,1
9,9,سطل آشغال در کلاس 101 وجود ندارد.,4


In [3]:
class Preprocess:
    stop_words=['تر','و','در','به','از','که','این','را','با','است','برای','آن','یک','خود','تا','کرد','بر','هم','نیز','گفت','وی','شد','دارد','ما','اما','یا','شده','باید','هر','آنها','بود','او','یگر','دو','مورد','شود','کند','وجود','بین','پیش','شده_است','پس','نظر','اگر','همه','یکی','حال','هستند','من','کنند','نیست','باشد','چه','بی','می','بخش','همین','افزود','هایی','دارند','راه','همچنین','روی','داد','بیشتر','بسیار','سه','داشت','چند','سوی','تنها','هیچ','میان','اینکه','شدن','بعد','جدید','ولی','حتی','کردن','برخی','کردند','اول','نه','کرده_است','نسبت','بیش','شما','چنین','طور','افراد','تمام','درباره','بار','بسیاری','کرده','چون','ندارد','دوم','بزرگ','طی','حدود','همان','بدون','البته','آنان','دیگری','خواهد_شد','کنیم','قابل','یعنی','رشد','وارد','کل','ویژه','قبل','براساس','نیاز','گذاری','هنوز','لازم','سازی','بوده_است','چرا','وقتی','گرفت','کم','جای','حالی','تغییر','پیدا','اکنون','تحت','باعث','مدت','فقط','زیادی','تعداد','آیا','بیان','رو','شدند','عدم','کرده_اند','بودن','نوع','بلکه','جاری','دهد','برابر','مهم','بوده','اخیر','مربوط','امر','زیر','گیری','شاید','خصوص','آقای','اثر','کننده','بودند','فکر','کنار','اولین','سوم','سایر','کنید','ضمن','مانند','باز','ممکن','حل','دارای','پی','مثل','اجرا','دور','منظور','کسی','موجب','طول','امکان','آنچه','تعیین','گفته','شوند','جمع','خیلی','علاوه','گونه','تاکنون','رسید','ساله','گرفته','شده_اند','علت','چهار','داشته_باشد','خواهد_بود','طرف','تهیه','تبدیل','مناسب','زیرا','مشخص','نزدیک','جریان','روند','بنابراین','یافت','نخستین','بالا','پنج','ریزی','عالی','چیزی','نخست','بیشتری','ترتیب','شده_بود','خاص','خوبی','خوب','شروع','فرد','کامل','غیر','دهند','آخرین','دادن','جدی','بهترین','شامل','گیرد','بخشی','باشند','تمامی','بهتر','داده_است','حد','نبود','کسانی','داریم','علیه','دانست','ناشی','داشتند','دهه','ایشان','آنجا','گرفته_است','دچار','لحاظ','آنکه','داده','بعضی','هستیم','اند','برداری','نباید','نشست','سهم','همیشه','آمد','اش','وگو','حداقل','طبق','جا','خواهد_کرد','نوعی','چگونه','رفت','هنگام','فوق','روش','ندارند','سعی','بندی','شمار','کلی','کافی','مواجه','همچنان','زیاد','سمت','کوچک','داشته_است','چیز','پشت','آورد','حالا','روبه','دادند','عهده','نیمه','جایی','دیگران','سی','بروز','یکدیگر','آمده_است','جز','کنم','سپس','کنندگان','خودش','چیه','چیست','همواره','یافته','شان','صرف','رسیدن','چهارم','یابد','متر','ساز','داشته','کرده_بود','باره','نحوه','کردم','تو','شخصی','داشته_باشند','محسوب','پخش','کمی','متفاوت','سراسر','کاملا','داشتن','نظیر','آمده','گروهی','فردی','ع','همچون','خطر','خویش','کدام','دسته','سبب','عین','آوری','متاسفانه','بیرون','دار','ابتدا','شش','افرادی','سالهای','درون','نیستند','یافته_است','تو','هام','پر','خاطرنشان','گاه','','جمعی','اغلب','دوباره','لذا','زاده','فر','گردد','اینجا']
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False)
    informal_normalizer = InformalNormalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    
    def remove_special_chars(self,text):
        chars=['0','1','2','3','4','5','6','7','8','9','"',"'",'=','@','&','%','.',',',':','\\','$','^','<','>','!','؟','،','.',':','!','?','{','}',';','\n','\t','(',')','[',']','/','*','+','#','\u200c','\ufeff','-','_','|','۱','۲','۳','۴','۵','۶','۷','۸','۹','۰','،']
        for item in chars:
            text=text.replace(item,"")
        return text 
    
    def clean_text(self, text):
        cleaned_text = self.normalizer.punctuation_spacing(text)
        cleaned_text = self.normalizer.affix_spacing(cleaned_text)
        cleaned_text = self.normalizer.character_refinement(cleaned_text)
        cleaned_text = self.remove_special_chars(cleaned_text)
        cleaned_text = self.normalizer.normalize(cleaned_text)
        
        
        return cleaned_text

    def make_tokens(self, text):
        tokenized = [self.lemmatizer.lemmatize(token)
                    for token in self.tokenizer.tokenize(text) 
                    if token not in self.stop_words]
        return tokenized


# Random Forest Classifier

The following are the basic steps involved in performing the random forest algorithm:

1.Pick N random records from the dataset.

2.Build a decision tree based on these N records.

3.Choose the number of trees you want in your algorithm and repeat steps 1 and 2.

4.The final value can be calculated by taking the average of all the values predicted by all the trees in forest. Or, in case of a classification problem, each tree in the forest predicts the category to which the new record belongs. Finally, the new record is assigned to the category that wins the majority vote.

### Advantages of using Random Forest
As with any algorithm, there are advantages and disadvantages to using it.

1.The random forest algorithm is not biased, since, there are multiple trees and each tree is trained on a subset of data. Basically, the random forest algorithm relies on the power of "the crowd"; therefore the overall biasedness of the algorithm is reduced.

2.This algorithm is very stable. Even if a new data point is introduced in the dataset the overall algorithm is not affected much since new data may impact one tree, but it is very hard for it to impact all the trees.

3.The random forest algorithm works well when you have both categorical and numerical features.

4.The random forest algorithm also works well when data has missing values or it has not been scaled well

## Finding TFIDF
The bag of words approach works fine for converting text to numbers. However, it has one drawback. It assigns a score to a word based on its occurrence in a particular document. It doesn't take into account the fact that the word might also be having a high frequency of occurrence in other documents as well. TFIDF resolves this issue by multiplying the **term frequency of a word** by the **inverse document frequency**. The TF stands for "Term Frequency" while IDF stands for "Inverse Document Frequency".

#### Term frequency is calculated as:

Term frequency = (Number of Occurrences of a word)/(Total words in the document)

#### Inverse Document Frequency is calculated as:

IDF(word) = Log((Total number of documents)/(Number of documents containing the word))

In [4]:
preprocess = Preprocess()
train['query'] = train.apply(lambda x: preprocess.clean_text(x['query']) , axis = 1)

def k_fold_validation(k, data):
    
    kfold = KFold(n_splits=k, random_state=0)
    score = []
 
    for train_index , test_index in kfold.split(train[['query']]):
    
        X_train , X_test = train['query'].iloc[train_index] , train['query'].iloc[test_index]
        y_train , y_test = train['label'].iloc[train_index] , train['label'].iloc[test_index]
    
        vectorizer = TfidfVectorizer(min_df= 3, stop_words=preprocess.stop_words, sublinear_tf=True)
        features = vectorizer.fit_transform(X_train)
        
        classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
        classifier.fit(features, y_train) 

        y_pred = classifier.predict(vectorizer.transform(X_test))
        score.append(precision_recall_fscore_support(y_pred , y_test, average='weighted'))
   
    avg_prec_score = sum([ i[0] for i in score])/k 
    avg_recall_score = sum([ i[1] for i in score])/k 
    avg_fscore_score = sum([ i[2] for i in score])/k 
    return avg_prec_score, avg_recall_score, avg_fscore_score

## Random Forest Classifier

In [5]:
avg_prec_score, avg_recall_score, avg_fscore_score = k_fold_validation(3,train)
print('Avg precision : {}'.format(avg_prec_score))
print('Avg recall : {}'.format(avg_recall_score))
print('Avg fscore : {}'.format(avg_fscore_score))



Avg precision : 0.7121474777896198
Avg recall : 0.7119422572178477
Avg fscore : 0.7091010343052379


# Comparing Random Forest and Naive Bayes Classifier
Random Forest Classifier gained less metrics scores in comparison to Naive Bayes classifier.
Actually it sounds that porecessing on persian language has an impact on this difference because in naive bayes algorithm some processes had been done on our data wich it causes to have sutable data for training. Althought Random Forest Classifier is a good choice for this project but as it works with lots of random selected parts for making decision trees it may cause to have a litle more faults in predicting the test data. On the other hand naive bayes classifier works based on Bayes' Theorem and calculate a probability for containing to each class.


## Logistic Regression Classifier
Logistic regression is a regression model. The model builds a regression model to predict the probability that a given data entry belongs to the category numbered as “1”. Just like Linear regression assumes that the data follows a linear function

In [6]:
from sklearn.linear_model import LogisticRegression
def k_fold_validation_logistic(k, data):
    
    kfold = KFold(n_splits=k, random_state=None)
    score = []
 
    for train_index , test_index in kfold.split(train[['query']]):
    
        X_train , X_test = train['query'].iloc[train_index] , train['query'].iloc[test_index]
        y_train , y_test = train['label'].iloc[train_index] , train['label'].iloc[test_index]
    
        vectorizer = TfidfVectorizer(min_df= 3, stop_words=preprocess.stop_words, sublinear_tf=True, norm='l2')
        features = vectorizer.fit_transform(X_train)
        
        clf = LogisticRegression(random_state=0).fit(features, y_train)
        y_pred = clf.predict(vectorizer.transform(X_test))
        
        score.append(precision_recall_fscore_support(y_pred , y_test, average='weighted'))
   
    avg_prec_score = sum([ i[0] for i in score])/k 
    avg_recall_score = sum([ i[1] for i in score])/k 
    avg_fscore_score = sum([ i[2] for i in score])/k 
                             
    return avg_prec_score, avg_recall_score, avg_fscore_score
                             

                             
                             
avg_prec_score, avg_recall_score, avg_fscore_score = k_fold_validation_logistic(3,train)
print('Avg precision : {}'.format(avg_prec_score))
print('Avg recall : {}'.format(avg_recall_score))
print('Avg fscore : {}'.format(avg_fscore_score))                            

Avg precision : 0.7420202344563722
Avg recall : 0.7355643044619423
Avg fscore : 0.7369400561302211
