# STEP 2: Build Classifier

In [1]:
from googletrans import Translator
import re, codecs, nltk, pickle
# nltk.download()
# Notice the pop-up window --> collections (tab) --> popular
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

In [2]:
def translate_sentence(s, translator=None):
    """Translate a sentence with googletrans and return the translated text.

    No source or destination language is passed to ``translate``, so the
    library's own defaults apply (this notebook feeds it Hebrew text and
    treats the result as English).

    Parameters
    ----------
    s : str
        Text to translate.
    translator : object, optional
        Any object exposing ``translate(text)`` whose result has a ``.text``
        attribute. When omitted a new ``Translator`` is created, which is
        what the original one-argument call did. Passing a shared instance
        avoids building a new client for every sentence.

    Returns
    -------
    str
        The translated text. Empty input is returned unchanged without
        contacting the translation service.
    """
    # Nothing to translate: skip the network round trip entirely.
    if not s:
        return s
    if translator is None:
        translator = Translator()
    return translator.translate(s).text

In [3]:
def clean_chat(chat):
    """Clean, translate and normalise every message of a chat.

    Each message goes through the same steps, in this order:

    1. every character outside the Hebrew range א-ת is replaced by a space
       and runs of whitespace are collapsed;
    2. the remaining text is passed to ``translate_sentence``;
    3. the translation is lower-cased and split on whitespace;
    4. every word is stemmed with NLTK's ``PorterStemmer``;
    5. stems found in NLTK's English stop-word list are dropped.

    Parameters
    ----------
    chat : iterable of str
        The raw messages.

    Returns
    -------
    list of str
        One space-joined string of remaining stems per input message, in the
        same order as the input.
    """
    # Loop-invariant helpers: build them once instead of once per message.
    # A set is used because membership tests on it are much faster than on
    # the list returned by stopwords.words().
    stops = set(stopwords.words("english"))
    porter = nltk.PorterStemmer()

    cleaned_messages = []
    for message in chat:
        # Keep Hebrew letters only, then normalise whitespace.
        letters_only = re.sub("[^א-ת]", " ", message)
        clean_s = " ".join(letters_only.split())
        trans_s = translate_sentence(clean_s)

        words = trans_s.lower().split()

        # NOTE(review): stemming happens BEFORE the stop-word filter, so a
        # stop word whose stem differs from its spelling is presumably not
        # removed. The order is kept on purpose so the output stays
        # compatible with artefacts already produced by this notebook —
        # confirm before swapping the two steps.
        stemmed_words = [porter.stem(w) for w in words]
        meaningful_words = [w for w in stemmed_words if w not in stops]

        cleaned_messages.append(" ".join(meaningful_words))
    return cleaned_messages

## read messages from step 1

In [20]:
# Load the messages written by step 1. Each file is read as UTF-8 text and
# split into a list with one entry per line.
man_chats, women_chats = [], []

with codecs.open("man.txt", encoding='utf-8') as man_file:
    man_chats = man_file.read().splitlines()

with codecs.open("woman.txt", encoding='utf-8') as woman_file:
    women_chats = woman_file.read().splitlines()

In [21]:
# Report how many messages were loaded for each group.
print('length of man chats = {}, length of woman chats = {}'.format(
    len(man_chats),
    len(women_chats),
))

length of man chats = 18643, length of woman chats = 5418


In [22]:
# Size of the smaller group: the number of messages kept per group.
number_of_sentence = min(map(len, (man_chats, women_chats)))

## Select random sentences

In [23]:
import random

# Fixed seed so a fresh kernel draws the same sample on every run.
SAMPLE_SEED = 42
sampler = random.Random(SAMPLE_SEED)

l = number_of_sentence

# Plain slice: it only truncates when the women list is the longer one.
slim_women_chats = women_chats[:l]

# Draw l messages without replacement in a single pass. This replaces the
# remove-one-at-a-time loop, which copied and scanned the list on every
# iteration, and it leaves man_chats itself untouched so the cell can be
# re-run safely.
slim_man_chats = sampler.sample(man_chats, l)

In [24]:
# Sanity check: both groups should now have the same number of messages.
print('length of man chats = {}, length of woman chats = {}'.format(
    len(slim_man_chats),
    len(slim_women_chats),
))

length of man chats = 5418, length of woman chats = 5418


## clean and translate chats

In [None]:
# Women: run the sampled messages through clean_chat, then pickle the
# resulting list to translated_woman.pkl.
clean_message_list_women = clean_chat(slim_women_chats)

with open('translated_woman.pkl', 'wb') as women_out:
    pickle.dump(clean_message_list_women, women_out)

In [26]:
# Men: run the sampled messages through clean_chat, then pickle the
# resulting list to translated_man.pkl.
clean_message_list_man = clean_chat(slim_man_chats)

with open('translated_man.pkl', 'wb') as men_out:
    pickle.dump(clean_message_list_man, men_out)

In [34]:
# Women's messages first, then men's. The label column is built in the same
# order, so the two must stay aligned.
clean_message_list = list(clean_message_list_women)
clean_message_list.extend(clean_message_list_man)

## create classification DF ('man' or 'women')

In [35]:
# One label per message, in the order of clean_message_list:
# l rows labelled 'women' followed by l rows labelled 'man'.
classification = ['women'] * l + ['man'] * l
class_df = pd.DataFrame({'Gender' : np.array(classification)})

In [36]:
def createBOW(clean_message_list):
    """Build a bag-of-words count matrix and persist its vocabulary.

    Fits a ``CountVectorizer`` (word analyzer, at most 1000 features) on the
    given messages, pickles the fitted vocabulary to ``voc.pkl`` in the
    current working directory, and returns the counts as a dense array.

    Parameters
    ----------
    clean_message_list : iterable of str
        Pre-processed messages, one string per message.

    Returns
    -------
    numpy.ndarray
        Dense count matrix with one row per message and one column per
        vocabulary term.
    """
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=1000)
    train_data_features = vectorizer.fit_transform(clean_message_list)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(). Prefer the new accessor and fall back to the
    # old one on releases that predate it. tolist() keeps the pickled object
    # a plain list of str, as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        voc = vectorizer.get_feature_names_out().tolist()
    else:
        voc = vectorizer.get_feature_names()

    # Save the vocabulary for future use.
    with open('voc.pkl', 'wb') as fid:
        pickle.dump(voc, fid)

    return train_data_features.toarray()

## create BOW

In [37]:
# Vectorise all cleaned messages. createBOW also writes voc.pkl as a side effect.
train_data_features = createBOW(clean_message_list=clean_message_list)

## split to train & test

In [38]:
# Random split: each row lands in the training set with probability 0.8 and in
# the test set otherwise. The mask comes from a seeded generator so the same
# split is drawn on every run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

ran = split_rng.rand(len(train_data_features)) < 0.8
train_sequence = train_data_features[ran]
test_sequence = train_data_features[~ran]
train_class = class_df.loc[ran, 'Gender']
test_class = class_df.loc[~ran, 'Gender']

## Naive Bayes
The Naive Bayes classifier is based on Bayes' theorem and is particularly well suited to inputs of high dimensionality. Despite its simplicity, Naive Bayes can often outperform more sophisticated classification methods.

In [39]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes classifier on the training rows of the
# bag-of-words matrix, with the gender labels as the response variable.
nb = GaussianNB().fit(train_sequence, train_class)

# Score the fitted model on the held-out test rows.
naive_bayes = nb.score(test_sequence, test_class)

naive_bayes

0.6708576560395151

## SVC
Support Vector Machines are based on the concept of decision planes that define decision boundaries. A decision plane is one that separates between a set of objects having different class memberships.

In [40]:
from sklearn import svm

# Train a support vector classifier (gamma = 0.001, C = 100, degree = 3) on
# the training rows of the bag-of-words matrix, with the gender labels as the
# response variable.
# NOTE(review): no kernel is passed, and degree is documented as a
# polynomial-kernel setting, so it presumably has no effect here — confirm.
svc = svm.SVC(C = 100, gamma = 0.001, degree = 3).fit(train_sequence, train_class)

# Score the fitted model on the held-out test rows.
support_vector = svc.score(test_sequence, test_class)

support_vector

0.7503367759317467

## Compare the two models
As we can see, SVC gives the better score, so we take this model forward to step 4.

## save the trained model for step 4

In [41]:
# Pickle the fitted SVC to classifier.pkl.
with open('classifier.pkl', 'wb') as model_out:
    pickle.dump(svc, model_out)