In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

### 1) Data Preparation 

In [2]:
# Read the Semeval data and return a dataframe
def read_semeval_data(filename):
    '''
    Description: reads Semantic Evaluation XML dataset and converts into a 
                 dataframe
    Arguments:
                 filename: string with file path (including filename)
    Returns :    pandas data frame
    Important:   this function only puts positive and neutral reviews in dataframe                          
    '''
    f = open(filename, 'r')
    raw_training_data = f.read()
    xmldoc = BeautifulSoup(raw_training_data,'lxml-xml')
    sentences = xmldoc.Reviews.find_all('sentences')
    opinions = xmldoc.Reviews.find_all('Opinions')
    reviews = []
    for i in range(0,len(sentences)):
        record = {}
        entity_aspect_pairs = opinions[i].find_all('Opinion')
        for ea_pair in entity_aspect_pairs:
            ea = ea_pair.attrs['category']
            polarity = ea_pair.attrs['polarity']
            if(polarity == 'positive'):
                record[ea] = 1
            elif(polarity == 'negative'):
                record[ea] = -1
            else:
                record[ea] = 0           
        record['TEXT'] = sentences[i].get_text()
        reviews.append(record)
    #Create a dataframe
    df=pd.DataFrame(reviews)
    #Change order of the columns so that text appears first
    cols = df.columns.tolist()
    cols.sort()
    cols.reverse()
    df = df[cols]
    df.fillna(0, inplace=True)
    return df

entity_labels = ['Food','Drinks','Service','Ambience','Location','Restaurant']
<br/>attributes_labels = ['General','Prices','Quality','Style&Options','Miscellaneous']

##### Possible Combinations of Entities and Attributes #####

<img src='images/entity_attributes_combinations.jpg' style='width:50;height:50'>

In [3]:
df_training = read_semeval_data('data/train.xml')
df_testing = read_semeval_data('data/test.xml')
df_testing.head()

Unnamed: 0,TEXT,SERVICE#GENERAL,RESTAURANT#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,LOCATION#GENERAL,FOOD#STYLE_OPTIONS,FOOD#QUALITY,FOOD#PRICES,DRINKS#STYLE_OPTIONS,DRINKS#QUALITY,DRINKS#PRICES,AMBIENCE#GENERAL
0,\n\nYum!\n\n\nServes really good sushi.\n\n\nN...,0.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,\n\nNo Comparison\n\n\n– I can't say enough ab...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,\n\nSnotty Attitude\n\n\n– We were treated ver...,-1.0,0.0,0.0,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,\n\nGood food!\n\n\n– We love breakfast food.\...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,\n\nOverrated\n\n\n– I was highly disappointed...,0.0,-1.0,0.0,-1,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0


In the dataframe above a +1 indicates a positive polarity and -1 indicates negative polarity whereas 0 means that this aspect is not found or found to be neutral

In [4]:
#Function to clean the text data
#Remove punctuations, newline characters and convert to lowercase.
#Note that we are not removing dot to mark sentence boundary
def clean_text_data(data):
    '''
    Description: Given text returns cleaned version
    Arguments:
                  data: string with raw review text
    Returns  :
                  cleaned: string with unwanted characters removed
    '''
    prog = re.compile('[\t\n\r\f\v\d\']', re.UNICODE)
    data = re.sub(prog, ' ', data).lower()
    prog = re.compile('[!\"#$%&\'()*+\,-/:;<=>?@[\]^_`{|}~]', re.UNICODE)
    cleaned = re.sub(prog, ' ', data)
    return cleaned

Note that we are not removing the stopwords. One reason is that often builtin stopwords lists for English language contain the word 'no', 'nor','not' etc. If removed it can change the sentiment e.g 'Food is not good' and 'Food is good' both will become 'Food good'. It is therefore decided not to remove stopwords.

In [5]:
#Function to decode and print output labels and polarity for a review
def output_to_labels(output):
    '''
     Description: Converts predicted output for a review into labels and polarity
     Arguments: output a numpy array
                
    '''
    labels = ['SERVICE#GENERAL', 'RESTAURANT#PRICES',
       'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#GENERAL', 'LOCATION#GENERAL',
       'FOOD#STYLE_OPTIONS', 'FOOD#QUALITY', 'FOOD#PRICES',
       'DRINKS#STYLE_OPTIONS', 'DRINKS#QUALITY', 'DRINKS#PRICES',
       'AMBIENCE#GENERAL']
    for index in range(len(labels)):
        value = output[0,index]
        if(value == 1):
            print(labels[index],':',' ','positive')
        elif(value == -1):
            print(labels[index],':',' ','negative')
    

In [6]:
df_training['TEXT'] = df_training['TEXT'].apply(clean_text_data)
df_testing['TEXT'] = df_testing['TEXT'].apply(clean_text_data)

In [7]:
df_testing.head()

Unnamed: 0,TEXT,SERVICE#GENERAL,RESTAURANT#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,LOCATION#GENERAL,FOOD#STYLE_OPTIONS,FOOD#QUALITY,FOOD#PRICES,DRINKS#STYLE_OPTIONS,DRINKS#QUALITY,DRINKS#PRICES,AMBIENCE#GENERAL
0,yum serves really good sushi not the b...,0.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,no comparison – i can t say enough about t...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,snotty attitude – we were treated very rud...,-1.0,0.0,0.0,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,good food – we love breakfast food thi...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,overrated – i was highly disappointed in t...,0.0,-1.0,0.0,-1,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0


In [208]:
#Function to do Feature Scaling
def standardize_features(X_train,X_test, standardize=True):
    """
    Returns standardized features
    :param X_train: Training data to be standardized
    :param X_test : Testing data to be standardized
    :param standardize : A flag to indicate if we need data standardized
    :return: X_train_std,X_test_std: Standardized training and testing data
    """
    standardizer = StandardScaler()
    X_train_std = X_train
    X_test_std = X_test

    if(standardize):
        X_train_std = standardizer.fit_transform(X_train)
        X_test_std = standardizer.transform(X_test)

    return X_train_std,X_test_std, standardizer

In [9]:
#Prepare training and testing datasets
X_train = df_training['TEXT']
y_train = df_training.drop('TEXT',axis=1)

X_test = df_testing['TEXT']
y_test = df_testing.drop('TEXT',axis=1)

### 2) Evaluation metrics for Multilabel Multiclass Classifiers ###

In order to measure the performance of a multilabel multiclass classifier we can use F1 score. F1 score is the weighted average of precision and recall. For multilabel multioutput case the F1(microaveraged) and F1(macroaveraged) can be given by the following equations.


<img src='images/f1_score_multiple.png'></img>

**Reference:** <a href="http://machinelearning.wustl.edu/mlpapers/paper_files/icml2004_GaoWLC04.pdf" target=_blank> A MFoM Learning Approach to Robust Multiclass Multi-Label Text Categorization</a>

As scikit-learn does not provide a built-in metrics we have to write a custom function to implement the above. The R is recall and P denotes precision for a class i in N labels.

In [10]:
#Function to compute F1_Score (microaveraged)
def compute_f1_score_micro(y_true, y_predicted):
    '''
    Description : Computes and returns F1 score microaveraged
    Arguments:
                 y_true:       True value
                 y_ predicted: Predicted values 
    '''
    TP,FP,TN,FN=0,0,0,0
    TP_sum,FP_sum,TN_sum,FN_sum=0,0,0,0
    for column_index in range(y_true.shape[1]):
        true_values = np.array(y_true)[:,column_index]
        predicted_values = np.array(y_predicted)[:,column_index]
        for index in range(len(true_values)):
            if(true_values[index]==predicted_values[index]==1):
                TP += 1
            elif(true_values[index]==predicted_values[index]==-1):
                TP += 1
            elif(true_values[index]==0 and predicted_values[index]!=0):
                FP += 1
            elif(true_values[index]!=0 and predicted_values[index]==0):
                FN += 1
            else:
                TN += 1   
        TP_sum = TP_sum + TP
        FP_sum = FP_sum + FP
        TN_sum = TN_sum + TN
        FN_sum = FN_sum + FN
    return ((2*TP_sum)/(FP_sum+FN_sum+(2*TP_sum)))

We also need to evaluate the sentiment polarity of the aspects. In order to evaluate the polarity we are using the same strategy as given in
<a href="http://galanisd.github.io/Papers/2015SemEval_ABSA_overview.pdf" target="_blank"> SemEval-2015 Task 12: Aspect Based Sentiment Analysis </a>.
<br/> Which defines the _polarity accuracy as the number of correctly predicted polarity labels of aspect categories, divided by the total number of aspect categories.**Note that we are not using neutral sentiment. The score is only for positive or negative sentiments**_

In [11]:
#Function to compute accuracy of polarity prediction
def compute_polarity_accuracy_score(y_true, y_predicted):
    '''
    Description : Compute the accuracy of the polarity
    Arguments : 
                   y_true :   True value
                   y_ predicted: Predicted values 
    '''
    correct = 0
    total = 0
    for column_index in range(y_true.shape[1]):
        true_values = np.array(y_true)[:,column_index]
        predicted_values = np.array(y_predicted)[:,column_index]
        for index in range(len(true_values)):
            if(true_values[index]==predicted_values[index]==1):
                correct += 1
                total += 1
            elif(true_values[index]==predicted_values[index]==-1):
                correct += 1
                total += 1
            elif(true_values[index]==1 and predicted_values[index]==-1):
                total += 1
            elif(true_values[index]==-1 and predicted_values[index]==1):
                total += 1
            else:
                pass
    return (correct/total)
    

In [12]:
#Make scoring function
f1_scorer = make_scorer(compute_f1_score_micro, greater_is_better=True)

### 3) Baseline Classifer (RandomForest with CountVectorizer)

In [20]:
#RandomForest Classifier
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('clf', MultiOutputClassifier(RandomForestClassifier(max_depth=3,class_weight='balanced')))])
#RandomForest specific parameters
parameters = {
'clf__estimator__n_estimators': [100,300,350]
}
#Find the optimal parameters for RandomForest
model_parameter_selection = GridSearchCV(pipeline,param_grid=parameters,cv=5,scoring = f1_scorer)
model_parameter_selection.fit(X_train, y_train)
print("Best Estimator Parameters are:",model_parameter_selection.best_params_, model_parameter_selection.best_score_)
y_predicted = model_parameter_selection.predict(X_test)
f1_micro=compute_f1_score_micro(y_test,y_predicted)
polarity_accuracy = compute_polarity_accuracy_score(y_test,y_predicted)

Best Estimator Parameters are: {'clf__estimator__n_estimators': 350} 0.718798696709


In [18]:
print('F1-Score for classifier: ',f1_micro)
print('Accuracy Score for sentiment polarity: ',polarity_accuracy)

F1-Score for classifier:  0.7170465807730426
Accuracy Score for sentiment polarity:  0.8296943231441049


In [15]:
#Function to find aspects and sentiment polarity given a review
def analyze_review(classifier, review_text):
    '''
    Description : Detects Aspects and finds the polarities given review text
    Arguments : 
                review_text : A string with review sentence/sentences
                classifier: A model used to predict the aspects and polarities
    '''
    dictionary = {'review':clean_text_data(review_text)}
    df_input=pd.DataFrame(dictionary,index=np.arange(len(dictionary.keys())))
    output = classifier.predict(df_input['review'])
    output_to_labels(output)
    

In [19]:
classifier = model_parameter_selection
review_text = "worst service i ever had  a group of   of us went there for sunday brunch and sat outside    everyone that sat in the back outside agreed that it was the worst service we had ever received    our waiter was non existent and after our food finally arrived over an hour after we ordered  we were not given any water or utensils    i complained to the manager  but he was not even apologetic    i will never return again"
analyze_review(classifier,"The food was terrible as far as I am concerned")

RESTAURANT#GENERAL :   positive
FOOD#QUALITY :   positive


### 4) Word Embeddings

Word Embeddings are dense low dimensional representation of words. Word Embeddings convert words from a vocabulary into vectors of real numbers. Word Embeddings have been used in many Sentiment Analysis tasks and have been found to be quite effective. An example model is given in <a href="http://www.cs.ubc.ca/~rjoty/paper/emnlp-paper-drnn-cr.pdf" target=_blank>Fine-grained Opinion Mining with Recurrent Neural Networks and Word Embeddings</a>. Word Embeddings or Vector Space Models (VSM) place words with semantic similarity nearby in the vector space.  

Although we can use Word Embeddings created from generic corpuses. It is a good idea to generate a domain specific Word Embeddings model as shown in <a href="http://nlp.stanford.edu/pubs/hamilton2016inducing.pdf">Inducing Domain-Specific Sentiment Lexicons from Unlabeled Corpora</a> 

Yelp has made public a huge dataset as part of its data challenge program. We used the scripts available at <a href="https://github.com/titipata/yelp_dataset_challenge" target=_blank>Yelp dataset challenge scripts</a> to create Word Embeddings. The scripts do not work with the latest version of Tensorflow (r0.12) and Python 3 so we had to modify them slightly. The updated scripts are available at <a href="https://github.com/umairacheema/yelp_dataset_challenge/tree/hotfix/python3x"target=_blank>Yelp Util for Tensorflow r0.12</a>

The jupyter notebook to build domain specific word embedding for Restaurants is given in the aforementioned github repository as well. There are **21,892** restaurants and **9,90,627** reviews in the corpus used to build domain specific WordEmbeddings.

In [34]:
#The following code will not run in notebook as it requires the yelword2vec
#created using code in 
#https://github.com/umairacheema/yelp_dataset_challenge/blob/hotfix/python3x/examples/domain_specific_word_embeddings.ipynb

from gensim.models.word2vec import Word2Vec
w2vmodel = Word2Vec.load_word2vec_format('data/yelpword2vec', binary=False)


0


In [165]:
#Function to compute vectorized representation of a single review
#Review should be a list of words
def vectorize_review_w2v(w2vmodel,review,size=100):
    '''
    Description : Given word2vec model and review compute average
                  feature vector
    Arguments :
                 w2vmodel : Trained word2vec model
                 review : A single review as a list of words
    '''
    #Clean the review data
    review = clean_text_data(review)
    #Convert review into a list of words.
    text = review.split()
    word_count = 0
    vector = np.zeros(size).reshape((1, size))
    w2vmodel_words = set(w2vmodel.index2word)
    for word in text:
        if word in w2vmodel_words:
            #vector += w2vmodel[word].reshape((1,size))
            vector = np.add(vector,w2vmodel[word])
            word_count += 1
    if word_count>0:
        vector = vector/word_count
    return vector
   

In [173]:
#Function to vectorize multiple reviews using word2vec
def vectorize_reviews_w2v(w2vmodel,reviews,size=100):
    '''
    Description: Given a Pandas Series with reviews, compute dense
                 vectorized representation using Word2Vec
    Arguments :
                w2vmodel : Trained word2vec model
                reviews: A Pandas series with all the reviews.
    '''
    #default value of 100 used by yelp_util
    #Initialize a numpy array to store  feature vector for
    #all reviews
    vectorized_features = np.zeros((len(reviews),size),dtype='float32')
    #Initialize a vector index
    index = 0
    for review in reviews:
        vectorized_features[index] = vectorize_review_w2v(w2vmodel,review,size)
        index += 1
    return vectorized_features

In [167]:
#Convert training and testing features 
# into feature vectors
training_features = vectorize_reviews_w2v(w2vmodel,X_train,size=100)
testing_features = vectorize_reviews_w2v(w2vmodel,X_test)

**As RandomForest does not require Standardized features we are not doing feature scaling or standardization**

### 5) Random Forest with Word2Vec

In [168]:
rfclf = MultiOutputClassifier(RandomForestClassifier(max_depth=3,class_weight='balanced'))
#RandomForest specific parameters
parameters = {
'estimator__n_estimators': [100,300,350]
}
#Find the optimal parameters for RandomForest
rf_parameter_selection = GridSearchCV(rfclf,param_grid=parameters,cv=5,scoring = f1_scorer)
rf_parameter_selection.fit(training_features, y_train)
print("Best Estimator Parameters are:",rf_parameter_selection.best_params_, rf_parameter_selection.best_score_)
rf_y_predicted = rf_parameter_selection.predict(testing_features)
rf_f1_micro=compute_f1_score_micro(y_test,rf_y_predicted)
rf_polarity_accuracy = compute_polarity_accuracy_score(y_test,rf_y_predicted)

Best Estimator Parameters are: {'estimator__n_estimators': 350} 0.697912552508


In [169]:
print(rf_f1_micro,rf_polarity_accuracy)

0.7196082605918671 0.8879668049792531


In [203]:
#Function to test on an unseen short review
def classify_short_review(classifier,w2vmodel,size=100):
    review_text = "I love the food. The service was terrible. I hated the whole thing."
    cleaned = clean_text_data(review_text)
    predicted = classifier.predict(vectorize_review_w2v(w2vmodel,cleaned,size))
    print(output_to_labels(predicted))

In [171]:
output_to_labels(predicted)

RESTAURANT#GENERAL :   negative
FOOD#QUALITY :   negative


### 6) Support Vector Machines with WordEmbeddings

In [109]:
from sklearn.svm import SVC
svmclf = MultiOutputClassifier(SVC(kernel='rbf'))
#Support Vector Machine specific parameters
parameters = {
'estimator__gamma': [0.0001,0.001,0.01,0.1],
'estimator__C':[0.1,1,10,100]
}
#As an additional step we need to standardize the features
training_features_std, testing_features_std, standardizer = standardize_features(training_features,testing_features) 
#Find the optimal parameters for Support Vector Machine Classifier
svm_parameter_selection = GridSearchCV(svmclf,param_grid=parameters,cv=5,scoring = f1_scorer)
svm_parameter_selection.fit(training_features_std, y_train)
print("Best Estimator Parameters are:",svm_parameter_selection.best_params_, svm_parameter_selection.best_score_)
svm_y_predicted = svm_parameter_selection.predict(testing_features_std)
svm_f1_micro=compute_f1_score_micro(y_test,svm_y_predicted)
svm_polarity_accuracy = compute_polarity_accuracy_score(y_test,svm_y_predicted)
print('SVM F1 Score:',svm_f1_micro)
print('SVM Polarity Accuracy:',svm_polarity_accuracy)

Best Estimator Parameters are: {'estimator__C': 10, 'estimator__gamma': 0.001} 0.733305052885
SVM F1 Score: 0.7398895027624309
SVM Polarity Accuracy: 0.9130434782608695


**It is important to note that even with fine tuned Support Vector Machines the test accuracy is not very high. This could mean that we need to improve our WordEmbeddings model. Previously we used a feature size of 300 as that was hardcoded in the yelp util. We also did not add the SemEval datasets while training the word2vec model. We now need to train a new Word2Vector model and train it on the corpus. We shall also try Continuous Bag of Words (CBOW) as well as Skip-n gram based models.**

### 7) Support Vector Machines with Bigger Feature Size and CBOW

In [220]:
#Read the Yelp restaurants reviews data
df_yelp_restaurant_reviews = pd.read_pickle('data/restaurant_reviews.pkl')

In [221]:
#Combine the review data with the Semantic Evaluation data
yelp_reviews_text = df_yelp_restaurant_reviews.text
combined_reviews_text = pd.concat([yelp_reviews_text,X_train],axis=0)
yelp_reviews_text = yelp_reviews_text.to_string(header=False,index=False)
yelp_reviews_text = clean_text_data(yelp_reviews_text)
#Save all reviews in a text file
file = open('data/yelp_semeval_reviews.txt', 'w')
file.write(yelp_reviews_text)
file.close()

In [222]:
import multiprocessing
from gensim.models.word2vec import LineSentence
#Convert it into a format that word2vec can understand
linesentences = LineSentence('data/yelp_semeval_reviews.txt')
#Build the model with higher number of features
n_dims = 500
cbow_model = Word2Vec(linesentences,size=n_dims,workers=multiprocessing.cpu_count())


In [223]:
#Convert training and testing features 
# into feature vectors
training_features_cbow = vectorize_reviews_w2v(cbow_model,X_train,size=n_dims)
testing_features_cbow = vectorize_reviews_w2v(cbow_model,X_test,size=n_dims)


In [240]:
svmclf = MultiOutputClassifier(SVC(kernel='rbf'))
#Support Vector Machine specific parameters
parameters = {
'estimator__gamma': [0.0001,0.001,0.01,0.1],
'estimator__C':[0.1,1,10,100]
}
#As an additional step we need to standardize the features
training_features_std, testing_features_std, standardizer = standardize_features(training_features_cbow,testing_features_cbow) 
#Find the optimal parameters for Support Vector Machine Classifier
svm_parameter_selection = GridSearchCV(svmclf,param_grid=parameters,cv=5,scoring = f1_scorer)
svm_parameter_selection.fit(training_features_std, y_train)
print("Best Estimator Parameters are:",svm_parameter_selection.best_params_, svm_parameter_selection.best_score_)
svm_y_predicted = svm_parameter_selection.predict(testing_features_std)
svm_f1_micro=compute_f1_score_micro(y_test,svm_y_predicted)
svm_polarity_accuracy = compute_polarity_accuracy_score(y_test,svm_y_predicted)
print('SVM F1 Score:',svm_f1_micro)
print('SVM Polarity Accuracy:',svm_polarity_accuracy)

Best Estimator Parameters are: {'estimator__C': 10, 'estimator__gamma': 0.0001} 0.719240597991
SVM F1 Score: 0.7345273180684652
SVM Polarity Accuracy: 0.8938053097345132


In [241]:
review_text ="I did not like this place.The staff was good."
review_text = clean_text_data(review_text)
feature = vectorize_review_w2v(cbow_model,review_text,size=n_dims)
feature_std = standardizer.transform(feature)
predicted= svm_parameter_selection.predict(feature_std)
output_to_labels(predicted)

SERVICE#GENERAL :   negative
RESTAURANT#GENERAL :   negative
FOOD#QUALITY :   negative


In [248]:
from sklearn.naive_bayes import MultinomialNB

lr = MultiOutputClassifier(MultinomialNB())
print(training_features_cbow)
lr.fit(training_features_std, y_train)
lr_y_predicted = lr.predict(testing_features_std)
lr_f1_micro=compute_f1_score_micro(y_test,lr_y_predicted)
lr_polarity_accuracy = compute_polarity_accuracy_score(y_test,lr_y_predicted)
print('SVM F1 Score:',lr_f1_micro)
print('SVM Polarity Accuracy:',lr_polarity_accuracy)

[[ 0.0304029   0.07559045 -0.06514373 ..., -0.04291529 -0.20947297
  -0.04910846]
 [-0.19939972  0.26889768  0.25863898 ...,  0.13018645 -0.16195723
   0.00127879]
 [-0.16938011 -0.06242532 -0.04314443 ..., -0.00968929 -0.28984979
   0.1121956 ]
 ..., 
 [-0.18494466  0.20268759  0.17381716 ...,  0.02723992 -0.21639413
   0.0179843 ]
 [-0.22209762  0.23866831  0.28071001 ...,  0.16685204 -0.08933052
  -0.14263883]
 [-0.19387409  0.23260443  0.1264381  ...,  0.191836   -0.24203971
  -0.05207661]]


ValueError: Input X must be non-negative

### 8) Support Vector Machines with CBOW and TfIdfVectorizer 

### 9) Support Vector Machines with Skip gram, TfIdfVectorizer

### 10) Support Vector Machines with gensim Phrase model  