In [65]:
from srs.predictor import loadTrainedPredictor
from srs.utilities import loadUsefulTrainingData
from srs import settings
from srs.Model_Word2Vec import AspectPatterns, distill_dynamic, static_aspect_to_vec
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score,classification_report
from sklearn.utils import column_or_1d

## set up word2vec predictor

In [2]:
w2v_predictor = loadTrainedPredictor('Word2Vec')

## load training sentences

In [3]:
static_traning_data_dir = settings["static_training_data"]
sentences = loadUsefulTrainingData(static_traning_data_dir)

## create feature vec for each sentence

In [50]:
aspectPattern_names = ['adj_nn','nn']
aspectPatterns = AspectPatterns(aspectPattern_names)

In [51]:
df = pd.DataFrame(columns=w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
target = pd.DataFrame(columns=['Prod_Feat'])

In [52]:
for sen0 in sentences:

    distill_dynamic(sen0, aspectPatterns)

    #transform the sentence's word2vec_features to vectors
    word2vec_features = []
    for item in sen0.word2vec_features_list:
        word2vec_features=word2vec_features + item
    vec_list=[]
    for dynamic_aspect in word2vec_features:
        dynamic_aspect_splitted=dynamic_aspect.split(' ')
        aspect_phrase_vec=[]
        for word in dynamic_aspect_splitted:
            if word in w2v_predictor.model:
                aspect_word_vec=w2v_predictor.model[word]
                aspect_phrase_vec.append(aspect_word_vec)
        if aspect_phrase_vec:
            vec_list.append(aspect_phrase_vec)

    num_static_aspect = len(w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
    static_wordlist_vec = static_aspect_to_vec(w2v_predictor.static_aspects_all, w2v_predictor.model)
    if vec_list:
        similarity_matrix=np.zeros([len(vec_list),num_static_aspect])
        for i in range(len(vec_list)):
            for j in range(num_static_aspect):   
                similarity_item_matrix=np.zeros([len(vec_list[i]),len(static_wordlist_vec[j])])
                for kk in range(len(vec_list[i])):
                    for ll in range(len(static_wordlist_vec[j])):
                        similarity_item_matrix[kk][ll]=np.dot(vec_list[i][kk],static_wordlist_vec[j][ll])

                similarity_item_row=np.max(similarity_item_matrix,axis=1)
                similarity_item=np.sum(similarity_item_row)
                similarity_matrix[i][j]=similarity_item

    useful_features_vec = np.max(similarity_matrix, axis=0)
    
    row = pd.DataFrame([useful_features_vec],
                       columns = w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
    df = df.append(row,ignore_index=True)
    
    row_target = pd.DataFrame([sen0.labeled_aspects],columns=['Prod_Feat'])
    target = target.append(row_target,ignore_index=True)

In [53]:
df.head(5)

Unnamed: 0,price,pictures,video,zoom,size,design,battery,screen,detection,ease use,quality
0,0.931658,1.534901,0.995598,1.06262,0.985629,0.835051,0.906312,0.993779,1.261998,1.457244,0.941206
1,0.55725,0.851869,0.598412,1.055173,0.882652,0.864155,1.053778,1.025575,1.176673,0.838404,0.549492
2,1.025858,0.477486,0.51262,1.360526,1.238442,0.918769,1.620101,0.652305,0.967244,1.238442,0.737678
3,0.78582,0.508109,0.550379,1.374234,1.168706,0.787749,1.658028,0.781928,1.009568,1.168706,0.461103
4,0.601522,0.367451,0.363241,0.551944,0.594046,0.432167,0.999162,0.519363,0.609825,0.579602,0.460836


In [54]:
target.head(5)

Unnamed: 0,Prod_Feat
0,pictures
1,size
2,battery
3,battery
4,battery


## set up training ans testing sets

In [71]:
train_idx, test_idx = train_test_split(df.index, test_size=0.25, random_state=42)
X_train  = df.iloc[train_idx]
X_test = df.iloc[test_idx]
y_train = target.iloc[train_idx].values.ravel()
y_test = target.iloc[test_idx].values.ravel()

In [83]:
for kernel in ('linear', 'rbf', 'poly'):
    print "================kernel: {0}=========================".format(kernel)
    w2v_svm = svm.SVC(kernel=kernel, gamma=6)
    w2v_svm.fit(X_train, y_train)
    y_predicted = w2v_svm.predict(X_test)
    target_names = target.Prod_Feat.unique()
    print(classification_report(y_test, y_predicted, target_names=target_names))

             precision    recall  f1-score   support

   pictures       0.57      0.80      0.67        10
       size       0.56      0.62      0.59         8
    battery       0.00      0.00      0.00         6
     screen       0.48      0.74      0.58        19
       zoom       0.58      0.62      0.60        48
ease of use       0.62      0.80      0.70        10
  detection       0.00      0.00      0.00         3
    quality       0.60      0.35      0.44        17
      price       0.60      0.57      0.59        21
     design       1.00      0.33      0.50         6
      video       0.50      0.48      0.49        27

avg / total       0.55      0.56      0.54       175

             precision    recall  f1-score   support

   pictures       0.53      0.90      0.67        10
       size       0.67      0.75      0.71         8
    battery       0.50      0.17      0.25         6
     screen       0.44      0.58      0.50        19
       zoom       0.54      0.77      0.64