In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score
from sacred import Experiment
from sacred.observers import MongoObserver
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from lda import lda
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
import string

In [2]:
# NOTE(review): the list-valued columns are parsed with eval(), which executes
# whatever expression the CSV cell contains. Only use this on trusted files;
# ast.literal_eval would be the safer choice if the cells are plain literals.
_list_column_converters = {
    'TargetWordList': eval,
    'OpinionCategoryList': eval,
    'OpinionSubcategoryList': eval,
    'OpinionPolarityList': eval,
}
X_train = pd.read_csv(
    "datasets/restaurants/train_task1.csv",
    converters=_list_column_converters,
)
X_test = pd.read_csv(
    "datasets/restaurants/gold_1.csv",
    converters=_list_column_converters,
)

In [3]:
## Remove sentences with no opinion
# .copy() makes each filtered frame independent of the frame it was sliced
# from, so adding columns to it later does not raise SettingWithCopyWarning.
X_train = X_train[X_train.OpinionCategoryList.map(len) != 0].copy()
X_test = X_test[X_test.OpinionCategoryList.map(len) != 0].copy()

In [4]:
# Aspect categories, in the fixed order used wherever labels are binarized.
unique_categories = [
    'RESTAURANT',
    'LOCATION',
    'AMBIENCE',
    'SERVICE',
    'FOOD',
    'DRINKS',
]

In [5]:
## Create aspect sentiment list based on the first occurrence of the aspect and the corresponding sentiment
def _first_polarity(row, category):
    """Return the polarity paired with the first mention of `category` in `row`, or -1 if absent."""
    row_categories = row['OpinionCategoryList']
    if category in row_categories:
        return row['OpinionPolarityList'][row_categories.index(category)]
    return -1


for c in unique_categories:
    # Train first, then test, for each category.
    for _frame in (X_train, X_test):
        _frame[c + "_Sentiment"] = _frame.apply(lambda row: _first_polarity(row, c), axis=1)

In [6]:
# Hardcode unique categories so the indicator columns always come out in the same order.
mlb = MultiLabelBinarizer(classes=unique_categories)

# Add one 0/1 indicator column per category to each frame.
# The binarizer is fitted on the training labels only; the test labels are
# merely transformed, so nothing is ever fitted on held-out data.
categories_columns = pd.DataFrame(
    mlb.fit_transform(X_train.OpinionCategoryList),
    columns=mlb.classes_,
    index=X_train.index,
)
test_categories_columns = pd.DataFrame(
    mlb.transform(X_test.OpinionCategoryList),
    columns=mlb.classes_,
    index=X_test.index,
)
X_train = pd.concat([X_train, categories_columns], axis=1)
X_test = pd.concat([X_test, test_categories_columns], axis=1)


In [7]:
# Hold out a quarter of the training rows for validation (shuffled, fixed seed).
X_train, X_val = train_test_split(
    X_train,
    test_size=0.25,
    shuffle=True,
    random_state=7,
)

In [8]:
def preprocess_dataframe_sentence(dataframe):
    """Return a copy of `dataframe` with a cleaned `Preprocessed_Sentence` column.

    Every value of the `Sentence` column is, in order: stripped of surrounding
    whitespace, lower-cased, cleared of ASCII punctuation and digits, tokenized
    with `word_tokenize`, filtered of English stop words, and re-joined with
    single spaces. The input frame itself is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string-valued `Sentence` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the extra `Preprocessed_Sentence` column.
    """
    # Build the stop-word set once instead of calling stopwords.words() for
    # every single token; a set also gives constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    # One table that deletes punctuation and digits in a single pass.
    removal_table = str.maketrans('', '', string.punctuation + '1234567890')

    def clean(sentence):
        text = sentence.strip().lower().translate(removal_table)
        return " ".join(word for word in word_tokenize(text) if word not in english_stopwords)

    df = dataframe.copy()
    df['Preprocessed_Sentence'] = df.Sentence.map(clean)
    return df

In [9]:
class PreproccesingTransformer():
    """Pipeline step that applies a user-supplied function to its input.

    The step keeps no fitted state: `fit` does nothing and `transform`
    simply calls the wrapped function.
    """

    def __init__(self, func):
        # Callable invoked on whatever is passed to transform().
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, input_df, **transform_params):
        """Return `self.func(input_df)`; extra keyword arguments are ignored."""
        apply_func = self.func
        return apply_func(input_df)

In [10]:
class MyTfIdfTransformer():
    """Wrap a `TfidfVectorizer` so it accepts a single text column and returns dense features."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=3000,
            min_df=3,
            max_df=0.8,
            ngram_range=(1, 3),
            stop_words=stopwords.words('english'),
        )

    def fit(self, column, y=None, **fit_params):
        """Fit the vectorizer on the texts in `column`; `y` and extra params are unused."""
        documents = column.tolist()
        self.tfidf_vectorizer.fit(documents)
        return self

    def transform(self, column, **transform_params):
        """Vectorize the texts in `column` and return the result converted with `.toarray()`."""
        documents = column.tolist()
        return self.tfidf_vectorizer.transform(documents).toarray()

In [11]:
class Count_vectorizer_Transformer():   # max features? min/max df?
    """Wrap a `CountVectorizer` (uni- and bigrams) for use on a single text column."""

    def __init__(self):
        self.count_vectorizer = CountVectorizer(
            stop_words=stopwords.words('english'),
            ngram_range=(1, 2),
        )

    def fit(self, column, y=None, **fit_params):
        """Fit the vectorizer on the texts in `column`; `y` and extra params are unused."""
        documents = column.tolist()
        self.count_vectorizer.fit(documents)
        return self

    def transform(self, column, **transform_params):
        """Vectorize the texts in `column` and return the result converted with `.toarray()`."""
        documents = column.tolist()
        return self.count_vectorizer.transform(documents).toarray()

In [12]:
class hashing_Transformer():   # max features? min/max df?
    """Wrap a `HashingVectorizer` (uni- and bigrams) for use on a single text column."""

    def __init__(self):
        self.hashing_vectorizer = HashingVectorizer(
            stop_words=stopwords.words('english'),
            ngram_range=(1, 2),
        )

    def fit(self, column, y=None, **fit_params):
        """Call the vectorizer's `fit` on the texts in `column`; `y` and extra params are unused."""
        documents = column.tolist()
        self.hashing_vectorizer.fit(documents)
        return self

    def transform(self, column, **transform_params):
        """Vectorize the texts in `column` and return the result converted with `.toarray()`."""
        documents = column.tolist()
        return self.hashing_vectorizer.transform(documents).toarray()

In [13]:
class MyLdaTransformer():
    """Topic features from an `lda.LDA` model fitted on a single text column.

    `fit` builds a count vectorizer over the tokens found in the column and
    fits the LDA model on the resulting counts; `transform` returns the LDA
    model's output for new texts, with non-finite values replaced and values
    above 1 clipped.
    """

    def __init__(self, n_topics, random_state, n_iter) :
        # Created in fit(); stays None until then.
        self.count_vectorizer = None
        self.n_topics = n_topics
        self.lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_state, refresh=400)

    def transform(self, column, **transform_params):
        """Return the LDA output for the texts in `column`.

        NaN and -inf become 0.0 and +inf becomes 1.0; if any value still
        exceeds 1 the whole array is clipped to [0, 1].

        Raises
        ------
        AttributeError
            If called before `fit`.
        """
        if self.count_vectorizer is None:
            # Same exception type as the failing attribute lookup on None
            # would give, but with a message that names the actual problem.
            raise AttributeError("MyLdaTransformer.transform() called before fit()")

        column_tr = self.count_vectorizer.transform(column.tolist())
        result = np.nan_to_num(self.lda_model.transform(column_tr), posinf=1.0, neginf=0.0)
        if np.max(result) > 1:
            result = np.clip(result, 0, 1)
        return result

    def fit(self, column, y=None, **fit_params):
        """Build the vocabulary from `column`, then fit the LDA model on its token counts."""
        documents = column.tolist()
        # Sort the vocabulary: iteration order of a set of strings changes
        # between interpreter runs, which would shuffle the count-matrix
        # columns and make the fitted model differ despite a fixed random_state.
        vocab = sorted({token for sentence in documents for token in word_tokenize(sentence)})
        self.count_vectorizer = CountVectorizer(vocabulary=vocab)
        corpus_tr = self.count_vectorizer.transform(documents)
        self.lda_model.fit(corpus_tr)

        return self

In [14]:
# Sacred experiment handle; interactive=True is Sacred's switch for defining
# an experiment from an interactive session such as this notebook.
ex = Experiment(
    'jupyter_ex',
    interactive=True,
)
#ex.observers.append(MongoObserver(url="localhost:27017",db_name="sacred_experiments"))

In [15]:
#ASPECT CONFIGURATION


# Sacred config scope for the aspect experiment: the names assigned in the body
# match the parameter names of `run` below.
# NOTE(review): the self-assignments of X_train / X_val / X_test read the
# notebook-level frames of the same name; this relies on how Sacred executes
# config scopes — confirm before refactoring this into a plain function.
@ex.config
def aspect_cfg():
    n_topics=6  # number of LDA topics (used by the "lda_pipeline" feature option)
    column_transformers=["tfidf_pipeline"]  # keys of the `pipelines` dict built in run
    random_state = 10  # seed handed to the estimators built in run
    n_splits=16  # NOTE(review): not read in the visible body of run
    X_train= X_train
    X_val = X_val
    X_test = X_test
    n_estimators=200
    selected_aspect_classifier = "linearsvc"  # key of the `aspect_classifiers` dict built in run
    selected_polarity_classifier = "random_forest"  # NOTE(review): not read in the visible body of run
    unique_categories= ['RESTAURANT', 'LOCATION', 'AMBIENCE', 'SERVICE', 'FOOD' ,'DRINKS']
    
    
@ex.main
def run(X_train, X_val, X_test,n_topics,column_transformers,random_state, n_splits, n_estimators, selected_aspect_classifier, selected_polarity_classifier, unique_categories):
    """Train the selected aspect classifier and score it on the validation and test frames.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Frames with a `Sentence` column and an `OpinionCategoryList` column.
    n_topics : int
        Number of topics for the LDA feature option.
    column_transformers : list of str
        Keys of the feature pipelines to combine (see `pipelines` below).
    random_state : int
        Seed passed to every estimator that accepts one.
    n_splits : int
        Accepted for config compatibility; not used here.
    n_estimators : int
        Number of trees for the "randomforest" classifier.
    selected_aspect_classifier : str
        Key of the classifier to train (see `aspect_classifiers` below).
    selected_polarity_classifier : str
        Accepted for config compatibility; not used here.
    unique_categories : list of str
        Label classes, in output-column order.

    Returns
    -------
    tuple
        (val_acc, val_f1_micro, test_acc, test_f1_micro)

    Raises
    ------
    KeyError
        If a requested pipeline or classifier key is unknown.
    """
    text_column = "Preprocessed_Sentence"
    count_pipeline = Pipeline([("vectorizer", Count_vectorizer_Transformer())])
    pipelines = {
        "hashing_pipeline": (Pipeline([("hashing_vectorizer", hashing_Transformer())]), text_column),
        # The misspelled key is kept so existing configurations keep working;
        # the correctly spelled alias selects the same pipeline.
        "vectorizer_pipepilne": (count_pipeline, text_column),
        "vectorizer_pipeline": (count_pipeline, text_column),
        "tfidf_pipeline": (Pipeline([('tfidf', MyTfIdfTransformer())]), text_column),
        "lda_pipeline": (Pipeline([('lda', MyLdaTransformer(n_topics=n_topics, random_state=random_state, n_iter=20000))]), text_column),
    }

    aspect_classifiers = {
        "linearsvc": OneVsRestClassifier(LinearSVC(random_state=random_state)),
        "mlp": MLPClassifier(random_state=random_state, max_iter=300, hidden_layer_sizes=(100,50,100), solver="adam", verbose=True, activation="tanh"),
        # Honour the configured tree count and seed instead of a hard-coded,
        # unseeded forest.
        "randomforest": RandomForestClassifier(n_estimators=n_estimators, random_state=random_state),
    }

    ct = ColumnTransformer([(c, pipelines[c][0], pipelines[c][1]) for c in column_transformers])

    aspect_classifier = aspect_classifiers[selected_aspect_classifier]

    aspect_pipeline = Pipeline(steps=[
        ('Preprocessing', PreproccesingTransformer(preprocess_dataframe_sentence)),
        ("ColumnTransformer", ct),
        ("Classifier", aspect_classifier),
    ])

    # Binarize labels with the categories given to this run (not a notebook
    # global); fit on the training labels only and reuse it for val/test.
    label_binarizer = MultiLabelBinarizer(classes=list(unique_categories))
    y_train = np.asarray(label_binarizer.fit_transform(X_train.OpinionCategoryList.tolist()))
    y_val = np.asarray(label_binarizer.transform(X_val.OpinionCategoryList.tolist()))
    y_test = np.asarray(label_binarizer.transform(X_test.OpinionCategoryList.tolist()))

    aspect_pipeline.fit(X_train, y_train)

    val_y_pred = aspect_pipeline.predict(X_val)
    val_acc = accuracy_score(y_val, val_y_pred)
    val_f1_micro = f1_score(y_val, val_y_pred, average="micro")

    test_y_pred = aspect_pipeline.predict(X_test)
    test_acc = accuracy_score(y_test, test_y_pred)
    test_f1_micro = f1_score(y_test, test_y_pred, average="micro")

    return val_acc, val_f1_micro, test_acc, test_f1_micro
            



In [16]:
# Execute the experiment with its default configuration and keep the returned run handle.
r = ex.run()

INFO - jupyter_ex - Running command 'run'
INFO - jupyter_ex - Started
INFO - jupyter_ex - Result: (0.6206088992974239, 0.7436440677966102, 0.5604770017035775, 0.7154088050314467)
INFO - jupyter_ex - Completed after 0:00:03


In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
glove_path = "glove.840B.300d.txt"
rest_emb_path = "domain_embedding/restaurant_emb.vec"


def _load_embeddings(path):
    """Read a space-delimited embedding text file into a {word: float32 vector} dict.

    Each line is split on single spaces; the first field is the word and the
    remaining fields are its vector components. Lines whose components cannot
    be parsed as floats are reported and skipped.
    """
    embeddings = dict()
    with open(path, encoding="utf8") as embedding_file:
        for line_number, raw_line in enumerate(embedding_file, start=1):
            # rstrip() drops the newline (and any trailing blanks) so the last
            # component is a clean number; leading characters are left alone.
            fields = raw_line.rstrip().split(" ")
            try:
                embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
            except ValueError:
                # Only parse failures are skipped; anything else (including a
                # kernel interrupt during this long read) propagates.
                print("Skipping unparsable line {} of {}: starts with {!r}".format(line_number, path, fields[:3]))
    return embeddings


embeddings_dictionary = _load_embeddings(glove_path)
restaurant_embeddings_dictionary = _load_embeddings(rest_emb_path)

In [25]:
# Clean the sentences of every split with the same preprocessing function.
train_preprocessed, val_preprocessed, test_preprocessed = (
    preprocess_dataframe_sentence(split_frame)
    for split_frame in (X_train, X_val, X_test)
)
# Keras tokenizer; words it has not seen during fitting map to the '<unk>' token.
tokenizer = Tokenizer(oov_token='<unk>')

In [27]:
# The vocabulary is learned from the training sentences only.
tokenizer.fit_on_texts(train_preprocessed.Preprocessed_Sentence)
# +1 on top of the number of entries in word_index.
vocab_size = 1 + len(tokenizer.word_index)
# Longest training sentence, measured in word_tokenize tokens.
maxlen = max(
    len(word_tokenize(sentence))
    for sentence in train_preprocessed.Preprocessed_Sentence
)

In [29]:
def _texts_to_padded(sentences):
    """Convert sentences to integer sequences and post-pad them to `maxlen`."""
    sequences = tokenizer.texts_to_sequences(sentences)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post', maxlen=maxlen)


x_train = _texts_to_padded(train_preprocessed.Preprocessed_Sentence)
x_val = _texts_to_padded(val_preprocessed.Preprocessed_Sentence)
x_test = _texts_to_padded(test_preprocessed.Preprocessed_Sentence)

# Fit the label binarizer on the training labels only; validation and test
# labels are transformed with it rather than refitting on held-out data.
y_train = np.asarray(mlb.fit_transform(train_preprocessed.OpinionCategoryList.tolist()))
y_val = np.asarray(mlb.transform(val_preprocessed.OpinionCategoryList.tolist()))
y_test = np.asarray(mlb.transform(test_preprocessed.OpinionCategoryList.tolist()))

In [32]:
# One 300-wide row per tokenizer index; rows for words missing from
# embeddings_dictionary stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:
# 400-wide rows: columns 0-299 come from embeddings_dictionary and columns
# 300-399 from restaurant_embeddings_dictionary; a part whose word is missing
# from its dictionary stays zero.
extended_embedding_matrix = np.zeros((vocab_size, 400))
for word, index in tokenizer.word_index.items():
    domain_embedding_vector = restaurant_embeddings_dictionary.get(word)
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        extended_embedding_matrix[index, :300] = embedding_vector
    if domain_embedding_vector is not None:
        extended_embedding_matrix[index, 300:] = domain_embedding_vector

In [34]:
# Frozen embedding lookup -> two stacked bidirectional LSTMs with dropout ->
# dense head with one sigmoid output per aspect category.
_layers = tf.keras.layers

emb_inputs = _layers.Input(shape=(maxlen,))
embedding_layer = _layers.Embedding(
    vocab_size,
    300,
    embeddings_initializer=tf.constant_initializer(embedding_matrix),
    trainable=False,  # the embedding weights are not updated during training
)(emb_inputs)

bdlstm_1 = _layers.Bidirectional(
    _layers.LSTM(128, return_sequences=True)
)(embedding_layer)
dropout_1 = _layers.SpatialDropout1D(0.3)(bdlstm_1)
bdlstm_2 = _layers.Bidirectional(
    _layers.LSTM(64)
)(dropout_1)
dropout_2 = _layers.Dropout(0.25)(bdlstm_2)
dense_layer_1 = _layers.Dense(128, activation="relu")(dropout_2)
dense_layer_2 = _layers.Dense(
    len(unique_categories),
    activation="sigmoid",
)(dense_layer_1)

In [35]:
# Assemble and compile the multi-label model (independent sigmoid outputs,
# hence binary cross-entropy).
model = tf.keras.Model(inputs=emb_inputs, outputs=dense_layer_2)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 35)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 35, 300)           762900    
_________________________________________________________________
bidirectional (Bidirectional (None, 35, 256)           439296    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 35, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               164352    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512 

In [45]:
# Train for 3 epochs, evaluating on the validation split after each one.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=3,
    verbose=1,
)

In [47]:
import matplotlib.pyplot as plt


def _plot_training_curves(train_key, val_key, train_label, val_label, title, ylabel):
    """Plot one metric from `history` next to its validation counterpart, per epoch."""
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(loc='upper left')
    plt.show()


_plot_training_curves('acc', 'val_acc', "Train acc", "Val acc", 'model accuracy', 'accuracy')
_plot_training_curves('loss', 'val_loss', "Train loss", "Val loss", 'model loss', 'loss')

In [48]:
# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions per category.
_test_scores = model.predict(x_test)
y_hat = (_test_scores > 0.5).astype(int)
# Exact-match accuracy, then micro-averaged F1.
print(accuracy_score(y_test, y_hat))
print(f1_score(y_test, y_hat, average='micro'))