In [7]:
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import re

from textblob import TextBlob
from sklearn import metrics
from sklearn import svm


In [8]:
# Root directory of the dataset. load_conversations looks for one sub-directory
# per split name directly under it. Set the LORIS_DATA_DIR environment variable
# to point somewhere else instead of editing this cell; the original location is
# kept as the default. Joining with '' guarantees a trailing path separator, so
# code that appends a split name to this string keeps working.
input_directory = os.path.join(
    os.environ.get('LORIS_DATA_DIR',
                   '/Users/zahra/git/loris_ai_data_challenge/data/'),
    '')

# Topic id (as it appears in the data files, i.e. a string) -> topic name.
topic_dict = {'1': 'Ordinary Life', '2': 'School Life', '3': 'Culture_Education',
              '4': 'Attitude_Emotion', '5': 'Relationship', '6': 'Tourism', '7': 'Health',
              '8': 'Work', '9': 'Politics', '10': 'Finance'}

# Dialogue-act id (string) -> dialogue-act name.
action_dict = {'1': 'inform', '2': 'question', '3': 'directive', '4': 'commissive'}

In [9]:
def convert_topic(topic):
    """Translate a topic id into its name.

    Args:
        topic: A key of the module-level ``topic_dict``.

    Returns:
        The topic name stored under ``topic``.

    Raises:
        KeyError: If ``topic`` is not a key of ``topic_dict``.
    """
    topic_name = topic_dict[topic]
    return topic_name

In [10]:
def convert_action(act):
    """Translate a dialogue-act id into its name.

    Args:
        act: A key of the module-level ``action_dict``.

    Returns:
        The dialogue-act name stored under ``act``.

    Raises:
        KeyError: If ``act`` is not a key of ``action_dict``.
    """
    action_name = action_dict[act]
    return action_name

In [26]:
def read_glove_vecs(glove_file):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are parsed as the components of its vector.

    Args:
        glove_file: Path of the UTF-8 encoded embedding file.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each token to an integer index, starting at 1
        and following the sorted order of the tokens; ``index_to_words`` is the
        inverse mapping; ``word_to_vec_map`` maps each token to a float64
        NumPy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank / whitespace-only lines: there is no token to index.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The file is closed at this point; the index is built from the dict keys,
    # which are exactly the distinct tokens that were read.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


In [27]:
# Load the embeddings once for the whole notebook. The path is relative to the
# notebook's working directory; judging by the file name these are 50-dimensional
# vectors. create_features reads the module-level `word_to_vec_map` defined here.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B/glove.6B.50d.txt')

In [11]:
def load_conversations(category='train'):
    """Load the dialogues of one data split together with their annotations.

    Three line-aligned files are read from ``<input_directory>/<category>/``:
    ``dialogues_<category>.txt``, ``dialogues_emotion_<category>.txt`` and
    ``dialogues_act_<category>.txt``. Line *i* of each file describes the same
    dialogue.

    Args:
        category: Name of the split; used both as sub-directory name and as
            file-name suffix.

    Returns:
        A list with one dict per dialogue, holding the keys ``'utterances'``
        (the dialogue line split on ``'__eou__'``, without the trailing
        remainder), ``'emotions'`` and ``'actions'`` (whitespace-separated
        labels, kept as strings). Dialogues whose three lists differ in length
        are dropped.

    Raises:
        OSError: If one of the three files cannot be opened.
    """
    # Join the components separately so the result does not depend on
    # input_directory ending with a path separator.
    split_dir = os.path.join(input_directory, category)
    dial_path = os.path.join(split_dir, 'dialogues_' + category + '.txt')
    emo_path = os.path.join(split_dir, 'dialogues_emotion_' + category + '.txt')
    act_path = os.path.join(split_dir, 'dialogues_act_' + category + '.txt')

    conversations_list = []
    # The context managers close all three files even if parsing fails; the
    # explicit encoding keeps decoding independent of the platform default.
    with open(dial_path, 'r', encoding='utf-8') as in_dial, \
            open(emo_path, 'r', encoding='utf-8') as in_emo, \
            open(act_path, 'r', encoding='utf-8') as in_act:
        for dialogue, emotion, action in zip(in_dial, in_emo, in_act):
            # The text after the last '__eou__' marker is the line remainder
            # (normally just the newline), hence the [:-1].
            utterances = dialogue.split('__eou__')[:-1]
            emotions = emotion.split()
            actions = action.split()

            # Keep only dialogues where every utterance has both labels.
            if len(utterances) == len(emotions) == len(actions):
                conversations_list.append({
                    'utterances': utterances,
                    'emotions': emotions,
                    'actions': actions
                })

    return conversations_list

In [12]:
def find_sentiment(conversation_dict):
    """Annotate one conversation with a TextBlob polarity per utterance.

    The polarities are stored, in utterance order, under the new key
    ``'blob_emotions'`` of ``conversation_dict``; the dict is modified in
    place and nothing is returned.

    Args:
        conversation_dict: Dict with an ``'utterances'`` entry holding the
            utterance strings of one conversation.
    """
    conversation_dict['blob_emotions'] = [
        TextBlob(utterance).sentiment.polarity
        for utterance in conversation_dict['utterances']
    ]
    

In [13]:
def find_sentiments_for_all(conversations_list):
    """Run ``find_sentiment`` on every conversation of the list, in order.

    Args:
        conversations_list: Iterable of conversation dicts; each one is passed
            to ``find_sentiment`` unchanged.
    """
    for conversation in conversations_list:
        find_sentiment(conversation)

In [14]:
def create_samples(conversations_list):
    """Flatten conversations into one sample per non-opening utterance.

    The first utterance of a conversation has no predecessor and is skipped.
    Every later utterance yields a dict with the utterance text, the
    ``'blob_emotions'`` value of the preceding utterance and the difference
    between its own value and that preceding value.

    Args:
        conversations_list: Iterable of dicts with ``'utterances'`` and
            ``'blob_emotions'`` entries.

    Returns:
        A list of dicts with the keys ``'utterance'``, ``'prev_emotion'`` and
        ``'change_in_emotion'``.
    """
    samples = []
    for conversation in conversations_list:
        for position, text in enumerate(conversation['utterances']):
            if position > 0:
                polarities = conversation['blob_emotions']
                earlier = polarities[position - 1]
                samples.append({
                    'utterance': text,
                    'prev_emotion': earlier,
                    'change_in_emotion': polarities[position] - earlier,
                })
    return samples
    

In [55]:
def sentence_to_avg(sentence, word_to_vec_map):
    """Average the word vectors of a sentence.

    Every character that is neither a word character nor whitespace is
    replaced by a space, the sentence is split on whitespace and lower-cased,
    and the vectors of the resulting words are averaged. Words missing from
    ``word_to_vec_map`` use the vector stored under ``'unk'``.

    The dimensionality of the result is taken from the vectors themselves, so
    embeddings of any size can be used.

    Args:
        sentence: The raw sentence text.
        word_to_vec_map: Mapping from word to its vector (array-like of floats).

    Returns:
        A float64 NumPy array with the mean vector, or ``None`` when the
        sentence contains no words after punctuation removal.

    Raises:
        KeyError: If a word is unknown and the mapping has no ``'unk'`` entry.
    """
    sentence = re.sub(r'[^\w\s]', ' ', sentence.strip())
    words = [w.lower() for w in sentence.split()]

    if not words:
        # Nothing to embed; callers are expected to skip such sentences.
        return None

    total = None
    for w in words:
        vec = word_to_vec_map[w] if w in word_to_vec_map else word_to_vec_map['unk']
        if total is None:
            # Copy the first vector so the accumulator has the right shape and
            # the caller's embedding is never modified.
            total = np.array(vec, dtype=np.float64)
        else:
            total = total + vec

    return total / len(words)

In [34]:
def fit_svm_model(x_train, y_train, kernel='rbf', C=1e3, gamma=0.1):
    """Fit a support vector regressor (``svm.SVR``) on the training data.

    Args:
        x_train: Training feature vectors, passed to ``SVR.fit`` unchanged.
        y_train: Training targets, passed to ``SVR.fit`` unchanged.
        kernel: Kernel name forwarded to ``svm.SVR``.
        C: Regularisation parameter forwarded to ``svm.SVR``.
        gamma: Kernel coefficient forwarded to ``svm.SVR``.

    Returns:
        The fitted ``svm.SVR`` instance.
    """
    clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
    clf.fit(x_train, y_train)
    return clf

In [56]:
def create_features(samples_list):
    """Turn samples into parallel lists of feature vectors and targets.

    Each sample's ``'utterance'`` is embedded with ``sentence_to_avg`` using
    the module-level ``word_to_vec_map``. Samples for which no embedding is
    produced (``None``) are left out of both lists.

    Args:
        samples_list: Iterable of dicts with ``'utterance'`` and
            ``'change_in_emotion'`` entries.

    Returns:
        A tuple ``(x, y)``: the list of embeddings and the list of the
        matching ``'change_in_emotion'`` values.
    """
    features, targets = [], []
    for record in samples_list:
        embedding = sentence_to_avg(record['utterance'], word_to_vec_map)
        if embedding is not None:
            features.append(embedding)
            targets.append(record['change_in_emotion'])

    return features, targets

In [18]:
def calculate_eval_metrics(model, x, y):
    """Print the model's score on ``(x, y)`` and return its predictions.

    The printed value is whatever ``model.score`` returns, so it is labelled
    as a score rather than an error.
    NOTE(review): for scikit-learn regressors such as the ``svm.SVR`` built by
    ``fit_svm_model`` this should be R^2 (higher is better) - confirm against
    the scikit-learn documentation.

    No classification report is produced: the name ``classification_report``
    is not imported in this notebook, and the model trained here is a
    regressor.

    Args:
        model: Fitted estimator exposing ``score(x, y)`` and ``predict(x)``.
        x: Feature vectors to evaluate on.
        y: True targets matching ``x``.

    Returns:
        The output of ``model.predict(x)``.
    """
    score = model.score(x, y)
    predictions = model.predict(x)
    print('Score: {0} \n'.format(score))

    return predictions

In [20]:
# Load each split and annotate it with per-utterance TextBlob polarities.
# find_sentiments_for_all mutates the conversation dicts in place (it adds the
# 'blob_emotions' key), so every load is immediately followed by its annotation.
train_conversations = load_conversations(category='train')
find_sentiments_for_all(train_conversations)

# Held-out test split.
test_conversations = load_conversations(category='test')
find_sentiments_for_all(test_conversations)

# Validation split.
validation_conversations = load_conversations(category='validation')
find_sentiments_for_all(validation_conversations)

In [57]:
# Build the training set: one sample per non-opening utterance, then one averaged
# word-vector (x) and one change-in-polarity target (y) per sample. Samples whose
# utterance yields no embedding are dropped by create_features.
train_samples = create_samples(train_conversations)
x_train, y_train = create_features(train_samples)

   


In [59]:
# Sanity check: features and targets must have the same number of entries.
print(len(x_train), len(y_train))

76051 76051


In [None]:
# Train the support vector regressor on the full training set.
# NOTE(review): this cell has no execution count, so it may never have finished;
# kernel SVR training can be slow on this many samples - consider timing it on a
# subsample first.
model = fit_svm_model(x_train, y_train)