In [11]:
# Extract per-utterance features from the train/valid/test TSV files and save them as CSV.

In [12]:
from pprint import pprint
from features.cosine_similarity import cosine_similarity
from features.content_features import *
from features.user_features import *
from features.structural_features import *
from features.sentiment_features import *
# from data_helper import *
import nltk
nltk.download('vader_lexicon')
import pandas as pd
 

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\wangs\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# Input TSV files for this experiment.
# Complete data set:
data_file = '../data/Experiment/Data1.tsv'
# Individual splits:
test_file = '../data/Experiment/Test1.tsv'
valid_file = '../data/Experiment/Valid1.tsv'
train_file = '../data/Experiment/Train1.tsv'

In [14]:
def init_tf_idf_dict(idf_file):
    """Load a term -> IDF mapping from a tab-separated file.

    The first line of the file is treated as a header and skipped. Each
    remaining non-blank line is split on tabs; the second column is used as
    the term and the third column is parsed as its floating-point IDF value.
    If a term occurs more than once, the last occurrence wins.

    Args:
        idf_file: Path of the UTF-8 encoded TSV file to read.

    Returns:
        dict mapping each term (str) to its IDF (float). An empty or
        header-only file yields an empty dict.

    Raises:
        IndexError: If a non-blank line has fewer than three columns.
        ValueError: If the third column of a line is not a valid float.
    """
    term_to_idf_dict = {}

    with open(idf_file, encoding='utf-8') as fin:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(fin, None)
        for line in fin:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # columns and would otherwise fail on tokens[1].
            if not line.strip():
                continue
            tokens = line.split('\t')
            # float() ignores surrounding whitespace, including the newline.
            term_to_idf_dict[tokens[1]] = float(tokens[2])

    return term_to_idf_dict

In [15]:
# IDF table: file location and the term -> idf mapping loaded from it.
idf_file = '../data/idf.tsv'
term_to_idf_dict = init_tf_idf_dict(idf_file)

# Positive / negative word lists and the lexicons loaded from them.
pos_file = '../data/positive-words.txt'
neg_file = '../data/negative-words.txt'
pos_dict, neg_dict = load_sentiment_lexicon(pos_file, neg_file)

# Destinations for the extracted feature tables, one per split.
train_feat_file = '../data/Experiment/train_features.csv'
valid_feat_file = '../data/Experiment/valid_features.csv'
test_feat_file = '../data/Experiment/test_features.csv'

# Numeric label id -> label tag.
labelTable = {
    1: 'OQ', 2: 'RQ', 3: 'CQ', 4: 'FD', 5: 'FQ', 6: 'IR',
    7: 'PA', 8: 'PF', 9: 'NF', 10: 'GG', 11: 'JK', 12: 'O',
}


In [16]:
def extract_features(in_file):
    """Read a tab-separated dialog file and build one feature string per utterance.

    The first line of ``in_file`` is skipped as a header. From every
    remaining non-blank line the columns at index 1 to 4 are read as the
    utterance text, the dialog id, the label and a speaker marker (compared
    against ``'User'`` to derive the ``is_starter`` flag). All utterances of
    the file are passed together to the content, structural and sentiment
    feature helpers, so positions are relative to the whole file.

    Relies on the module-level ``term_to_idf_dict``, ``pos_dict`` and
    ``neg_dict``.

    Args:
        in_file: Path of the TSV file to read (decoded as cp437).

    Returns:
        A pandas DataFrame with one row per utterance and the columns
        ``utterances``, ``dialogIDs`` and ``label_features``. Each
        ``label_features`` value is the label, a tab, the space-separated
        feature values and a trailing newline.

    Raises:
        IndexError: If a non-blank line has fewer than five columns.
    """
    utterances, dialogIDs, labels, uoas = [], [], [], []
    with open(in_file, encoding='cp437') as fin:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration here.
        next(fin, None)
        for line in fin:
            # Whitespace-only lines carry no columns and are ignored.
            if not line.strip():
                continue
            tokens = line.strip().split('\t')
            utterances.append(tokens[1])
            dialogIDs.append(tokens[2])
            labels.append(tokens[3])
            uoas.append(tokens[4])

    # Content based features. The first similarity returned by
    # cosine_similarity is not part of the output and is discarded.
    _, init_sim, thread_sim = cosine_similarity("", utterances, term_to_idf_dict)
    qm = question_mark(utterances)
    dup = duplicate(utterances)
    wh = W5H1(utterances)

    # Structural features: 1-based position in the file and that position
    # divided by the total number of utterances.
    total = len(utterances)
    abs_pos = list(range(1, total + 1))
    norm_pos = [pos / total for pos in abs_pos]
    length, unique_length, unique_stemmed_length = post_length(utterances)

    # User features.
    # NOTE: an affiliation-based user_auth feature was commented out here and
    # is not part of the output.
    is_starter = [1 if uoa == 'User' else 0 for uoa in uoas]

    # Sentiment based features.
    thx = thank(utterances)
    exclam_mark = exclamation_mark(utterances)
    vf = ve_feedback(utterances)
    ss = sentiment_scores(utterances)
    lexicon_counts = lexicon(utterances, pos_dict, neg_dict)

    # One placeholder per value passed to format() below (17 in total).
    row_template = '{}\t{:.4f} {:.4f} {} {} {} {} {:.4f} {} {} {} {} {} {} {} {} {}\n'
    label_features = [
        row_template.format(
            labels[i],
            init_sim[i],
            thread_sim[i],
            qm[i],
            dup[i],
            ' '.join(wh[i]),
            abs_pos[i],
            norm_pos[i],
            length[i],
            unique_length[i],
            unique_stemmed_length[i],
            is_starter[i],
            thx[i],
            exclam_mark[i],
            vf[i],
            ' '.join(ss[i]),
            ' '.join(lexicon_counts[i]),
        )
        for i in range(total)
    ]

    return pd.DataFrame({
        'utterances': utterances,
        'dialogIDs': dialogIDs,
        'label_features': label_features,
    })


In [17]:
# Build the feature table for every split; the files are processed in the
# order test, train, valid.
Test, Train, Valid = map(extract_features, (test_file, train_file, valid_file))

In [18]:
# Write the test-split features with the pandas CSV defaults (comma
# separator, index column and header row included).
# Alternative layout kept for reference:
#   Test.to_csv(test_feat_file, sep='\t', index=False, header=False)
Test.to_csv(path_or_buf=test_feat_file)

In [19]:
# Write the train-split features with the pandas CSV defaults (comma
# separator, index column and header row included).
Train.to_csv(path_or_buf=train_feat_file)

In [20]:
# Write the validation-split features with the pandas CSV defaults (comma
# separator, index column and header row included).
Valid.to_csv(path_or_buf=valid_feat_file)