## 构造问答对（QA对）的数据，以及它们对应的情绪，保存到csv文件中
## Construct QA pairs, along with their corresponding emotions, and save them in CSV files

In [1]:
import re, random, time
import numpy as np
from sklearn.model_selection import train_test_split  # 数据集的划分
import pandas as pd
import pickle
import os, sys
from collections import Counter, defaultdict
import glob
import python_speech_features as ps
import wave
import jieba
import keras as kr

Using TensorFlow backend.


In [32]:
def create_QA_data(emotion_dir, dialog_dir):
    # 读取情绪标签的文件
    df_e = pd.read_csv(emotion_dir, sep="\t", names=['a','ses','emotion','b'])
    df_e = df_e[1:]
    df_e = df_e[df_e['ses'].str.contains('Ses')]
    df_e.index = range(len(df_e))
    df_e = df_e.drop(['a', 'b'], axis=1)
    
    # 读取对话文本文件
    df = pd.read_csv(dialog_dir, sep="\t", names=['utterance'])
    df['gender'] = [0] * len(df)
    df['ses'] = [0] * len(df)
    # 去除txt文件中的坏数据
    df = df[df['utterance'].str.contains('Ses')]
    df = df[df['utterance'].str[17] != 'X']
    df.index = range(len(df))
    
    for i in range(len(df)):
        df['gender'][i] = df['utterance'][i][15]
        df['ses'][i] = df['utterance'][i][:19]
        df['utterance'][i] = df['utterance'][i][41:]
    df = df[['ses','gender','utterance']]
    
    q_list = []
    a_list = []
    re_list = []
    qe_list = []
    for i in range(len(df) - 1):
        # 分割对话为一个个问答对（如果前后两句话是同一个人说的则不算）
        if(df['gender'][i] != df['gender'][i+1]):
            re = df_e[df_e['ses'] == df['ses'][i+1]]['emotion'].item()
            qe = df_e[df_e['ses'] == df['ses'][i]]['emotion'].item()
            # 对于answer情绪为xxx的，直接丢掉,question的情绪可以是xxx，因为question的情绪用不到，而answer情绪是input
            if re == 'xxx':
                continue
            # 将excited算为happy
            if re == 'exc':
                re = 'hap'
            # 将excited算为happy
            if qe == 'exc':
                qe = 'hap'
            q_list.append(df['utterance'][i])
            a_list.append(df['utterance'][i+1])
            re_list.append(re)
            qe_list.append(qe)
    qa_dict = {'Q': q_list,'A': a_list,'QE': qe_list,'AE': re_list} # QE: question的emotion; AE: answer的emotion
    qa_df = pd.DataFrame(data = qa_dict)
    qa_df = qa_df[['Q','A','QE','AE']]
    
    return qa_df

In [33]:
# the position of IEMOCAP data
rootdir = r"E:\\Chrome_dl\\IEMOCAP\\IEMOCAP_full_release"
flag = True
for fileName in os.listdir(rootdir):
    if fileName[0] == 'S':
        sub_dir = rootdir + '\\' + fileName + '\\dialog/transcriptions'
        emoevl = rootdir + '\\' + fileName + '\\dialog/EmoEvaluation'
        for sess in os.listdir(sub_dir):
            emotdir = emoevl + '\\' + sess
            dialogdir = sub_dir + '\\' + sess
            qa_df = create_QA_data(emotdir, dialogdir)
            # 第一次写入文件
            if flag:
                qa_df.to_csv('qa_data_all.csv', index=None)
                flag = False
            else:
                qa_df.to_csv('qa_data_all.csv', mode='a', header=False, index=None)
            print(fileName + ' - ' + sess)
#             print(emotdir)
#             print(dialogdir)
#             print('########################')
print('done!')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Session1 - Ses01F_impro01.txt
Session1 - Ses01F_impro02.txt
Session1 - Ses01F_impro03.txt
Session1 - Ses01F_impro04.txt
Session1 - Ses01F_impro05.txt
Session1 - Ses01F_impro06.txt
Session1 - Ses01F_impro07.txt
Session1 - Ses01F_script01_1.txt
Session1 - Ses01F_script01_2.txt
Session1 - Ses01F_script01_3.txt
Session1 - Ses01F_script02_1.txt
Session1 - Ses01F_script02_2.txt
Session1 - Ses01F_script03_1.txt
Session1 - Ses01F_script03_2.txt
Session1 - Ses01M_impro01.txt
Session1 - Ses01M_impro02.txt
Session1 - Ses01M_impro03.txt
Session1 - Ses01M_impro04.txt
Session1 - Ses01M_impro05.txt
Session1 - Ses01M_impro06.txt
Session1 - Ses01M_impro07.txt
Session1 - Ses01M_script01_1.txt
Session1 - Ses01M_script01_2.txt
Session1 - Ses01M_script01_3.txt
Session1 - Ses01M_script02_1.txt
Session1 - Ses01M_script02_2.txt
Session1 - Ses01M_script03_1.txt
Session1 - Ses01M_script03_2.txt
Session2 - Ses02F_impro01.txt
Session2 - Ses02F_impro02.txt
Session2 - Ses02F_impro03.txt
Session2 - Ses02F_impro04.tx