In [None]:
# Load datasets from google drive
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import json
import jieba
import re
import codecs
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
# Load cleand and simplified data
dataset_dir = "/content/drive/MyDrive/NLP_Final_Project/"
def load_data():
  json_files = [dataset_dir + "datasets/simplify_json/train_split.json", 
                dataset_dir + "datasets/simplify_json/valid_split.json", 
                dataset_dir + "datasets/simplify_json/test_split.json"]
  data_files = []
  for json_file in json_files:
      with open(json_file, 'r') as f:
          data_files.append(json.load(f))

  return data_files

In [4]:
# covert big-5 ['Openness', 'Conscientiousness', 'Extraversion','Agreeableness', 'Neuroticism'] to 0:low 1:high
def one_hot_encode_big5(input_data):
  for i in range(len(input_data)):
    for j in range(len(input_data[i])):
        input_data[i][j]['big-5'] = [0 if input_data[i][j]['Openness'] == 'low' else 1,
                                     0 if input_data[i][j]['Conscientiousness'] == 'low' else 1,
                                     0 if input_data[i][j]['Extraversion'] == 'low' else 1,
                                     0 if input_data[i][j]['Agreeableness'] == 'low' else 1,
                                     0 if input_data[i][j]['Neuroticism'] == 'low' else 1]
        del input_data[i][j]['Openness']
        del input_data[i][j]['Conscientiousness']
        del input_data[i][j]['Extraversion']
        del input_data[i][j]['Agreeableness']
        del input_data[i][j]['Neuroticism']
    converted_data = input_data
  return converted_data

In [5]:
# Get (X_train y_train), (X_valid, y_valid) from converted data
def get_X_y(converted_data):
  X_train = []   # name, Utterance
  y_train = []   # big-5
  X_valid = []    # name, Utterance
  y_valid = []    # big-5
  X_test = []   # name, Utterance
  y_test = []   # big-5
  speakers_train, speakers_valid, speakers_test= [],[],[]

  for i in range(len(converted_data[0])):
    X_train.append((converted_data[0][i]['Speaker'], converted_data[0][i]['Utterance']))
    y_train.append(converted_data[0][i]['big-5'])
    speakers_train.append(converted_data[0][i]['Speaker'])
  speakers_train = list(set(speakers_train))
  print("There are {} speakers in train_data" .format(len(speakers_train)))

  for i in range(len(converted_data[1])):
    X_valid.append((converted_data[1][i]['Speaker'], converted_data[1][i]['Utterance']))
    y_valid.append(converted_data[1][i]['big-5'])
    speakers_valid.append(converted_data[1][i]['Speaker'])
  speakers_valid = list(set(speakers_valid))
  print("There are {} speakers in valid_data" .format(len(speakers_valid)))

  for i in range(len(converted_data[2])):
    X_test.append((converted_data[2][i]['Speaker'], converted_data[2][i]['Utterance']))
    y_test.append(converted_data[2][i]['big-5'])
    speakers_test.append(converted_data[2][i]['Speaker'])
  speakers_test = list(set(speakers_test))
  print("There are {} speakers in test_data" .format(len(speakers_test)))

  return X_train, y_train, X_valid, y_valid, X_test, y_test, speakers_train, speakers_valid, speakers_test

In [6]:
# combine same speaker data to one data and only keep different Utterance and one speaker name
def combine_same_speaker_data(speakers, X, y):  
    X_combine = []
    y_combine = []
    for speaker in speakers:
        X_combine.append({speaker:[]})
        y_combine.append({speaker:[]})

    for i in range(len(X)):
        for j in range(len(X_combine)):
            if X[i][0] in X_combine[j]:
                X_combine[j][X[i][0]].append(X[i][1])
                y_combine[j][X[i][0]].append(y[i])

    for i in range(len(y_combine)):
        for key in y_combine[i]:
            y_combine[i][key] = y_combine[i][key][0]

    
    return X_combine, y_combine

In [7]:
data_files = load_data()
data_files[0][0]
converted_data = one_hot_encode_big5(data_files)
converted_data[0][0]

{'Speaker': '童文洁', 'Utterance': '真巧', 'big-5': [0, 1, 1, 0, 1]}

In [8]:
X_train, y_train, X_valid, y_valid, X_test, y_test, speakers_train, speakers_valid, speakers_test = get_X_y(converted_data)

There are 220 speakers in train_data
There are 33 speakers in valid_data
There are 32 speakers in test_data


In [9]:
X_train_combined, y_train_combined = combine_same_speaker_data(speakers_train, X_train, y_train)
X_valid_combined, y_valid_combined = combine_same_speaker_data(speakers_valid, X_valid, y_valid)
X_test_combined, y_test_combined = combine_same_speaker_data(speakers_test, X_test, y_test)

In [10]:
# Use jieba tokenizer
def tokenize_data(speaker, X):
  X_train_tokenized = X
  for speaker in X_train_tokenized:
    for key in speaker:
      for i in range(len(speaker[key])):
          seg_list = jieba.cut(speaker[key][i], cut_all=False)
          speaker[key][i] = " ".join(seg_list)
          if i > 0:
            speaker[key][0] = speaker[key][0] + " " +speaker[key][i]
    for key in speaker:
        for j in reversed(range(1, len(speaker[key]))):
            # delete other key's value
            del speaker[key][j]
    for key in speaker:
        # Remove non-alphanumeric characters (except underscores) and convert to lowercase
        speaker[key][0] = re.sub(r'[^\w\s_]', '', speaker[key][0])
        # Replace multiple whitespace characters with a single space
        speaker[key][0] = re.sub(r'\r\s+', ' ', speaker[key][0])
  return X_train_tokenized

In [11]:
X_train_tokenized = tokenize_data(speakers_train, X_train_combined)
X_valid_tokenized = tokenize_data(speakers_valid, X_valid_combined)
X_test_tokenized = tokenize_data(speakers_test, X_test_combined)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\E48D~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.532 seconds.
Prefix dict has been built successfully.


In [12]:
# Remove speaker from X and get the value of y (O of big-5)
def clean_X_y(X_tokenized, y):
  X_cleaned = []
  y_cleaned = []
  for line in X_tokenized:
    for speaker in line:
          X_cleaned.append(line[speaker][0])
  for line in y:
    for speaker in line:
      # Change i in (line[speaker][i]) to get different y in OCEAN: {0:O, 1:C, 2:E, 3:A, 4:N }
      y_cleaned.append(line[speaker])
            
  return X_cleaned, pd.DataFrame(y_cleaned, columns=['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism'])

In [13]:
X_train_cleaned, y_train_cleaned = clean_X_y(X_train_tokenized, y_train_combined)
X_valid_cleaned, y_valid_cleaned = clean_X_y(X_valid_tokenized, y_valid_combined)
X_test_cleaned, y_test_cleaned = clean_X_y(X_test_tokenized, y_test_combined)

In [14]:
y_train_cleaned = y_train_cleaned[['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']]
y_train_cleaned

Unnamed: 0,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism
0,0,1,0,0,0
1,1,1,0,1,0
2,0,0,1,0,1
3,1,1,0,1,1
4,0,1,0,0,0
...,...,...,...,...,...
215,1,1,0,0,1
216,0,1,0,0,1
217,0,0,1,0,1
218,1,1,0,1,0


In [15]:
# load stopword
def load_stopwords(stopwords_file):
  stopwords = []
  with codecs.open(stopwords_file, 'r', encoding='utf-8', errors='ignore') as fp:
      stopwords = fp.read().split('\r\n')
  return stopwords

In [16]:
stopwords_file     = dataset_dir + "stopwords-master/cn_stopwords.txt"
stop_words = load_stopwords(stopwords_file)

In [17]:
vectorizer = CountVectorizer(stop_words=stop_words)
X_features_train = vectorizer.fit_transform(X_train_cleaned)



In [18]:
from keras import Sequential, optimizers
from keras.layers import Dense, Dropout, LSTM, Embedding, SimpleRNN, Flatten, GRU
from keras.utils import pad_sequences
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [19]:
def binary_label(y):
  for i in range(len(y)):
    for j in range(5):
      if y[i][j] < 0.5:
        y[i][j] = 0
      else:
        y[i][j] = 1
  return y

def labels_acc(y_true, y_pred):
  y_true = y_true.numpy()
  y_pred = y_pred.numpy()
  true_labels = np.argmax(y_true, axis=1)
  pred_labels = np.round(y_pred)
  accuracy = np.mean(np.all(np.equal(true_labels, pred_labels), axis=1))
  return accuracy

In [25]:
X_train = []
for i in range(len(X_train_cleaned)):
    sentence = X_train_cleaned[i]
    lst = sentence.split(' ')
    seq = [word for word in lst if word not in stop_words]
    X_train.append(' '.join(seq))
X_train = pd.DataFrame(X_train, columns=['words'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['words'])
sequences = tokenizer.texts_to_sequences(X_train['words'])
max_len = 128
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X_test = []
for i in range(len(X_test_cleaned)):
    sentence = X_test_cleaned[i]
    lst = sentence.split(' ')
    seq = [word for word in lst if word not in stop_words]
    X_test.append(' '.join(seq))
X_test = pd.DataFrame(X_test, columns=['words'])

tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(X_test['words'])
sequences_test = tokenizer_test.texts_to_sequences(X_test['words'])
max_len = 128
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_len, padding='post', truncating='post')

In [29]:
def ANN(X, y):
  model = Sequential()

  model.add(Dense(256, input_dim=padded_sequences.shape[1]))
  model.add(Dropout(0.5))
  model.add(Dense(256, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(90, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(5, activation='sigmoid'))
  
  model.compile(loss='binary_crossentropy', optimizer = optimizers.Adam(), metrics=['accuracy'])
  print(model.summary())

  model.fit(X, y, epochs=50, batch_size=5)
  return model

In [30]:
model = ANN(padded_sequences, np.array(y_train_cleaned))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 256)               65792     
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_9 (Dense)             (None, 256)               65792     
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 90)                23130     
                                                                 
 dropout_8 (Dropout)         (None, 90)                0         
                                                                 
 dense_11 (Dense)            (None, 5)                

In [31]:
print("Classifying test data")
predicted_labels = binary_label(model.predict(padded_sequences_test))
y_test_cleaned = np.array(y_test_cleaned)
accuracy_all = np.sum(predicted_labels == y_test_cleaned)/(len(predicted_labels)*len(predicted_labels[0]))
print('Accuracy in all = {}'.format(accuracy_all))
accuracy_person = []
for i in range(len(predicted_labels)):
    acc = np.sum(predicted_labels[i] == y_test_cleaned[i])
    accuracy_person.append(acc)
accuracy_per_person = pd.DataFrame.from_dict(Counter(accuracy_person), orient='index', columns=['count'])
accuracy_per_person.sort_index(inplace=True)
print('Accuracy per person')
print(accuracy_per_person)
print('Precision for macro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='macro')))
print('Recall    for macro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='macro')))
print('Precision for micro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='micro')))
print('Recall    for micro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='micro')))

Classifying test data
Accuracy in all = 0.68125
Accuracy per person
   count
1      1
2      6
3     11
4      7
5      7
Precision for macro-label = 0.8
Recall    for macro-label = 0.575
Precision for micro-label = 0.8598130841121495
Recall    for micro-label = 0.71875


  _warn_prf(average, modifier, msg_start, len(result))


In [220]:
# LSTM
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(5, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.fit(padded_sequences, np.array(y_train_cleaned), epochs=50, batch_size=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x289afca0e50>

In [222]:
print("Classifying test data")
predicted_labels = binary_label(model.predict(padded_sequences_test))
y_test_cleaned = np.array(y_test_cleaned)
accuracy_all = np.sum(predicted_labels == y_test_cleaned)/(len(predicted_labels)*len(predicted_labels[0]))
print('Accuracy in all = {}'.format(accuracy_all))
accuracy_person = []
for i in range(len(predicted_labels)):
    acc = np.sum(predicted_labels[i] == y_test_cleaned[i])
    accuracy_person.append(acc)
accuracy_per_person = pd.DataFrame.from_dict(Counter(accuracy_person), orient='index', columns=['count'])
accuracy_per_person.sort_index(inplace=True)
print('Accuracy per person')
print(accuracy_per_person)
print('Precision for macro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='macro')))
print('Recall    for macro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='macro')))
print('Precision for micro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='micro')))
print('Recall    for micro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='micro')))

Classifying test data
Accuracy in all = 0.68125
Accuracy per person
   count
1      2
2      6
3     10
4      5
5      9
Precision for macro-label = 0.7461172161172162
Recall    for macro-label = 0.7322271755827156
Precision for micro-label = 0.7757009345794392
Recall    for micro-label = 0.7545454545454545


In [249]:
# GRU
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64))
model.add(GRU(32))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(5, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.fit(padded_sequences, np.array(y_train_cleaned), epochs=50, batch_size=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x289bcb31c60>

In [250]:
print("Classifying test data")
predicted_labels = binary_label(model.predict(padded_sequences_test))
y_test_cleaned = np.array(y_test_cleaned)
accuracy_all = np.sum(predicted_labels == y_test_cleaned)/(len(predicted_labels)*len(predicted_labels[0]))
print('Accuracy in all = {}'.format(accuracy_all))
accuracy_person = []
for i in range(len(predicted_labels)):
    acc = np.sum(predicted_labels[i] == y_test_cleaned[i])
    accuracy_person.append(acc)
accuracy_per_person = pd.DataFrame.from_dict(Counter(accuracy_person), orient='index', columns=['count'])
accuracy_per_person.sort_index(inplace=True)
print('Accuracy per person')
print(accuracy_per_person)
print('Precision for macro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='macro')))
print('Recall    for macro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='macro')))
print('Precision for micro-label = {}'.format(metrics.precision_score(predicted_labels, y_test_cleaned, average='micro')))
print('Recall    for micro-label = {}'.format(metrics.recall_score(predicted_labels,    y_test_cleaned, average='micro')))

Classifying test data
Accuracy in all = 0.66875
Accuracy per person
   count
1      2
2      4
3     11
4     11
5      4
Precision for macro-label = 0.7449670329670328
Recall    for macro-label = 0.7153912753912753
Precision for micro-label = 0.7757009345794392
Recall    for micro-label = 0.7410714285714286
