<a href="https://colab.research.google.com/github/YiyunLei/NLP-Final-Project/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Load datasets from google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import json
import jieba
import re
import codecs
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

In [5]:
# Load cleand and simplified data
dataset_dir = "/content/drive/MyDrive/NLP_Final_Project"
def load_data():
  json_files = [dataset_dir + "/datasets/simplify_json/train_split.json", dataset_dir + "/datasets/simplify_json/valid_split.json"]
  data_files = []
  for json_file in json_files:
      with open(json_file, 'r') as f:
          data_files.append(json.load(f))

  return data_files

In [6]:
# covert big-5 ['Openness', 'Conscientiousness', 'Extraversion','Agreeableness', 'Neuroticism'] to 0:low 1:high
def one_hot_encode_big5(input_data):
  for i in range(len(input_data)):
    for j in range(len(input_data[i])):
        input_data[i][j]['big-5'] = [0 if input_data[i][j]['Openness'] == 'low' else 1,
                                     0 if input_data[i][j]['Conscientiousness'] == 'low' else 1,
                                     0 if input_data[i][j]['Extraversion'] == 'low' else 1,
                                     0 if input_data[i][j]['Agreeableness'] == 'low' else 1,
                                     0 if input_data[i][j]['Neuroticism'] == 'low' else 1]
        del input_data[i][j]['Openness']
        del input_data[i][j]['Conscientiousness']
        del input_data[i][j]['Extraversion']
        del input_data[i][j]['Agreeableness']
        del input_data[i][j]['Neuroticism']
    converted_data = input_data
  return converted_data

In [7]:
# Get (X_train y_train), (X_valid, y_valid) from converted data
def get_X_y(converted_data):
  X_train = []   # name, Utterance
  y_train = []   # big-5
  X_valid = []    # name, Utterance
  y_valid = []    # big-5
  speakers_train, speakers_valid = [],[]
  for i in range(len(converted_data[0])):
    X_train.append((converted_data[0][i]['Speaker'], converted_data[0][i]['Utterance']))
    y_train.append(converted_data[0][i]['big-5'])
    speakers_train.append(converted_data[0][i]['Speaker'])
  speakers_train = list(set(speakers_train))
  print("There are {} speakers in train_data" .format(len(speakers_train)))

  for i in range(len(converted_data[1])):
    X_valid.append((converted_data[1][i]['Speaker'], converted_data[1][i]['Utterance']))
    y_valid.append(converted_data[1][i]['big-5'])
    speakers_valid.append(converted_data[1][i]['Speaker'])
  speakers_valid = list(set(speakers_valid))
  print("There are {} speakers in valid_data" .format(len(speakers_valid)))
  return X_train, y_train, X_valid, y_valid, speakers_train, speakers_valid

In [8]:
# combine same speaker data to one data
def combine_same_speaker_data(speakers, X, y):
    # combine same speaker data to one data and only keep different Utterance and one speaker name
    X_combine = []
    y_combine = []
    for speaker in speakers:
        X_combine.append({speaker:[]})
        y_combine.append({speaker:[]})

    for i in range(len(X)):
        for j in range(len(X_combine)):
            if X[i][0] in X_combine[j]:
                X_combine[j][X[i][0]].append(X[i][1])
                y_combine[j][X[i][0]].append(y[i])

    for i in range(len(y_combine)):
        for key in y_combine[i]:
            y_combine[i][key] = y_combine[i][key][0]
    return X_combine, y_combine

In [9]:
data_files = load_data()
data_files[0][0]
converted_data = one_hot_encode_big5(data_files)
converted_data[0][0]

{'Speaker': '童文洁', 'Utterance': '真巧', 'big-5': [0, 1, 1, 0, 1]}

In [10]:
X_train, y_train, X_valid, y_valid, speakers_train, speakers_valid = get_X_y(converted_data)

There are 220 speakers in train_data
There are 33 speakers in valid_data


In [None]:
speakers_valid

In [None]:
X_valid

In [None]:
X_train_combined, y_train_combined = combine_same_speaker_data(speakers_train, X_train, y_train)
X_valid_combined, y_valid_combined = combine_same_speaker_data(speakers_valid, X_valid, y_valid)
X_train_combined[0]

In [None]:
y_valid_combined[0]

{'叶珊': [0, 1, 0, 0, 1]}

In [13]:
# Use jieba tokenizer
def tokenize_data(speaker, X):
  X_train_tokenized = X
  for speaker in X_train_tokenized:
    for key in speaker:
      for i in range(len(speaker[key])):
          seg_list = jieba.cut(speaker[key][i], cut_all=False)
          speaker[key][i] = " ".join(seg_list)
          if i > 0:
            speaker[key][0] = speaker[key][0] + " " +speaker[key][i]
    for key in speaker:
        for j in reversed(range(1, len(speaker[key]))):
            # delete other key's value
            del speaker[key][j]
    for key in speaker:
        # Remove non-alphanumeric characters (except underscores) and convert to lowercase
        speaker[key][0] = re.sub(r'[^\w\s_]', '', speaker[key][0])
        # Replace multiple whitespace characters with a single space
        speaker[key][0] = re.sub(r'\s+', ' ', speaker[key][0])
  return X_train_tokenized

In [None]:
X_train_tokenized = tokenize_data(speakers_train, X_train_combined)
X_valid_tokenized = tokenize_data(speakers_valid, X_valid_combined)
X_train_tokenized

In [None]:
print(type(X_train_tokenized))

<class 'list'>


In [15]:
# Remove speaker from X and get the value of y(O of big-5)
def clean_X_y(X_tokenized, y):
  X_cleaned = []
  y_cleaned = []
  for line in X_tokenized:
    for speaker in line:
          X_cleaned.append(line[speaker][0])
  for line in y:
    for speaker in line:
          y_cleaned.append(line[speaker][0])
            
  return X_cleaned, y_cleaned

In [None]:
X_train_cleaned, y_train_cleaned = clean_X_y(X_train_tokenized, y_train_combined)
X_valid_cleaned, y_valid_cleaned = clean_X_y(X_valid_tokenized, y_valid_combined)
y_train_cleaned

In [18]:
# load stopword
def load_stopwords(stopwords_file):
  stopwords = []
  with codecs.open(stopwords_file, 'r', encoding='utf-8', errors='ignore') as fp:
      stopwords = fp.read().split('\n')
  return stopwords

In [None]:
stopwords_file     = dataset_dir + "/stopwords-master/cn_stopwords.txt"
stop_words = load_stopwords(stopwords_file)
stop_words

In [20]:
vectorizer = CountVectorizer(stop_words=stop_words)
X_features_train = vectorizer.fit_transform(X_train_cleaned)



In [21]:
lr_classifier = LogisticRegression(solver='liblinear')
lr_classifier.fit(X_features_train, y_train_cleaned)

In [24]:
words_df = pd.DataFrame(X_features_train.toarray(),
                        columns=vectorizer.get_feature_names_out())
words_df

Unnamed: 0,00,001,007,04,05,06,07,08,09,10,...,龙井,龙凤胎,龙柯来,龙潭虎穴,龙王庙,龙舌兰,龙虾,龙门,龙颜大悦,龟派
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
def most_informative_features(vectorizer, classifier, n=20):
    # Adapted from https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers#11116960
    feature_names       = vectorizer.get_feature_names_out()
    coefs_with_features = sorted(zip(classifier.coef_[0], feature_names))
    top                 = zip(coefs_with_features[:n], coefs_with_features[:-(n + 1):-1])
    for (coef_1, feature_1), (coef_2, feature_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, feature_1, coef_2, feature_2))

In [26]:
# Show which features have the highest-value logistic regression coefficients
print("Most informative features")
most_informative_features(vectorizer, lr_classifier, 10)

Most informative features
	-0.2808	一起             		0.2723	问题             
	-0.2692	起诉             		0.2406	尊重             
	-0.2672	不了             		0.2402	每天             
	-0.2647	摩托车            		0.2305	手机             
	-0.2607	总裁             		0.2234	不能             
	-0.2558	记不住            		0.2125	这人             
	-0.2528	学生             		0.1912	真的             
	-0.2491	张盛             		0.1882	看来             
	-0.2457	房子             		0.1877	喜欢             
	-0.2252	回去             		0.1855	开心             


In [27]:
X_valid_features =  vectorizer.transform(X_valid_cleaned)

In [33]:
print("Classifying test data")
predicted_labels = lr_classifier.predict(X_valid_features)
print('Accuracy  = {}'.format(metrics.accuracy_score(predicted_labels,  y_valid_cleaned)))
for label in [0, 1]:
    print('Precision for label {} = {}'.format(label, metrics.precision_score(predicted_labels, y_valid_cleaned, pos_label=label)))
    print('Recall    for label {} = {}'.format(label, metrics.recall_score(predicted_labels,    y_valid_cleaned, pos_label=label)))


Classifying test data
Accuracy  = 0.45454545454545453
Precision for label 0 = 0.3333333333333333
Recall    for label 0 = 0.6363636363636364
Precision for label 1 = 0.6666666666666666
Recall    for label 1 = 0.36363636363636365
