In [1]:
HOME_DIR = "/home_remote"
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
#import logisitic_regression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

### Data preparation

In [4]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)

In [2]:
def construct_liwc_input(df, label):
  """
  params: df - The positive/negative dataframe loaded from pickle
    The df is expected to has these columns "Title", "Date", "Text", "SubjectId"
  params: label - The label need to be assigned to result dataframe

  returns: A dataframe contains "SubjectId", "AverageLength", "Text", "NumOfWritings", "Title"
  """
  subject_id_list = df.loc[:, "TrainSubjectId"].unique()
  df["Token"] = df["Text"].apply(lambda x: word_tokenize(x))

  df['text'] = df['Text']+ df['Title']

  grouped_by_subject_id = df.groupby('TrainSubjectId')

  # calculate average token length for each user
  average_length_df = grouped_by_subject_id['Token'].apply(lambda token_series: sum(len(token) for token in token_series) / len(token_series)).reset_index()
  average_length_df.rename(columns={'Token': 'AverageLength'}, inplace=True)
  #print(average_length_df.head())

  # join all writings of single user into single corpus
  joined_text_df = grouped_by_subject_id['text'].apply(' '.join).reset_index()

  # calculate number of writings for each user
  number_of_writings_df = grouped_by_subject_id['Text'].apply(lambda x: len(x)).reset_index()
  number_of_writings_df.rename(columns={'Text': 'NumOfWritings'}, inplace=True)

  result_df = average_length_df.merge(joined_text_df, on="TrainSubjectId")
  result_df = result_df.merge(number_of_writings_df, on="TrainSubjectId")
  result_df["Label"] = label

  return result_df

In [5]:
input_positives = construct_liwc_input(positives, 1)
input_negatives = construct_liwc_input(negatives, 0)

In [6]:
liwc_input = pd.concat([input_positives, input_negatives])
liwc_input = liwc_input.sample(frac=1).reset_index(drop=True)

In [7]:
data_input = liwc_input[['TrainSubjectId', 'Label', 'text']]

### Feature Extraction

In [8]:
# Load LIWC dictionary
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(word_tokenize(text)) for text in data_input['text']]

In [9]:
# Load LIWC-alike dictionary
%run /home_remote/master_thesis/model_evaluation/liwc_alike.py
liwc_alike_output = [main(text, result) for text in data_input['text']]

In [None]:
#extract top 15 features based on percentage. 
def get_features_top15(df, output):
    #df['vector'] = output
    vector_df = pd.DataFrame(output, index=df.index)
    vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)
    vector_df_norm['Label'] = df['Label']
    vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df_norm = vector_df_norm.fillna(0)
    corr = vector_df_norm.corr()
    corr_label = corr['Label'].sort_values(ascending=False)
    relevant_features = corr_label[1:16]
    relevant_features_name = relevant_features.index.values
    X = vector_df_norm[relevant_features_name]
    y = vector_df_norm['Label']
    return X, y

In [18]:
a = pd.DataFrame(liwc_alike_output, index=data_input.index)
b = a.div(a.sum(axis=1), axis=0)
b = b.fillna(0)

In [19]:
b.iloc[0,:]

auxverb                 0.050534
Cognitive Processes     0.023810
Insight                 0.032718
Inclusive               0.050858
Work                    0.050373
Perceptual Processes    0.018626
Assent                  0.009718
Pronouns                0.092485
ipron                   0.047133
Causation               0.028993
Past tense              0.011014
Seeing                  0.089893
Feeling                 0.007937
Articles                0.043408
Certainty               0.013929
Ppronouns               0.047943
I                       0.015225
Prepositions            0.045999
Tentative               0.024943
Exclusive               0.011824
Affective Processes     0.001782
Negative Emotions       0.005507
Achievement             0.006479
Social Processes        0.028993
Friends                 0.002915
Time                    0.001782
Relativity              0.006641
Motion                  0.012958
Present tense           0.051344
Discrepancy             0.010042
Negations 

In [75]:
#save pca model
pca = PCA(n_components=15)
pca.fit(b)
joblib.dump(pca, os.path.join(HOME_DIR,'pca.pkl'))

['/home_remote/pca.pkl']

In [74]:
joblib.dump(test_model, os.path.join(HOME_DIR,'liwc_alike_pca.pkl'))

['/home_remote/liwc_alike_pca.pkl']

In [79]:
c = pca.transform(b)

In [81]:
y_pred = test_model.predict(c)

In [82]:
#precision, recall, fscore
print(classification_report(data_input['Label'], y_pred))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91       403
           1       0.55      0.84      0.67        83

    accuracy                           0.86       486
   macro avg       0.76      0.85      0.79       486
weighted avg       0.89      0.86      0.87       486



In [98]:
test_model.predict_proba(c[4].reshape(1, -1))[0][1]

0.8597251824180162

### LSTM

In [99]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Example text data (replace this with your actual text data)
texts = data_input['text'].tolist()
# Create a tokenizer and fit it on your text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Get the vocabulary size
vocabulary_size = len(tokenizer.word_index) + 1  # Add 1 for the special padding token if used

print("Vocabulary Size:", vocabulary_size)

Vocabulary Size: 165172


In [101]:
X_train, X_test, y_train, y_test = train_test_split(c, data_input['Label'], test_size=0.2, random_state=13)

In [106]:
#lstm 
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

# Define the model
model2 = Sequential()

# Embedding layer
model2.add(Embedding(input_dim = 15 , output_dim =128, sequential_length=15))

# LSTM layer with ReLU activation and dropout
model2.add(LSTM(128, activation='relu', dropout=0.3, recurrent_dropout=0.3))

# Fully connected layer with ReLU activation
model2.add(Dense(128, activation='relu', kernel_regularizer='l2', bias_regularizer='l2'))

# Dropout layer
model2.add(Dropout(0.3))

# Output layer with softmax activation for binary classification
model2.add(Dense(1, activation='sigmoid'))

# Compile the model
model2.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-4), metrics=['accuracy'])

# Learning rate decay scheduler
def lr_scheduler(epoch, lr):
    return lr * np.exp(-1e-5 * epoch)

lr_callback = LearningRateScheduler(lr_scheduler)

# Train the model
model2.fit(X_train, y_train, epochs=130, batch_size=100, callbacks=[lr_callback], class_weight={0: 0.2, 1: 0.8})

2023-11-13 23:21:53.172728: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 243.0KiB (rounded to 248832)requested by op RandomUniform
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-11-13 23:21:53.172827: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2023-11-13 23:21:53.172868: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 4, Chunks in use: 3. 1.0KiB allocated for chunks. 768B in use in bin. 16B client-requested in use in bin.
2023-11-13 23:21:53.172894: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-11-13 23:21:53.172916: I tensorflow/core/common_

ResourceExhaustedError: OOM when allocating tensor with shape[486,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]