In [263]:
HOME_DIR = "/home_remote"
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams

In [264]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)

In [265]:
def construct_liwc_input(df, label):
  """
  params: df - The positive/negative dataframe loaded from pickle
    The df is expected to has these columns "Title", "Date", "Text", "SubjectId"
  params: label - The label need to be assigned to result dataframe

  returns: A dataframe contains "SubjectId", "AverageLength", "Text", "NumOfWritings", "Title"
  """
  subject_id_list = df.loc[:, "TrainSubjectId"].unique()
  df["Token"] = df["Text"].apply(lambda x: word_tokenize(x))

  df['text'] = df['Text']+ df['Title']

  grouped_by_subject_id = df.groupby('TrainSubjectId')

  # calculate average token length for each user
  average_length_df = grouped_by_subject_id['Token'].apply(lambda token_series: sum(len(token) for token in token_series) / len(token_series)).reset_index()
  average_length_df.rename(columns={'Token': 'AverageLength'}, inplace=True)
  #print(average_length_df.head())

  # join all writings of single user into single corpus
  joined_text_df = grouped_by_subject_id['text'].apply(' '.join).reset_index()

  # calculate number of writings for each user
  number_of_writings_df = grouped_by_subject_id['Text'].apply(lambda x: len(x)).reset_index()
  number_of_writings_df.rename(columns={'Text': 'NumOfWritings'}, inplace=True)

  result_df = average_length_df.merge(joined_text_df, on="TrainSubjectId")
  result_df = result_df.merge(number_of_writings_df, on="TrainSubjectId")
  result_df["Label"] = label

  return result_df

In [266]:
# construct liwc input for positive and negative dataframe
input_positives = construct_liwc_input(positives, 1)
input_negatives = construct_liwc_input(negatives, 0)

In [267]:
#concatenate two dataframe and shuffle
liwc_input = pd.concat([input_positives, input_negatives])
liwc_input = liwc_input.sample(frac=1).reset_index(drop=True)

In [268]:
liwc_input

Unnamed: 0,TrainSubjectId,AverageLength,text,NumOfWritings,Label
0,train_subject6828,20.122711,Have you seen Bloodline? That first season is...,1092,0
1,train_subject8603,21.882812,I met KC Green about 5 years ago and asked ...,256,0
2,train_subject5173,28.108974,Thanks for sharing and I can imagine that! Es...,156,0
3,train_subject1637,32.000000,Like sliggoo? Common guys it's not fat green ...,12,1
4,train_subject2006,67.378981,I'm in enthusiastic agreement with you O...,314,0
...,...,...,...,...,...
481,train_subject634,35.316327,He got 2nd place in Evo 2012 too :( Gamerbee...,98,0
482,train_subject5276,21.626087,also i need a name for the disputed territory...,230,0
483,train_subject138,32.602005,The boozing starts from 7am. Though large a...,1995,0
484,train_subject3364,37.688525,Cat. Thank you for watching out for the...,122,1


### LIWC Features

In [269]:
# Load LIWC dictionary
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(word_tokenize(text)) for text in liwc_input['text']]

In [271]:
# Add AverageLength and NumOfWritings to the vector
def add_to_counter(counter, key, value):
    counter[key] = value
    return counter

# Get features
def get_features(df, output):
    df['vector'] = output
    average_length = df['AverageLength']
    num_of_writings = df['NumOfWritings']
    vector = df['vector']
    for i in range(len(vector)):
        vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
        vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])
    df['vector_added'] = vector
    vector_df = pd.DataFrame(df['vector_added'].tolist(), index=df.index)
    vector_df_norm = (vector_df - vector_df.min()) / (vector_df.max() - vector_df.min())
    vector_df_norm['Label'] = df['Label']
    vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df_norm = vector_df_norm.fillna(0)
    corr = vector_df_norm.corr()
    corr_label = corr['Label'].sort_values(ascending=False)
    relevant_features = corr_label[1:25]
    relevant_features_name = relevant_features.index.values
    X = vector_df_norm[relevant_features_name]
    y = vector_df_norm['Label']
    return X, y


In [272]:
X_liwc, y_liwc = get_features(liwc_input, op)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])


### LIWC-alike

In [273]:
%run /home_remote/master_thesis/model_evaluation/liwc_alike.py

In [274]:
liwc_alike_output = [main(text, result) for text in liwc_input['text']]


In [275]:
X_liwc_alike, y_liwc_alike = get_features(liwc_input, liwc_alike_output)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])


## Models

In [277]:
def logistic_regression(X, y):

    w = [1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6,2**7, 2**8]
    weight = [{0: 1/(1+x),  1: x/(1+x)} for x in w]
    C = [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6]
    # define grid search
    hyperparam_grid = {"class_weight": weight
                    ,"penalty": ["l1", "l2"]
                    ,"C": C
                    ,"fit_intercept": [True, False]  }
    # define evaluation procedure
    cv = KFold(n_splits=10, shuffle=True, random_state=13)
    # define grid search
    model_test = LogisticRegression(solver='liblinear')
    grid = GridSearchCV(estimator=model_test, param_grid=hyperparam_grid, cv=cv, scoring='roc_auc')
    grid_result = grid.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    #build a model with the best parameters, fix class_weight = (0.2, 0.8)

    model = LogisticRegression(**grid_result.best_params_)
    model.fit(X, y)
    return model