In [None]:
def construct_liwc_input(df, label):
  """
  Build one row per subject summarising all of that subject's writings.

  params: df - The positive/negative dataframe loaded from pickle.
    Only the "Text" and "SubjectId" columns are read here; any other
    columns (e.g. "Title", "Date") are ignored.
    NOTE: a "Token" column holding the tokenized "Text" is written onto
    `df` itself, so the caller's dataframe is modified in place.
  params: label - The label assigned to every row of the result dataframe

  returns: A dataframe with the columns "SubjectId", "AverageLength",
    "Text", "NumOfWritings" and "Label", one row per distinct "SubjectId":
      - "AverageLength": mean number of tokens per writing of the subject
      - "Text": all writings of the subject joined with a single space
      - "NumOfWritings": number of rows (writings) of the subject
      - "Label": the `label` argument
  """
  # Kept as an in-place column assignment so that callers that read
  # df["Token"] after this call keep working.
  df["Token"] = df["Text"].apply(word_tokenize)

  # A single named aggregation replaces three separate groupby passes that
  # were previously merged back together on "SubjectId".
  result_df = (
      df.groupby("SubjectId")
      .agg(
          # average token count over the subject's writings
          AverageLength=(
              "Token",
              lambda token_series: sum(len(tokens) for tokens in token_series)
              / len(token_series),
          ),
          # all writings of the subject as one space-separated corpus
          Text=("Text", " ".join),
          # "size" counts every row of the group, like len() on the group
          NumOfWritings=("Text", "size"),
      )
      .reset_index()
  )
  result_df["Label"] = label

  return result_df