In [263]:
HOME_DIR = "/home_remote"
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams
from sklearn.ensemble import RandomForestClassifier

In [49]:
relevant_features_name_without_Length ={'liwc': ['i', 'friend', 'sad', 'family', 'feel', 'health',
       'sexual', 'anx', 'body', 'bio', 'ppron', 'filler', 'shehe', 'adverb',
       'swear', 'humans', 'excl', 'assent', 'discrep', 'you', 'pronoun',
       'negemo', 'past'],
                        'liwc_alike': ['Anxiety', 'I', 'Sadness', 'Affective Processes',
       'Sexuality', 'Family', 'Friends', 'Fillers', 'Health', 'Feeling',
       'Humans', 'Biological Processes', 'Time', 'Body', 'Negative Emotions',
       'Social Processes', 'Perceptual Processes', 'Insight',
       'Cognitive Processes', 'Motion', 'Positive Emotions', 'Tentative',
       'Ppronouns']}

In [264]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)

In [265]:
def construct_liwc_input(df, label):
  """
  params: df - The positive/negative dataframe loaded from pickle
    The df is expected to has these columns "Title", "Date", "Text", "SubjectId"
  params: label - The label need to be assigned to result dataframe

  returns: A dataframe contains "SubjectId", "AverageLength", "Text", "NumOfWritings", "Title"
  """
  subject_id_list = df.loc[:, "TrainSubjectId"].unique()
  df["Token"] = df["Text"].apply(lambda x: word_tokenize(x))

  df['text'] = df['Text']+ df['Title']

  grouped_by_subject_id = df.groupby('TrainSubjectId')

  # calculate average token length for each user
  average_length_df = grouped_by_subject_id['Token'].apply(lambda token_series: sum(len(token) for token in token_series) / len(token_series)).reset_index()
  average_length_df.rename(columns={'Token': 'AverageLength'}, inplace=True)
  #print(average_length_df.head())

  # join all writings of single user into single corpus
  joined_text_df = grouped_by_subject_id['text'].apply(' '.join).reset_index()

  # calculate number of writings for each user
  number_of_writings_df = grouped_by_subject_id['Text'].apply(lambda x: len(x)).reset_index()
  number_of_writings_df.rename(columns={'Text': 'NumOfWritings'}, inplace=True)

  result_df = average_length_df.merge(joined_text_df, on="TrainSubjectId")
  result_df = result_df.merge(number_of_writings_df, on="TrainSubjectId")
  result_df["Label"] = label

  return result_df

In [266]:
# construct liwc input for positive and negative dataframe
input_positives = construct_liwc_input(positives, 1)
input_negatives = construct_liwc_input(negatives, 0)

In [267]:
#concatenate two dataframe and shuffle
liwc_input = pd.concat([input_positives, input_negatives])
liwc_input = liwc_input.sample(frac=1).reset_index(drop=True)

In [268]:
liwc_input

Unnamed: 0,TrainSubjectId,AverageLength,text,NumOfWritings,Label
0,train_subject6828,20.122711,Have you seen Bloodline? That first season is...,1092,0
1,train_subject8603,21.882812,I met KC Green about 5 years ago and asked ...,256,0
2,train_subject5173,28.108974,Thanks for sharing and I can imagine that! Es...,156,0
3,train_subject1637,32.000000,Like sliggoo? Common guys it's not fat green ...,12,1
4,train_subject2006,67.378981,I'm in enthusiastic agreement with you O...,314,0
...,...,...,...,...,...
481,train_subject634,35.316327,He got 2nd place in Evo 2012 too :( Gamerbee...,98,0
482,train_subject5276,21.626087,also i need a name for the disputed territory...,230,0
483,train_subject138,32.602005,The boozing starts from 7am. Though large a...,1995,0
484,train_subject3364,37.688525,Cat. Thank you for watching out for the...,122,1


### LIWC Features

In [269]:
# Load LIWC dictionary
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(word_tokenize(text)) for text in liwc_input['text']]

In [271]:
# Add AverageLength and NumOfWritings to the vector
def add_to_counter(counter, key, value):
    counter[key] = value
    return counter

# Get features
def get_features(df, output):
    df['vector'] = output
    average_length = df['AverageLength']
    num_of_writings = df['NumOfWritings']
    vector = df['vector']
    for i in range(len(vector)):
        vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
        vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])
    df['vector_added'] = vector
    vector_df = pd.DataFrame(df['vector_added'].tolist(), index=df.index)
    vector_df_norm = (vector_df - vector_df.min()) / (vector_df.max() - vector_df.min())
    vector_df_norm['Label'] = df['Label']
    vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df_norm = vector_df_norm.fillna(0)
    corr = vector_df_norm.corr()
    corr_label = corr['Label'].sort_values(ascending=False)
    relevant_features = corr_label[1:25]
    relevant_features_name = relevant_features.index.values
    X = vector_df_norm[relevant_features_name]
    y = vector_df_norm['Label']
    return X, y


In [272]:
X_liwc, y_liwc = get_features(liwc_input, op)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])


### LIWC-alike

In [273]:
%run /home_remote/master_thesis/model_evaluation/liwc_alike.py

In [274]:
liwc_alike_output = [main(text, result) for text in liwc_input['text']]


In [275]:
X_liwc_alike, y_liwc_alike = get_features(liwc_input, liwc_alike_output)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "AverageLength", average_length[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vector[i] = add_to_counter(vector[i], "NumOfWritings", num_of_writings[i])


## Models

In [369]:
def logistic_regression(X, y):

    w = [1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6,2**7, 2**8]
    weight = [{0: 1/(1+x),  1: x/(1+x)} for x in w]
    C = [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6]
    # define grid search
    hyperparam_grid = {"class_weight": weight
                    ,"C": C
                    ,"fit_intercept": [True, False]  }
    # define evaluation procedure
    cv = KFold(n_splits=5, shuffle=True, random_state=13)
    # define grid search
    model_test = LogisticRegression(solver='liblinear', penalty='l1')
    grid = GridSearchCV(estimator=model_test, param_grid=hyperparam_grid, cv=cv, scoring='f1')
    grid_result = grid.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    #build a model with the best parameters, fix class_weight = (0.2, 0.8)

    model = LogisticRegression(**grid_result.best_params_)
    model.fit(X, y)
    return model

In [371]:
mod = logistic_regression(X_liwc, y_liwc)
#cross_validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(mod, X_liwc, y_liwc, cv=5, scoring='f1')
print(scores)

Best: 0.334819 using {'C': 16, 'class_weight': {0: 0.2, 1: 0.8}, 'fit_intercept': True}
[0.25       0.07692308 0.         0.4375     0.28571429]


In [372]:
def random_forest(X, y):

    w = [1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6,2**7, 2**8]
    weight = [{0: 1/(1+x),  1: x/(1+x)} for x in w]
    #C = [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6]
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # define grid search
    hyperparam_grid = {"class_weight": weight,
                        'n_estimators': n_estimators,
                        'max_features': max_features,
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'min_samples_leaf': min_samples_leaf,
                        'bootstrap': bootstrap}
    # define evaluation procedure
    cv = KFold(n_splits=5, shuffle=True, random_state=13)
    # define grid search
    model_test = RandomForestClassifier()
    grid = GridSearchCV(estimator=model_test, param_grid=hyperparam_grid, cv=cv, scoring='f1')
    grid_result = grid.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    #build a model with the best parameters, fix class_weight = (0.2, 0.8)

    model = RandomForestClassifier(**grid_result.best_params_)
    model.fit(X, y)
    return model

In [50]:
#get extract
def extract_feature(df, output, type):
    # Assign the 'output' to a new column 'vector' in the DataFrame 'df'.
    df['vector'] = output
    
    # Create a new DataFrame 'vector_df' containing the 'vector' values, with the same index as 'df'.
    vector_df = pd.DataFrame(df['vector'].tolist(), index=df.index)
    
    # Normalize the values in 'vector_df' by dividing each row by the sum of its values.
    vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)

    # Assign 'Label' and 'TrainSubjectId' columns from 'df' to 'vector_df_norm'.
    vector_df_norm['Label'] = df['Label']
    vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']

    # Fill any NaN (Not-a-Number) values with 0.
    vector_df_norm = vector_df_norm.fillna(0)

    # Create feature matrix 'X' from the relevant features based on 'type'.
    X = vector_df_norm[relevant_features_name_without_Length[type]]

    # Create the target variable 'y' from the 'Label' column in 'vector_df_norm'.
    y = vector_df_norm['Label']

    # Return the feature matrix 'X' and the target variable 'y'.
    return X, y


In [51]:
data_input = liwc_input[['TrainSubjectId', 'Label', 'text']]

In [52]:
X_liwc_alike2, y_liwc_alike2 = extract_feature(data_input, liwc_alike_output, 'liwc_alike')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['vector'] = output


In [53]:
X_liwc_alike2

Unnamed: 0,Anxiety,I,Sadness,Affective Processes,Sexuality,Family,Friends,Fillers,Health,Feeling,...,Body,Negative Emotions,Social Processes,Perceptual Processes,Insight,Cognitive Processes,Motion,Positive Emotions,Tentative,Ppronouns
0,0.000520,0.010825,0.000396,0.001982,0.001164,0.000991,0.003691,0.000074,0.001858,0.006218,...,0.002799,0.004261,0.022022,0.012411,0.017266,0.014046,0.011395,0.016300,0.019347,0.062376
1,0.000449,0.018402,0.000449,0.002020,0.000449,0.013689,0.005162,0.000000,0.002469,0.008079,...,0.003815,0.003366,0.042415,0.006732,0.006732,0.004264,0.010772,0.007630,0.010996,0.069569
2,0.000192,0.011690,0.000958,0.003833,0.001342,0.001342,0.002491,0.000000,0.001725,0.004791,...,0.001342,0.007666,0.022614,0.017440,0.017440,0.011882,0.014565,0.016290,0.023956,0.056535
3,0.000622,0.006970,0.000373,0.001494,0.000747,0.000747,0.001245,0.000249,0.001743,0.007344,...,0.002365,0.003236,0.048419,0.011949,0.017924,0.013443,0.012945,0.017924,0.023152,0.063231
4,0.000000,0.008136,0.000740,0.000740,0.000000,0.001479,0.001479,0.000000,0.000740,0.005917,...,0.000740,0.008876,0.025148,0.009615,0.011095,0.005178,0.015533,0.017012,0.019970,0.055473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,0.000536,0.005893,0.000134,0.001875,0.000134,0.000670,0.001339,0.000000,0.001071,0.003348,...,0.001741,0.003616,0.022097,0.015669,0.025847,0.020892,0.017544,0.013928,0.028659,0.053703
482,0.000233,0.003502,0.000233,0.001109,0.000642,0.001692,0.001692,0.000000,0.002510,0.003794,...,0.000642,0.004085,0.028247,0.013832,0.022265,0.017479,0.013190,0.007996,0.025008,0.062360
483,0.000911,0.011577,0.000375,0.002090,0.001608,0.001769,0.001608,0.000000,0.001179,0.008200,...,0.001233,0.003752,0.029800,0.017472,0.023636,0.016722,0.013935,0.017472,0.031568,0.057026
484,0.000446,0.001338,0.000892,0.001338,0.001189,0.001041,0.000297,0.000000,0.002676,0.007434,...,0.001933,0.007880,0.020220,0.016057,0.015611,0.011002,0.018733,0.009515,0.019923,0.023937
