# NLP Mafia Project (Logistic Regression)

Here, we implement a Logistic Regression model using Scikit-Learn that serves as a Baseline model for the NLP Mafia project.

In [1]:
# Cloning into the repository to obtain files
!git clone https://bitbucket.org/bopjesvla/thesis.git

Cloning into 'thesis'...
remote: Counting objects: 220, done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 220 (delta 19), reused 0 (delta 0)[K
Receiving objects: 100% (220/220), 734.99 MiB | 16.22 MiB/s, done.
Resolving deltas: 100% (96/96), done.
Checking out files: 100% (58/58), done.


In [2]:
!cp thesis/src/* .

cp: -r not specified; omitting directory 'thesis/src/output'


## Import the required modules

In [17]:
# Install dependency
!pip install googledrivedownloader



In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from google_drive_downloader import GoogleDriveDownloader as gdd

In [19]:
# Download the pickle file from the Google Drive link and store it on Colab
gdd.download_file_from_google_drive(file_id = '1-6uBfJVCAUnPua4zmu_wf3eKWEbc97Zu',
                                    dest_path = './thesis/src/24h_of_deception_full_features.pkl',
                                    unzip = False,
                                    showsize = True)

# Read in the data from the pkl file and reset the indices
# in order to get rid of any indices that might be empty
docs = pd.read_pickle('./thesis/src/24h_of_deception_full_features.pkl')
docs = docs.reset_index()
docs

Unnamed: 0,index,author,game_id,post_day,content,inserted_at,updated_at,scum,slot_id,event,words,wc,punc,or_ratio,neg_em_ratio,anger_ratio,sensory_ratio,cog_ratio,insight_ratio,motion_ratio,tent_ratio,not_ratio,fpp_ratio,spp_ratio,tpp_ratio,quant_ratio,sentence_length,but_ratio,token_length,unique_tokens,message_count,message_length,time,wc24h,msg24h,vector_FastText_fasttext-wiki-news-subwords-300,vector_GloVe_glove-wiki-gigaword-200
0,1,My Milked Eek,17378,1,VOTE: lobstermania /!@ Let's pick up the pace ...,2011-05-03 15:51:00,2011-05-03 19:25:00,False,7736,endgamed Day 3,"[vote, lobstermania, let, s, pick, up, the, pa...",171,"[:, /!@, ', ., :, -, "", ', "", ?, -, (, ), ?, :...",0.005848,0.005848,0.000000,0.017544,0.081871,0.029240,0.000000,0.058480,0.029240,0.029240,0.087719,0.011696,0.023392,0.152047,0.000000,4.491228,0.684211,6,28.500000,1.148611,148.875453,5.223700,"[0.00014883353, 0.014454731, 0.013349839, 0.00...","[0.18914586, 0.255794, -0.09664429, -0.1206634..."
1,2,My Milked Eek,17378,2,\nIt's called leaving some wiggling room.\n\n\...,2011-05-04 16:03:00,2011-05-04 16:03:00,False,7736,endgamed Day 3,"[it, s, called, leaving, some, wiggling, room,...",98,"[', ., ', ', ,, ., ., ,, ', ', ., /, ,, @, ., ...",0.010204,0.000000,0.000000,0.020408,0.091837,0.061224,0.000000,0.040816,0.040816,0.030612,0.020408,0.040816,0.020408,0.061224,0.000000,4.000000,0.775510,1,98.000000,1.000000,98.000000,1.000000,"[0.0034755077, 0.0059258644, 0.018331366, -0.0...","[0.15384091, 0.16181515, -0.103125125, -0.1256..."
2,3,My Milked Eek,17378,3,\nI do not see him accusing don and myself of ...,2011-05-05 16:15:00,2011-05-05 19:40:00,False,7736,endgamed Day 3,"[i, do, not, see, him, accusing, don, and, mys...",261,"["", ""., -, ., ,, ', -, ., ,, ., ,, ., :, ,, ',...",0.007663,0.015326,0.003831,0.011494,0.053640,0.011494,0.011494,0.049808,0.045977,0.034483,0.007663,0.053640,0.019157,0.103448,0.007663,4.034483,0.582375,3,87.000000,1.142361,228.474164,2.626140,"[0.0026843122, 0.014308665, 0.018803054, 0.007...","[0.14101559, 0.19325556, -0.06109542, -0.15930..."
3,4,My Milked Eek,17378,4,\nI'd think that someone of your experience sh...,2011-05-06 17:54:00,2011-05-07 09:58:00,False,7736,endgamed Day 3,"[i, d, think, that, someone, of, your, experie...",1110,"[', ., /!@, /, ?, ., ., ., ., /!@, :, ', ., .,...",0.004505,0.005405,0.001802,0.016216,0.083784,0.024324,0.005405,0.018018,0.039640,0.057658,0.023423,0.029730,0.009009,0.080180,0.009910,3.910811,0.344144,9,123.333333,1.669444,664.891847,5.391015,"[2.2882961e-05, 0.007481162, 0.01877277, 0.009...","[0.13120885, 0.23477675, -0.09023106, -0.17432..."
4,5,My Milked Eek,17378,5,A reason to a vote != a case behind a vote. No...,2011-05-07 12:52:00,2011-05-08 08:34:00,False,7736,endgamed Day 3,"[a, reason, to, a, vote, a, case, behind, a, v...",229,"[!=, ., -, ., ., ., ., ., ., ., ., ., ., ', ,,...",0.004367,0.008734,0.000000,0.008734,0.091703,0.048035,0.017467,0.013100,0.034934,0.034934,0.017467,0.043668,0.004367,0.126638,0.004367,3.921397,0.554585,3,76.333333,1.820833,125.766590,1.647597,"[0.0036013017, 0.018087918, 0.01853675, 0.0086...","[0.12107126, 0.20229767, -0.032822255, -0.2123..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112518,166137,pieguyn,64645,4,i think there's about a 0% GR was shot. no of...,2016-02-16 08:59:00,2016-02-16 22:07:00,False,3233,survived,"[i, think, there, s, about, a, 0, gr, was, sho...",198,"[', %, ., ', ', (, -, ', ', )., ', ,, /, ., ',...",0.010101,0.040404,0.020202,0.015152,0.126263,0.025253,0.000000,0.030303,0.055556,0.035354,0.030303,0.010101,0.000000,0.070707,0.010101,3.545455,0.560606,4,49.500000,1.547222,127.971275,2.585278,"[0.009871641, -0.0055486066, 0.030474463, 0.02...","[0.16473468, 0.2107324, -0.17985842, -0.087528..."
112519,166138,pieguyn,64645,6,vote: Golden Robster (L-1). /!@ fire needs to ...,2016-02-18 04:27:00,2016-02-18 20:11:00,False,3233,survived,"[vote, golden, robster, l, 1, fire, needs, to,...",96,"[:, (, -, )., /!@, /!@, ., ', ., ,, ', ', ,, -...",0.000000,0.000000,0.000000,0.010417,0.072917,0.020833,0.000000,0.031250,0.010417,0.052083,0.000000,0.031250,0.000000,0.072917,0.000000,3.895833,0.770833,4,24.000000,1.655556,57.986577,2.416107,"[-0.00030726963, 0.004243611, 0.031180447, 0.0...","[0.19487478, 0.17722246, -0.12213767, -0.14003..."
112520,166139,pieguyn,64645,7,my issue with the Zulfy interactions is more t...,2016-02-19 11:20:00,2016-02-19 11:20:00,False,3233,survived,"[my, issue, with, the, zulfy, interactions, is...",176,"[-, ., ,, ,, ,, ,, ,, ., "", "", ., ,, ,, ', ., ...",0.000000,0.017045,0.000000,0.039773,0.113636,0.051136,0.005682,0.022727,0.034091,0.034091,0.000000,0.068182,0.034091,0.034091,0.000000,4.034091,0.613636,1,176.000000,1.000000,176.000000,1.000000,"[-0.0051517687, 0.002170852, 0.014490488, 0.00...","[0.1343826, 0.11695806, -0.16665061, -0.216180..."
112521,166140,Political Clout,64645,0,\n\nIt looks like scum started killing off the...,2016-02-12 01:29:00,2016-02-12 18:22:00,False,3235,survived,"[it, looks, like, scum, started, killing, off,...",171,"[., -, ., ', ., @, ?, ', ., @, ., ., ', /, ., ...",0.000000,0.011696,0.005848,0.035088,0.087719,0.029240,0.017544,0.040936,0.017544,0.058480,0.005848,0.017544,0.029240,0.081871,0.011696,3.982456,0.695906,3,57.000000,1.703472,100.383204,1.761109,"[-0.0021163446, 0.0034471503, 0.013574303, 0.0...","[0.13204151, 0.2174588, -0.11830967, -0.187989..."


## Instantiate Model

Here, we instantiate a Logistic Regression model that will serve as our baseline and specify the number of splits of the dataset that will be used for the Cross Validation. A 20-fold Cross Validation will be used here. Also, the two classes (Mafia & Not Mafia) are re-weighted to account for the class imbalance.

In [0]:
# Instantiate a model and specify the number of splits of the
# dataset that we will be using. In this case, we will be doing 
# a 20-fold Cross Validation on the dataset. Since

model = LogisticRegression(class_weight = {0: 0.25, 1: 0.75}, max_iter = 300)

kf = KFold(n_splits = 20)

## Comparison of various models

### 1. Hand picked features + FastText

The hand picked features that we will be using here in addition to the FastText Wiki vector are the following:

- Messages / 24 Hours
- 3rd Person Pronoun Ratio
- 2nd Person Pronoun Ratio
- Sentence Length

Of the above 4 features, Sentence Length and 3rd Person Pronoun Ratio have been found to match the effect of detecting deception based on the meta-analysis done by Hauch et al. (2015). Also, it has been seen in Ruiter and Kachergis (2018) that the message frequency is negatively correslated with deception.




In [0]:
# Hand picked features + FastText Wiki
msg24h = docs['msg24h'].values
tpp_ratio = docs['tpp_ratio'].values
spp_ratio = docs['spp_ratio'].values
sentencelength = docs['sentence_length'].values
vectorFastTextwiki = docs['vector_FastText_fasttext-wiki-news-subwords-300'].values
vectorGloVewiki = docs['vector_GloVe_glove-wiki-gigaword-200'].values

# Input
input_combined = np.hstack([sentencelength.reshape(-1, 1), msg24h.reshape(-1, 1), tpp_ratio.reshape(-1, 1), 
                            spp_ratio.reshape(-1, 1), np.vstack(vectorFastTextwiki)])

# Labels
scum = docs['scum'].values

In [20]:
# Combining Sentence Length, Messages / 24 Hours, 3rd-Person Pronoun Ratio,
# 2nd-Person Pronoun Ration and FastText

score_final = 0.0
auroc_final = 0.0
ap_final = 0.0

for train_index, test_index in kf.split(input_combined):
    X_train, X_test = input_combined[train_index], input_combined[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train, Y_train)

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test)

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test, Y_test)

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test, predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test, predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final += score
    auroc_final += auroc
    ap_final += average_precision

print("Score:", score_final / 20.0)
print("AUROC:", auroc_final / 20.0)
print("Average Precision:", ap_final / 20.0)

Score: 0.5986334613514359
AUROC: 0.5360747337443851
Average Precision: 0.25354560633500073


### 2. FastText Wiki

Here, we use the standalone FastText Wiki vector in order to compare this model's performance with the Hand-picked Features + FastText Wiki vector to see if there is an improvement

In [9]:
score_final = 0.0
auroc_final = 0.0
ap_final = 0.0

for train_index, test_index in kf.split(vectorFastTextwiki):
    X_train, X_test = vectorFastTextwiki[train_index], vectorFastTextwiki[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train.tolist(), Y_train.tolist())

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test.tolist())

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test.tolist(), Y_test.tolist())

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test, predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test, predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final += score
    auroc_final += auroc
    ap_final += average_precision

print("Score:", score_final / 20.0)
print("AUROC:", auroc_final / 20.0)
print("Average Precision:", ap_final / 20.0)

Score: 0.6300847552659082
AUROC: 0.5260622602314056
Average Precision: 0.2490564254582855


### 3. GloVe Wiki

We now repeat the above exercise to check if the model with only GloVe vectors performs better than either of the two previous models.

In [0]:
vectorGloVewiki = docs['vector_GloVe_glove-wiki-gigaword-200'].values

In [12]:
score_final_2 = 0.0
auroc_final_2 = 0.0
ap_final_2 = 0.0

for train_index, test_index in kf.split(vectorGloVewiki):
    X_train, X_test = vectorGloVewiki[train_index], vectorGloVewiki[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train.tolist(), Y_train.tolist())

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test.tolist())

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test.tolist(), Y_test.tolist())

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test.tolist(), predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test.tolist(), predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final_2 += score
    auroc_final_2 += auroc
    ap_final_2 += average_precision

print("Score:", score_final_2 / 20.0)
print("AUROC:", auroc_final_2 / 20.0)
print("Average Precision:", ap_final_2 / 20.0)

Score: 0.5988910448461787
AUROC: 0.5271335154646739
Average Precision: 0.24931807632697334


## References

- Hauch, V., Blandon-Gitlin, I., Masip, J. and Sporer, S.L., 2015. Are computers effective lie detectors? A meta-analysis of linguistic cues to deception. *Personality and social psychology Review, 19*(4), pp. 307-342.

- de Ruiter, B. and Kachergis, G., 2018. The Mafiascum Dataset: A Large Text Corpus for Deception Detection. *arXiv preprint arXiv:1811.07851.*