# NLP Mafia Project (Logistic Regression)

Here, we implement a Logistic Regression model using Scikit-Learn that serves as a Baseline model for the NLP Mafia project.

In [1]:
# Cloning into the repository to obtain files
!git clone https://bitbucket.org/bopjesvla/thesis.git

Cloning into 'thesis'...
remote: Counting objects: 220, done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 220 (delta 19), reused 0 (delta 0)[K
Receiving objects: 100% (220/220), 734.99 MiB | 14.39 MiB/s, done.
Resolving deltas: 100% (96/96), done.
Checking out files: 100% (58/58), done.


In [2]:
!cp thesis/src/* .

cp: -r not specified; omitting directory 'thesis/src/output'


## Import the required modules

In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

In [4]:
# Read in the data from the pkl file and reset the indices
# in order to get rid of any indices that might be empty
docs = pd.read_pickle('docs.pkl', compression='gzip')
docs = docs.reset_index()
docs

Unnamed: 0,index,game_id,author,content,inserted_at,updated_at,scum,slot_id,words,wc,punc,or_ratio,fpp_ratio,tpp_ratio,but_ratio,token_length,unique_tokens,message_count,message_length,sentence_length,time,wc24h,msg24h,vector_FastText_wiki,vector_GloVe_wiki200d,vector_GloVe_wiki,vector_GloVe_twitter,neg_ratio,neg_em_ratio,not_ratio,anger_ratio,sensory_ratio,cog_ratio,insight_ratio,motion_ratio,tent_ratio,spp_ratio,quant_ratio
0,0,28480,ThAdmiral,yep /!@ \nWhat if he is scum? This option woul...,2013-05-15 01:44:00,2013-07-22 00:17:00,False,240,"[yep, what, if, he, is, scum, this, option, wo...",2688,"[/!@, ?, ', ., :, ', ., /!@, ., ., ., /!@, (, ...",0.003348,0.042039,0.023065,0.006696,4.302455,0.221726,64,42.000000,0.087426,68.939583,38.990662,0.928349,"[-0.11273227513899668, 0.018551131866316774, -...","[0.16055721, 0.19409353, -0.09813492, -0.11749...","[-0.11783043, 0.09800706, -0.05116942, -0.0682...","[0.089912206, 0.19500107, 0.054465882, 0.10074...",0.018601,0.013393,0.018601,0.005952,0.017857,0.081101,0.024926,0.004836,0.029762,0.012649,0.013393
1,1,5845,Khelvaster,"[s]Gah...I don't know anyone. Vote: SPAG, sinc...",2007-08-02 16:33:00,2007-08-27 20:07:00,False,5210,"[s, gah, i, don, t, know, anyone, vote, spag, ...",863,"[[, ], ..., ', ., :, ,, ', .[/, ], ,, ', ..., ...",0.002317,0.042874,0.022016,0.005794,4.067207,0.403244,15,57.533333,0.092700,26.148611,33.003665,0.573644,"[-0.10023690888810728, 0.020833695785571843, -...","[0.14646702, 0.25892192, -0.08927924, -0.17455...","[-0.11812947, 0.10565025, -0.06426784, -0.1109...","[0.0961927, 0.19491331, 0.0321678, 0.071879774...",0.031286,0.006952,0.031286,0.001159,0.016222,0.077636,0.017381,0.006952,0.034762,0.027810,0.015064
2,2,30779,Elyse,/confirm /!@ ANNOUNCEMENT\nThere are NO jester...,2013-08-13 15:28:00,2013-10-11 00:52:00,False,447,"[confirm, announcement, there, are, no, jester...",3194,"[/, /!@, ,, ., ', !, /!@, ., :, /!@, ', /!@, '...",0.001252,0.051659,0.027239,0.008453,3.933626,0.229493,111,28.774775,0.130244,59.391667,53.778588,1.868949,"[-0.09363476607418539, 0.028349871196682513, -...","[0.17497152, 0.21025513, -0.10277278, -0.15136...","[-0.102921024, 0.0764547, -0.06794854, -0.0842...","[0.10085423, 0.16621116, 0.044601005, 0.105791...",0.033500,0.017533,0.033500,0.003444,0.028491,0.100188,0.027552,0.005322,0.022855,0.029430,0.011271
3,3,10744,Simpor,/confirm /!@ Vote: AWA\n\nFor being the last o...,2009-03-04 21:42:00,2009-03-14 21:23:00,True,6238,"[confirm, vote, awa, for, being, the, last, on...",154,"[/, /!@, :, .., /!@, ,, ?, !, /!@, ', ,, ., ',...",0.000000,0.064935,0.012987,0.006494,4.071429,0.649351,5,30.800000,0.103896,10.986806,14.016813,0.455091,"[-0.1328930663044335, 0.0116366885769277, -0.0...","[0.15854017, 0.26437867, -0.03646247, -0.20398...","[-0.16914089, 0.11391469, -0.057118926, -0.110...","[0.12944503, 0.13376419, 0.008241472, 0.091873...",0.025974,0.006494,0.025974,0.006494,0.012987,0.090909,0.019481,0.000000,0.032468,0.012987,0.038961
4,4,10617,blakebowling,"Confirm /!@ Vote: blakebowling /!@ Unvote, Vot...",2009-02-20 02:08:00,2009-03-01 03:10:00,False,6200,"[confirm, vote, blakebowling, unvote, vote, at...",239,"[/!@, :, /!@, ,, :, ., ', ., /!@, /!@, ,, .[/,...",0.000000,0.062762,0.004184,0.000000,3.928870,0.577406,12,19.916667,0.121339,10.043056,23.797538,1.194855,"[-0.11159829518160562, 0.030154899912205217, -...","[0.14067383, 0.29263738, -0.18649453, -0.12949...","[-0.089610584, 0.1020354, -0.11109041, -0.0602...","[0.17050445, 0.18049765, 0.07405566, -0.003876...",0.037657,0.025105,0.037657,0.008368,0.016736,0.050209,0.012552,0.020921,0.020921,0.016736,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9671,10363,34691,Titus,I am here. Not reading all of this tonight. /!...,2014-01-17 05:12:00,2014-02-19 14:44:00,False,2803,"[i, am, here, not, reading, all, of, this, ton...",11236,"[., ., /!@, (, ), ?, ,, ,, ?, (, ., ', .), /!@...",0.003560,0.045390,0.017088,0.006497,3.976059,0.155393,214,52.504673,0.111161,34.397222,326.654284,6.221433,"[-0.09364177068904836, 0.021274735113735566, -...","[0.16734242, 0.19578579, -0.09062915, -0.12811...","[-0.11569925, 0.09404728, -0.06697227, -0.0877...","[0.071309775, 0.17465591, 0.047578953, 0.10952...",0.031150,0.020025,0.031150,0.009879,0.022695,0.083660,0.018067,0.008188,0.020737,0.033108,0.012104
9672,10364,64645,Cabd,Ohai\n\nUNVOTE: \n\nFor now. /!@ Are you payin...,2016-01-18 22:09:00,2016-02-01 13:10:00,False,3235,"[ohai, unvote, for, now, are, you, paying, att...",500,"[:, ., /!@, ?, ., ', ., /!@, ', ., ', ., ., .,...",0.000000,0.040000,0.008000,0.004000,4.058000,0.558000,12,41.666667,0.114000,14.625694,34.186411,0.820474,"[-0.08346330646913729, -0.006562531355826598, ...","[0.15042855, 0.19777678, -0.10134901, -0.14193...","[-0.11541317, 0.06887607, -0.040673744, -0.077...","[0.07772557, 0.14533898, 0.05949125, 0.0480477...",0.016000,0.030000,0.016000,0.014000,0.034000,0.086000,0.022000,0.012000,0.010000,0.022000,0.012000
9673,10365,34691,Prophylaxis,"Hi, replacing in.\n\nShould manage to read the...",2014-01-19 02:22:00,2014-03-22 20:22:00,False,2791,"[hi, replacing, in, should, manage, to, read, ...",8849,"[,, ., .., ~, ?, ., ,, ., /!@, ?, ., /!@, *, *...",0.001695,0.047237,0.022036,0.005085,4.243643,0.153577,78,113.448718,0.062380,63.750000,138.807843,1.223529,"[-0.08553861035061824, 0.0006076766696324315, ...","[0.12958689, 0.21142507, -0.10712522, -0.12166...","[-0.12159943, 0.09617478, -0.051674068, -0.088...","[0.058713168, 0.1332051, 0.08344884, 0.0696835...",0.019098,0.012544,0.019098,0.002599,0.022036,0.071873,0.021471,0.005537,0.021697,0.012431,0.015821
9674,10367,64645,pieguyn,hello\n\nI'm not planning on reading the whole...,2016-02-12 00:23:00,2016-02-21 12:10:00,False,3233,"[hello, i, m, not, planning, on, reading, the,...",6040,"[', ', +, ., ,, (, ), ,, ', /, ., ', ,, ', ., ...",0.003642,0.040397,0.031788,0.005132,3.976821,0.180960,55,109.818182,0.054470,10.490972,575.733104,5.242603,"[-0.08857217908684208, 0.010081575425955981, -...","[0.1583906, 0.1789848, -0.11321375, -0.1563057...","[-0.12236907, 0.11075471, -0.055368353, -0.104...","[0.06731163, 0.18332544, 0.049271382, 0.122296...",0.022517,0.020695,0.022517,0.010265,0.027152,0.108444,0.029470,0.007781,0.029470,0.006457,0.019702


## Instantiate Model

Here, we instantiate a Logistic Regression model that will serve as our baseline and specify the number of splits of the dataset that will be used for the Cross Validation. A 20-fold Cross Validation will be used here. Also, the two classes (Mafia & Not Mafia) are re-weighted to account for the class imbalance.

In [0]:
# Instantiate a model and specify the number of splits of the
# dataset that we will be using. In this case, we will be doing 
# a 20-fold Cross Validation on the dataset. Since

model = LogisticRegression(class_weight = {0: 0.25, 1: 0.75}, max_iter = 150)

kf = KFold(n_splits = 20)

## Comparison of various models

### 1. Hand picked features + FastText

The hand picked features that we will be using here in addition to the FastText Wiki vector are the following:

- Messages / 24 Hours
- 3rd Person Pronoun Ratio
- 2nd Person Pronoun Ratio
- Sentence Length

Of the above 4 features, Sentence Length and 3rd Person Pronoun Ratio have been found to match the effect of detecting deception based on the meta-analysis done by Hauch et al. (2015). Also, it has been seen in Ruiter and Kachergis (2018) that the message frequency is negatively correslated with deception.




In [0]:
# Hand picked features + FastText Wiki
msg24h = docs['msg24h'].values
tpp_ratio = docs['tpp_ratio'].values
spp_ratio = docs['spp_ratio'].values
sentencelength = docs['sentence_length'].values
vectorFastTextwiki = docs['vector_FastText_wiki'].values

# Input
input_combined = np.hstack([sentencelength.reshape(-1, 1), msg24h.reshape(-1, 1), tpp_ratio.reshape(-1, 1), 
                            spp_ratio.reshape(-1, 1), np.vstack(vectorFastTextwiki)])

# Labels
scum = docs['scum'].values

In [66]:
# Combining Sentence Length, Messages / 24 Hours, 3rd-Person Pronoun Ratio,
# 2nd-Person Pronoun Ration and FastText

score_final = 0.0
auroc_final = 0.0
ap_final = 0.0

for train_index, test_index in kf.split(input_combined):
    X_train, X_test = input_combined[train_index], input_combined[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train, Y_train)

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test)

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test, Y_test)

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test, predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test, predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final += score
    auroc_final += auroc
    ap_final += average_precision

print("Score:", score_final / 20.0)
print("AUROC:", auroc_final / 20.0)
print("Average Precision:", ap_final / 20.0)

Score: 0.6750688705234158
AUROC: 0.5516012537140085
Average Precision: 0.2564575164794415


### 2. FastText Wiki

Here, we use the standalone FastText Wiki vector in order to compare this model's performance with the Hand-picked Features + FastText Wiki vector to see if there is an improvement

In [70]:
score_final = 0.0
auroc_final = 0.0
ap_final = 0.0

for train_index, test_index in kf.split(vectorFastTextwiki):
    X_train, X_test = vectorFastTextwiki[train_index], vectorFastTextwiki[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train.tolist(), Y_train.tolist())

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test.tolist())

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test.tolist(), Y_test.tolist())

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test, predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test, predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final += score
    auroc_final += auroc
    ap_final += average_precision

print("Score:", score_final / 20.0)
print("AUROC:", auroc_final / 20.0)
print("Average Precision:", ap_final / 20.0)

Score: 0.6935670225689989
AUROC: 0.540217315778664
Average Precision: 0.2507544674992502


### 3. GloVe Wiki

We now repeat the above exercise to check if the model with only GloVe vectors performs better than either of the two previous models.

In [0]:
vectorGloVewiki = docs['vector_GloVe_wiki'].values

In [72]:
score_final_2 = 0.0
auroc_final_2 = 0.0
ap_final_2 = 0.0

for train_index, test_index in kf.split(vectorGloVewiki):
    X_train, X_test = vectorGloVewiki[train_index], vectorGloVewiki[test_index]
    Y_train, Y_test = scum[train_index], scum[test_index]

    # Train the model on the Training Dataset
    model.fit(X_train.tolist(), Y_train.tolist())

    # Model makes predictions based on input from the Test Dataset
    predictions = model.predict(X_test.tolist())

    # Compute the percentage accuracy of the model's predictions
    score = model.score(X_test.tolist(), Y_test.tolist())

    # Compute the AUROC of the model
    auroc = roc_auc_score(Y_test.tolist(), predictions)

    # Compute the Average Precision of the model
    average_precision = average_precision_score(Y_test.tolist(), predictions)

    # Stores the above results so as to obtain the mean performance
    # of the model on the total dataset
    score_final_2 += score
    auroc_final_2 += auroc
    ap_final_2 += average_precision

print("Score:", score_final_2 / 20.0)
print("AUROC:", auroc_final_2 / 20.0)
print("Average Precision:", ap_final_2 / 20.0)

Score: 0.6612224731789949
AUROC: 0.5413354303896216
Average Precision: 0.25052015683962137


## References

- Hauch, V., Blandon-Gitlin, I., Masip, J. and Sporer, S.L., 2015. Are computers effective lie detectors? A meta-analysis of linguistic cues to deception. *Personality and social psychology Review, 19*(4), pp. 307-342.

- de Ruiter, B. and Kachergis, G., 2018. The Mafiascum Dataset: A Large Text Corpus for Deception Detection. *arXiv preprint arXiv:1811.07851.*